import numpy as np


def fold10_cv_partition(positive_file, negative_file):
    # generate the vectorized xs and ys
    x_dataset, y_dataset = vectorize_data(positive_file, negative_file)
    print("data vectorization in function fold10_cv_partition finished!")
    # divide the dataset into 10 parts of m samples each
    max_len = len(x_dataset)
    m = max_len // 10
    # lists storing the per-fold partitions
    x_train_segment = []
    y_train_segment = []
    x_validation_segment = []
    y_validation_segment = []
    # partition the dataset for 10-fold cross-validation:
    # generate the first nine folds
    for i in range(9):
        x_validation_segment.append(x_dataset[m * i:m * (i + 1)])
        y_validation_segment.append(y_dataset[m * i:m * (i + 1)])
        x_train_segment.append(
            np.concatenate(
                [x_dataset[0:m * i], x_dataset[m * (i + 1):max_len]]))
        y_train_segment.append(
            np.concatenate(
                [y_dataset[0:m * i], y_dataset[m * (i + 1):max_len]]))
    # generate the last fold, which absorbs any remainder when
    # the dataset size is not divisible by 10
    x_validation_segment.append(x_dataset[m * 9:max_len])
    y_validation_segment.append(y_dataset[m * 9:max_len])
    x_train_segment.append(x_dataset[0:m * 9])
    y_train_segment.append(y_dataset[0:m * 9])
    print("Partition of fold 10 finished!")
    return x_train_segment, y_train_segment, x_validation_segment, y_validation_segment
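# A minimal usage sketch for the folds returned above (hypothetical caller:
# `build_and_train` and `evaluate` stand in for the project's actual training
# and evaluation routines, which are not defined in this file):
#
#   x_tr, y_tr, x_val, y_val = fold10_cv_partition(pos_file, neg_file)
#   for fold in range(10):
#       model = build_and_train(x_tr[fold], y_tr[fold])
#       fold_acc = evaluate(model, x_val[fold], y_val[fold])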
def train_test_partition(positive_file, negative_file):
    # generate the vectorized xs and ys
    x_dataset, y_dataset = vectorize_data(positive_file, negative_file)
    print("data vectorization in function train_test_partition finished!")
    print(len(x_dataset))
    # split into test and train datasets: the first 752 samples are
    # held out as the test set, the rest are used for training
    x_test_dataset = x_dataset[0:752]
    y_test_dataset = y_dataset[0:752]
    x_train_dataset = x_dataset[752:]
    y_train_dataset = y_dataset[752:]
    print("train_test_partition finished!")
    return x_train_dataset, y_train_dataset, x_test_dataset, y_test_dataset
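# Sketch of a proportional alternative to the hard-coded 752-sample cut
# (an assumption: 752 was presumably chosen as a fixed share of the full
# dataset; TEST_FRACTION below is hypothetical and would need to be set
# to match the intended split):
#
#   TEST_FRACTION = 0.2
#   n_test = int(len(x_dataset) * TEST_FRACTION)
#   x_test_dataset, x_train_dataset = x_dataset[:n_test], x_dataset[n_test:]
#   y_test_dataset, y_train_dataset = y_dataset[:n_test], y_dataset[n_test:]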
# CONV_DEEP = 128         # number of filters in the first convolution layer (convolution depth)
STRIDES = [1, 1, 1, 1]    # stride along each of the four dimensions during convolution
KSIZE = [1, 164, 1, 1]    # pooling window size
FC_SIZE = 128             # number of nodes in the fully connected layer
NUM_CLASSES = 2           # number of output classes
DROPOUT_KEEP_PROB = 0.5   # keep probability for dropout
FILE_PATH = "../../data/miRBase_set.csv"
FILE_PATH_PUTATIVE = "../../data/putative_mirtrons_set.csv"

# read, vectorize, and partition the dataset
all_data_array = dataRead.read_data(FILE_PATH, FILE_PATH_PUTATIVE)
vectorized_dataset = dataVectorization.vectorize_data(all_data_array)
X_train, y_train, X_test, y_test = dataPartition.data_partition(vectorized_dataset)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print(y_train.shape)
print(X_train.shape)
print(y_test.shape)
print(X_test.shape)
print("dataset vectorization finished!")
print("iteration", TRAINING_ITER)

dataset_size = len(X_train)  # number of training samples

# input placeholders: one-channel sequence "images" of shape
# [SEQUENCE_LENGTH, EMBEDDING_SIZE, 1] and one-hot class labels
input_X = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1])
input_y = tf.placeholder(tf.float32, [None, NUM_CLASSES])
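# Sketch of how these placeholders would be fed during training (hypothetical:
# `sess`, `train_step`, and BATCH_SIZE are assumptions defined elsewhere in
# the training script, not in this excerpt):
#
#   start = (i * BATCH_SIZE) % dataset_size
#   end = min(start + BATCH_SIZE, dataset_size)
#   sess.run(train_step, feed_dict={input_X: X_train[start:end],
#                                   input_y: y_train[start:end]})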