from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras.datasets import mnist

#############################################
# Splice-junction experiment: load the sequences, convert them to
# representation matrices and evaluate with stratified 10-fold
# cross-validation.

seed = 123  # for reproducibility
np.random.seed(seed)

# 1. Load data into train and test sets
X, y = load_data("../data/splice.data.txt")  # sequences, labels
X = get_rep_mats(X)  # convert to array of representation matrices

# CUSTOM reshape: replace every entry of each representation matrix with its
# first element, in place (presumably unwrapping a 1 x N row -- confirm
# against get_rep_mats).
for i in X:
    for idx, j in enumerate(i):
        i[idx] = j[0]

y = conv_labels(y)  # convert to integer labels
X = np.asarray(X)  # work with np arrays
Y = np.asarray(y)

# define 10-fold cross validation test harness (shuffled, seeded)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scores = []
for train, test in kfold.split(X, Y):
    # Fold number is derived from how many scores have been collected so far
    # (assumes one score is appended per fold further down -- confirm).
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("====> FOLD [" + str(len(scores) + 1) + "]")

    # 2. Preprocess input data
    # Index each fold once (fancy indexing copies), then add a singleton
    # channel axis: (n, 58, 64) --> (n, 1, 58, 64).
    X_train = X[train]
    X_test = X[test]
    X_train = X_train.reshape(X_train.shape[0], 1, 58, 64)
    X_test = X_test.reshape(X_test.shape[0], 1, 58, 64)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
# 1. Load data into train and test sets
X, y = load_data("../data/promoters.data.txt")  # sequences, labels
X = get_rep_mats(X)  # convert to array of representation matrices

############
# In[ ]:

# Replace every entry of each representation matrix with its first element,
# in place.
for rep_mat in X:
    for row_idx, row in enumerate(rep_mat):
        rep_mat[row_idx] = row[0]

############
# In[ ]:

y = conv_labels(y, "promoter")  # convert to integer labels
X = np.asarray(X)  # work with np arrays
y = np.asarray(y)

# Fixed split: the first 90 samples train, the remainder test.
X_train, X_test = X[0:90], X[90:]
y_train, y_test = y[0:90], y[90:]

# In[ ]:

# 2. Preprocess input data
# Add a singleton channel axis, (n, 55, 64) --> (n, 1, 55, 64), and cast to
# float32.
X_train = X_train.reshape((X_train.shape[0], 1, 55, 64)).astype('float32')
X_test = X_test.reshape((X_test.shape[0], 1, 55, 64)).astype('float32')
seq = seq.upper() # b/c rep matrix built on uppercase seq = seq.replace("\t","") # present in promoter seq = seq.replace("N","A") # undetermined nucleotides in splice seq = seq.replace("D","G") seq = seq.replace("S","C") seq = seq.replace("R","G") ##### labels.append(label) seqs.append(seq) f.close() return seqs, labels # In[11]: if __name__ == "__main__": # reading in splice junction input data and converting to required format seqs, labels = load_data("../data/splice.data.txt") lbls_mod = conv_labels(labels) seqs_mod = get_rep_mats(seqs) print (len(seqs_mod)) print (len(lbls_mod)) # In[ ]: