# Promoter data preparation (notebook export).
# Seeds NumPy's RNG, imports the keras building blocks, loads the promoter
# data set, converts sequences/labels to arrays and takes the training slice.
# load_data, get_rep_mats and conv_labels are defined elsewhere in the project.
np.random.seed(123)  # for reproducibility

# In[5]:

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras.datasets import mnist

#############################################

# In[6]:

# 1. Load data into train and test sets
X, y = load_data("../data/promoters.data.txt")  # sequences, labels
X = get_rep_mats(X)  # convert to array of representation matrices

############

# In[ ]:

# Replace every entry of each representation matrix with its own first
# element, in place.
# NOTE(review): presumably each entry is a one-element sequence, so this drops
# one level of nesting -- confirm against get_rep_mats.
for i in X:
    for idx, j in enumerate(i):
        i[idx] = j[0]

############

# In[ ]:

y = conv_labels(y, "promoter")  # convert to integer labels
X = np.asarray(X)  # work with np arrays
y = np.asarray(y)
X_train = X[0:90]  # the first 90 entries of X are taken as the training split
# NOTE(review): the next physical line appears to be the whitespace-collapsed
# tail of a function (likely load_data). Its beginning lies outside this view,
# so it is left untouched instead of being re-indented by guesswork.
seq = seq.upper() # b/c rep matrix built on uppercase seq = seq.replace("\t","") # present in promoter seq = seq.replace("N","A") # undetermined nucleotides in splice seq = seq.replace("D","G") seq = seq.replace("S","C") seq = seq.replace("R","G") ##### labels.append(label) seqs.append(seq) f.close() return seqs, labels

# In[11]:

if __name__ == "__main__":
    # Script entry point: read the splice junction input data and convert it
    # to the required format (integer labels and representation matrices).
    seqs, labels = load_data("../data/splice.data.txt")
    lbls_mod = conv_labels(labels)
    seqs_mod = get_rep_mats(seqs)

    # Print how many converted sequences and labels were produced.
    print(len(seqs_mod))
    print(len(lbls_mod))

# In[ ]: