from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras.datasets import mnist
#############################################

# Seed NumPy's global RNG for reproducibility.
seed = 123
np.random.seed(seed)

# 1. Load the complete splice data set: sequences in X, labels in y.
X, y = load_data("../data/splice.data.txt")
# Convert the sequences to an array of representation matrices.
X = get_rep_mats(X)
# CUSTOM reshape: overwrite, in place, every entry of every sample with that
# entry's first element (entry[0]).
for sample in X:
    for pos, entry in enumerate(sample):
        sample[pos] = entry[0]
# Convert the labels to integer labels.
y = conv_labels(y)
# From here on work with NumPy arrays; note the label array is bound to the
# upper-case name Y while y keeps the converted labels.
X, Y = np.asarray(X), np.asarray(y)

# Define the 10-fold stratified cross-validation test harness. Shuffling is
# driven by `seed`, so the fold assignment is the same on every run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scores = []

for train, test in kfold.split(X, Y):
    # The fold number is derived from how many entries `scores` holds so far.
    # The call form of print works on both Python 2 and Python 3.
    print("====> FOLD [{}]".format(len(scores) + 1))

    # 2. Preprocess input data: pick this fold's rows once, then reshape to
    # (n_samples, 1, 58, 64) and cast to float32.
    X_train = X[train]
    X_train = X_train.reshape(X_train.shape[0], 1, 58, 64).astype('float32')
    X_test = X[test]
    X_test = X_test.reshape(X_test.shape[0], 1, 58, 64).astype('float32')
Esempio n. 2
0
# 1. Load the complete promoters data set: sequences in X, labels in y.
X, y = load_data("../data/promoters.data.txt")
# Convert the sequences to an array of representation matrices.
X = get_rep_mats(X)
############

# In[ ]:

# Custom reshape: overwrite, in place, every entry of every sample with that
# entry's first element (entry[0]).
for sample in X:
    for pos, entry in enumerate(sample):
        sample[pos] = entry[0]
############

# In[ ]:

# Convert the labels to integer labels; "promoter" is passed as the second
# argument of conv_labels.
y = conv_labels(y, "promoter")
# Work with NumPy arrays from here on.
X, y = np.asarray(X), np.asarray(y)
# Positional hold-out split: the first 90 samples train, the rest test.
# NOTE(review): there is no shuffle before this split -- confirm the file's
# sample order does not leave the test slice with a skewed class mix.
X_train, X_test = X[:90], X[90:]
y_train, y_test = y[:90], y[90:]

# In[ ]:

# 2. Preprocess input data: insert a singleton axis after the sample axis,
# e.g. (90, 55, 64) --> (90, 1, 55, 64), and store the values as float32.
X_train = X_train.reshape(len(X_train), 1, 55, 64).astype('float32')
X_test = X_test.reshape(len(X_test), 1, 55, 64).astype('float32')
Esempio n. 3
0
        seq = seq.upper()    # b/c rep matrix built on uppercase
        seq = seq.replace("\t","")      # present in promoter 
        seq = seq.replace("N","A")  # undetermined nucleotides in splice
        seq = seq.replace("D","G")
        seq = seq.replace("S","C")
        seq = seq.replace("R","G")
        #####
        labels.append(label)
        seqs.append(seq)
    f.close()
    return seqs, labels


# In[11]:


if __name__ == "__main__":
    # Smoke check: read the splice junction input data, convert the labels
    # and the sequences to the required format, then print how many items
    # each conversion produced (sequences first, labels second).
    seqs, labels = load_data("../data/splice.data.txt")
    lbls_mod = conv_labels(labels)
    seqs_mod = get_rep_mats(seqs)
    for converted in (seqs_mod, lbls_mod):
        print(len(converted))


# In[ ]: