def load_dataset(organism):
    """Build the feature matrix and label vector for one organism.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    through two ``ml_data`` feature extractors, stacks their feature
    matrices column-wise, applies a boolean column mask and casts both
    outputs to ``int32``.

    Side effects:
        * sets the module globals ``max_features`` (``4 ** k`` with
          ``k = 1``) and ``maxlen`` (number of columns of the returned X);
        * re-seeds NumPy's global RNG from the wall clock;
        * prints progress information to stdout.

    Args:
        organism: name used to format the two FASTA file paths.

    Returns:
        Tuple ``(X, y)`` of ``int32`` arrays.
    """
    from ml_data import SequenceNucsData, SimpleHistData
    global max_features, maxlen

    print('Load organism: {}'.format(organism))
    neg_path = './fasta/{}_neg.fa'.format(organism)
    pos_path = './fasta/{}_pos.fa'.format(organism)
    print(neg_path, pos_path)

    anchor_col = 59  # column that is always forced on in the mask
    kmer_size = 1
    max_features = 4**kmer_size

    nuc_data = SequenceNucsData(neg_path, pos_path, k=kmer_size)
    hist_data = SimpleHistData(neg_path, pos_path, k=3, upto=True)

    X = nuc_data.getX()
    y = nuc_data.getY()
    X = np.hstack((X, hist_data.getX()))

    # Column mask.
    # NOTE(review): randint(2, ...) yields 0/1, so "+ 1" yields 1/2 and the
    # boolean cast is True everywhere -- as written, every column is kept
    # and forcing `anchor_col` on is a no-op. Confirm whether a random
    # subset of columns was actually intended.
    # Earlier experiments (from the commented-out history of this function)
    # used an all-False mask with columns 24-26, 49-51 and 59-61 enabled,
    # and a fixed window X[:, ini-30:ini+11] with ini = 199.
    np.random.seed(int(time.time()))
    n_columns = X.shape[1]
    mask = (np.random.randint(2, size=n_columns) + 1).astype(bool)
    mask[anchor_col] = True
    print(mask)

    X = X[:, mask].astype('int32')
    y = y.astype('int32')

    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))
    maxlen = X.shape[1]
    return X, y
# --- Dataset selection -------------------------------------------------
# Exactly one negative/positive FASTA pair is active; the alternatives are
# kept commented out so the dataset can be switched quickly.
#npath = "fasta/Bacillus_non_prom.fa"
#ppath = "fasta/Bacillus_prom.fa"
#npath = "fasta/Arabidopsis_non_prom_big.fa"
#ppath = "fasta/Arabidopsis_non_tata.fa"
npath = "fasta/Ecoli_non_prom.fa"
ppath = "fasta/Ecoli_prom.fa"

# --- Feature extraction ------------------------------------------------
# Two feature views of the same sequences; labels come from the first one.
#mldata = SequenceNucsData(npath, ppath, k=3)
mldata = SequenceDinucProperties(npath, ppath)
mldata2 = SimpleHistData(npath, ppath, k=4, upto=True)

X = mldata.getX()
X2 = mldata2.getX()
Y = mldata.getY()

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(X.shape)
print(Y.shape)

# --- Class-balancing indices -------------------------------------------
# Row indices of the positive (label 1) and negative (label 0) samples.
posIndex = numpy.where(Y[:] == 1)[0]
negIndex = numpy.where(Y[:] == 0)[0]

# After the second assignment, diff == len(posIndex):
#   len(neg) - (len(neg) - len(pos)) == len(pos)
diff = len(negIndex) - len(posIndex)
diff = len(negIndex) - diff
# str.format keeps the output "DIFF <n>" identical on Python 2 and 3
# (print('DIFF', diff) would print a tuple on Python 2).
print('DIFF {}'.format(diff))

# Random selection of at most `diff` positions into negIndex.
# NOTE(review): despite the name `toremove`, the selection has as many
# entries as there are positives (when negatives outnumber them); whether
# these are dropped or kept is decided by code outside this excerpt --
# confirm.
toremove = numpy.arange(len(negIndex))
numpy.random.shuffle(toremove)
toremove = toremove[:diff]
factor=0.1, patience=5, verbose=0, mode='auto', epsilon=0.001, cooldown=0, min_lr=0) npath = "fasta/Bacillus_non_prom.fa" ppath = "fasta/Bacillus_prom.fa" #mldata = SequenceNucsData(npath, ppath, k=3) mldata = SimpleHistData(npath, ppath, k=4) mldata2 = SequenceNucsData(npath, ppath, k=3) mldata3 = DinucAutoCovarData(npath, ppath) X = mldata.getX() Y = mldata.getY() X2 = mldata2.getX() X3 = mldata3.getX() kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234) kf.get_n_splits(X, Y) cvscores = [] for train_index, test_index in kf.split(X, Y): X_train = X[train_index, :] X_train2 = X2[train_index, :] #X_train2 = X_train2[:,:,numpy.newaxis] X_train3 = X3[train_index, :]