Example #1
0
def load_dataset(organism):
    """Build the feature matrix and label vector for one organism.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    through ``SequenceNucsData`` (k=1) and ``SimpleHistData`` (k=3,
    upto=True), stacks both feature sets column-wise, applies a boolean
    column mask and casts the result to ``int32``.

    Side effects: prints progress information, reseeds NumPy's global RNG
    from the wall clock, and sets the module globals ``max_features``
    (``4 ** k``) and ``maxlen`` (number of columns kept in ``X``).

    Args:
        organism: Name substituted into the two FASTA file paths.

    Returns:
        Tuple ``(X, y)``, both cast to ``int32``.
    """
    from ml_data import SequenceNucsData, SimpleHistData
    global max_features
    global maxlen

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    # Column that is always switched on in the mask below.
    forced_column = 59

    k = 1
    max_features = 4**k

    nuc_samples = SequenceNucsData(npath, ppath, k=k)
    hist_samples = SimpleHistData(npath, ppath, k=3, upto=True)

    X = nuc_samples.getX()
    y = nuc_samples.getY()
    X = np.hstack((X, hist_samples.getX()))

    # Earlier experiments hand-picked individual columns (59-61, 49-51,
    # 24-26) or sliced a window around the forced column instead of
    # drawing a random mask.
    np.random.seed(int(time.time()))
    # NOTE(review): randint(2, ...) yields 0 or 1, so after `+ 1` every
    # entry is 1 or 2 and the boolean cast makes the whole mask True; no
    # column is dropped here. Confirm whether the `+ 1` is intentional.
    draws = np.random.randint(2, size=X.shape[1]) + 1
    mask = draws.astype(bool)
    mask[forced_column] = True
    print(mask)

    X = X[:, mask].astype('int32')
    y = y.astype('int32')
    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))

    maxlen = X.shape[1]
    return X, y
Example #2
0
# Dataset selection: exactly one negative/positive FASTA pair is active.
# Alternatives that were used before are kept here for reference.
#npath = "fasta/Bacillus_non_prom.fa"
#ppath = "fasta/Bacillus_prom.fa"

#npath = "fasta/Arabidopsis_non_prom_big.fa"
#ppath = "fasta/Arabidopsis_non_tata.fa"

npath = "fasta/Ecoli_non_prom.fa"
ppath = "fasta/Ecoli_prom.fa"

# Two feature extractors built from the same pair of files; labels are
# taken from the first one only.
#mldata = SequenceNucsData(npath, ppath, k=3)
mldata = SequenceDinucProperties(npath, ppath)
mldata2 = SimpleHistData(npath, ppath, k=4, upto=True)

X = mldata.getX()
X2 = mldata2.getX()
Y = mldata.getY()

# A parenthesised single argument prints identically on Python 2 and 3,
# unlike the bare `print x` statement, which is a SyntaxError on Python 3.
print(X.shape)
print(Y.shape)

# Row indices of the positive (label 1) and negative (label 0) samples.
posIndex = numpy.where(Y[:] == 1)[0]
negIndex = numpy.where(Y[:] == 0)[0]

# The original computed len(negIndex) - (len(negIndex) - len(posIndex)),
# which is exactly len(posIndex).
# NOTE(review): despite the names `diff` / `toremove`, this selects as many
# shuffled negative positions as there are positives - confirm against the
# code that consumes `toremove`.
diff = len(posIndex)
print('DIFF {}'.format(diff))

# Random subset (size `diff`) of positions into negIndex.
toremove = numpy.arange(len(negIndex))
numpy.random.shuffle(toremove)
toremove = toremove[:diff]
Example #3
0
                             factor=0.1,
                             patience=5,
                             verbose=0,
                             mode='auto',
                             epsilon=0.001,
                             cooldown=0,
                             min_lr=0)

# Negative / positive FASTA inputs shared by every feature extractor below.
npath, ppath = "fasta/Bacillus_non_prom.fa", "fasta/Bacillus_prom.fa"

# Three feature views of the same data. An earlier variant used
# SequenceNucsData(npath, ppath, k=3) as `mldata`.
mldata = SimpleHistData(npath, ppath, k=4)
mldata2 = SequenceNucsData(npath, ppath, k=3)
mldata3 = DinucAutoCovarData(npath, ppath)

# Labels come from the first extractor; X2 and X3 are indexed with the
# same row indices as X further down.
X, Y = mldata.getX(), mldata.getY()
X2, X3 = mldata2.getX(), mldata3.getX()

# 10-fold stratified split with a fixed seed.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# The return value is discarded here.
kf.get_n_splits(X, Y)

# Filled inside the cross-validation loop that follows (presumably one
# score per fold - confirm against the loop body).
cvscores = []
for train_index, test_index in kf.split(X, Y):

    X_train = X[train_index, :]
    X_train2 = X2[train_index, :]
    #X_train2 = X_train2[:,:,numpy.newaxis]
    X_train3 = X3[train_index, :]