Ejemplo n.º 1
0
def load_dataset(organism):
    """Load an organism's FASTA pair and return a fixed column window of it.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    through ``SequenceNucsData`` with ``k=2``, casts features and labels to
    int32 and keeps only columns 189..229 of the feature matrix.

    Side effects: sets the module-level globals ``max_features`` (to
    ``4**2``) and ``maxlen`` (to the number of columns kept), and prints
    progress information.

    Returns:
        A ``(X, y)`` tuple of int32 arrays.
    """
    from ml_data import SequenceNucsData, SequenceDinucLabelsProperties
    global max_features
    global maxlen

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    # Vocabulary size is 4**k for the chosen k.
    k = 2
    max_features = 4**k
    samples = SequenceNucsData(npath, ppath, k=k)

    features, labels = samples.getX(), samples.getY()

    # Cut width: 10 columns before index 199 and 31 columns from it onward.
    # NOTE(review): 199 is presumably a reference position in the sequence
    # encoding -- confirm against ml_data.
    center = 199
    first_col, stop_col = center - 10, center + 31
    X = features.astype('int32')[:, first_col:stop_col]
    y = labels.astype('int32')

    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))
    maxlen = X.shape[1]
    print('>>> maxlen {}'.format(maxlen))
    return X, y
def load_dataset(organism):
    """Load an organism's FASTA pair as int32 feature and label arrays.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    through ``SequenceNucsData``. ``k`` is not a parameter of this function:
    it is resolved from the enclosing (module) scope when the function runs.

    Returns:
        A ``(X, y)`` tuple, both cast to int32.
    """
    from ml_data import SequenceNucsData

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    samples = SequenceNucsData(npath, ppath, k=k)

    features, labels = samples.getX(), samples.getY()
    X = features.astype('int32')
    y = labels.astype('int32')
    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))

    # NOTE(review): this binds a *local* maxlen that is never read; there is
    # no ``global maxlen`` declaration here -- confirm whether the module
    # global was meant to be updated.
    maxlen = X.shape[1]
    return X, y
Ejemplo n.º 3
0
def load_dataset(organism):
    """Load k=1 sequence features plus histogram features for an organism.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    twice: once through ``SequenceNucsData`` (``k=1``) and once through
    ``SimpleHistData`` (``k=3, upto=True``). The two feature matrices are
    stacked horizontally, filtered through a boolean column mask and cast to
    int32.

    Side effects: sets the module-level globals ``max_features`` (``4**1``)
    and ``maxlen`` (number of columns returned), reseeds NumPy's global RNG
    from the wall clock, and prints the mask and the resulting shapes.

    Returns:
        A ``(X, y)`` tuple of int32 arrays.
    """
    from ml_data import SequenceNucsData, SimpleHistData
    global max_features
    global maxlen

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    # Column that is always forced on in the mask below.
    ini = 59

    k = 1
    max_features = 4**k
    samples = SequenceNucsData(npath, ppath, k=k)
    samples2 = SimpleHistData(npath, ppath, k=3, upto=True)

    X, y = samples.getX(), samples.getY()
    X = np.hstack((X, samples2.getX()))

    np.random.seed(int(time.time()))
    # NOTE(review): randint(2, ...) yields 0 or 1, so after "+ 1" every draw
    # is 1 or 2 and the boolean mask is all True -- no column is actually
    # dropped. Confirm whether the "+ 1" is intentional.
    # An earlier variant started from an all-False mask and switched on
    # columns 24-26, 49-51 and 59-61 by hand.
    draws = np.random.randint(2, size=X.shape[1]) + 1
    mask = np.array(draws, dtype=bool)
    mask[ini] = 1
    print(mask)

    X = X[:, mask]
    X = X.astype('int32')
    y = y.astype('int32')
    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))
    maxlen = X.shape[1]
    return X, y
Ejemplo n.º 4
0
def load_dataset(organism, coding_type='onehot', k=1):
    """Load an organism's FASTA pair using the requested sequence encoding.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``.

    Args:
        organism: Name used to build the two FASTA paths.
        coding_type: ``'onehot'`` (uses ``SequenceNucHotvector``) or
            ``'embedding'`` (uses ``SequenceNucsData``).
        k: Accepted for interface compatibility. The embedding branch
            currently forces ``k = 1`` regardless of the value passed in,
            and the one-hot branch does not use it.

    Returns:
        ``(X, y, input_shape)`` where ``X`` and ``y`` are int32 arrays and
        ``input_shape`` is ``X.shape[1:]``.

    Raises:
        ValueError: If ``coding_type`` is not one of the supported values.
    """
    # Fail fast with a clear message; previously an unknown coding_type fell
    # through both branches and crashed later with an UnboundLocalError on
    # ``samples``.
    if coding_type not in ('onehot', 'embedding'):
        raise ValueError(
            "Unknown coding_type {!r}; expected 'onehot' or "
            "'embedding'".format(coding_type))

    from ml_data import SequenceNucsData, SequenceNucHotvector, SequenceMotifHot

    print('Load organism: {}'.format(organism))
    npath, ppath = './fasta/{}_neg.fa'.format(
        organism), './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    if coding_type == 'onehot':
        samples = SequenceNucHotvector(npath, ppath)
    else:
        # NOTE(review): the caller-supplied k is overridden here, as in the
        # original code -- confirm whether it should be honoured instead.
        k = 1
        samples = SequenceNucsData(npath, ppath, k=k)

    X, y = samples.getX(), samples.getY()
    X, y = X.astype('int32'), y.astype('int32')

    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))
    return X, y, X.shape[1:]
# Fix NumPy's global RNG seed so runs are repeatable.
numpy.random.seed(7)


# Training callbacks, both monitoring the validation loss.
earlyStopping=EarlyStopping(monitor='val_loss', patience=30, verbose=0, mode='auto')
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=5, verbose=0, mode='auto', epsilon=0.001, cooldown=0, min_lr=0)

# Alternative datasets (swap in by uncommenting one pair):
#npath = "fasta/Bacillus_non_prom.fa"
#ppath = "fasta/Bacillus_prom.fa"

#npath = "fasta/Arabidopsis_non_prom_big.fa"
#ppath = "fasta/Arabidopsis_non_tata.fa"

npath = "fasta/Ecoli_non_prom.fa"
ppath = "fasta/Ecoli_prom.fa"

# The same FASTA pair encoded twice, with k=2 and with k=3.
mldata = SequenceNucsData(npath, ppath, k=2)
mldata2 = SequenceNucsData(npath, ppath, k=3)

X = mldata.getX()
X2 = mldata2.getX()
# Append one extra column filled with -1 to the k=3 matrix.
# NOTE(review): presumably this pads X2 to the same width as X -- confirm.
newCol = numpy.array([-1 for _ in range(X2.shape[0])])
X2 = numpy.column_stack([ X2, newCol ])
Y = mldata.getY()

# Parenthesised single-argument form: prints the same text under Python 2
# and is also valid Python 3 (the bare print statement is a syntax error
# there).
print('X1: {}'.format(X.shape))
print('X2: {}'.format(X2.shape))
print('Y: {}'.format(Y.shape))


toRemove = False
if toRemove:
Ejemplo n.º 6
0
def load_partition(train_index, test_index, X, y):
    """Split ``X`` and ``y`` into train and test subsets by row index.

    Args:
        train_index: Row indices selecting the training samples.
        test_index: Row indices selecting the test samples.
        X: Two-dimensional feature array, indexed as ``X[rows, :]``.
        y: Label array, indexed as ``y[rows]``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``.
    """
    train_pair = (X[train_index, :], y[train_index])
    test_pair = (X[test_index, :], y[test_index])
    return train_pair, test_pair


# Dataset selection: the FASTA paths are derived from the organism name.
organism = 'Bacillus'
npath = './fasta/{}_neg.fa'.format(organism)
ppath = './fasta/{}_pos.fa'.format(organism)

k = 1

data = SequenceNucsData(npath, ppath, k=k)
X, y = data.getX(), data.getY()

# Five stratified shuffle splits with a fixed seed for repeatability.
kf = StratifiedShuffleSplit(n_splits=5, random_state=34267)
kf.get_n_splits(X, y)

# `partition` is a 1-based counter of the split being processed.
partition = 0
for train_index, test_index in kf.split(X, y):
    partition += 1
    train_pair, test_pair = load_partition(train_index, test_index, X, y)
    x_train, y_train = train_pair
    x_test, y_test = test_pair

    # Add a trailing axis to the label arrays.
    y_train = np.expand_dims(y_train, axis=1)
    y_test = np.expand_dims(y_test, axis=1)
Ejemplo n.º 7
0
                              verbose=0,
                              mode='auto')
# Callback configured to scale the learning rate by `factor` after
# `patience` epochs without improvement in the monitored validation loss.
reduceLR = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=5, verbose=0,
    mode='auto', epsilon=0.001, cooldown=0, min_lr=0)

npath, ppath = "fasta/Bacillus_non_prom.fa", "fasta/Bacillus_prom.fa"

# Three encodings of the same FASTA pair. The primary one was previously
# SequenceNucsData(npath, ppath, k=3); it is now the k=4 histogram.
mldata = SimpleHistData(npath, ppath, k=4)
mldata2 = SequenceNucsData(npath, ppath, k=3)
mldata3 = DinucAutoCovarData(npath, ppath)

X, Y = mldata.getX(), mldata.getY()
X2, X3 = mldata2.getX(), mldata3.getX()

# Shuffled 10-fold stratified cross-validation with a fixed seed.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
kf.get_n_splits(X, Y)

# Per-fold scores are collected here.
cvscores = []
for train_index, test_index in kf.split(X, Y):

    X_train = X[train_index, :]