Example #1
    def _model_build(self, *arg):
        self._prepare_test_data()
        model = KerasClassifier(build_fn=self.create_model, verbose=0)
        optimizers = ['adam']
        init = ['normal', 'uniform']
        epochs = [100, 150]
        batches = [5, 10]
        param_grid = dict(optimizer=optimizers, nb_epoch=epochs,
                          batch_size=batches, init=init)
        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
        grid_result = grid.fit(self.x_train, self.y_train)
        print("Best: %f using %s" % (grid_result.best_score_,
                                     grid_result.best_params_))
        # means = grid_result.cv_results_['mean_test_score']
        # stds = grid_result.cv_results_['std_test_score']
        # params = grid_result.cv_results_['params']
        # for mean, stdev, param in zip(means, stds, params):
        #     print("%f (%f) with: %r" % (mean, stdev, param))

        # Training with the best parameters
        model = Sequential()
        model.add(Dense(12, input_dim=8,
                        init=grid_result.best_params_['init'],
                        activation='relu'))
        model.add(Dense(8, init=grid_result.best_params_['init'],
                        activation='relu'))
        model.add(Dense(1, init=grid_result.best_params_['init'],
                        activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy',
                      optimizer=grid_result.best_params_['optimizer'],
                      metrics=['accuracy'])
        model.fit(self.x_train, self.y_train,
                  nb_epoch=grid_result.best_params_['nb_epoch'],
                  batch_size=grid_result.best_params_['batch_size'])
        yy_pred = model.predict(self.x_test)
        self.y_pred = [np.round(x) for x in yy_pred]
        self.y_true = self.y_test
        self.prob = model.predict_proba(self.x_test)
        self._analyse_result()
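The snippet above uses legacy Keras 1.x argument names (`init`, `nb_epoch`) and the old `keras.wrappers.scikit_learn.KerasClassifier`. A minimal sketch of the same grid search against current APIs, assuming TensorFlow 2.x with the SciKeras wrapper installed and a standalone `create_model` builder (both are assumptions, not part of the original class):

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow import keras

def create_model(init='normal'):
    # Same architecture as the final model above, with current argument names
    model = keras.Sequential([
        keras.Input(shape=(8,)),
        keras.layers.Dense(12, kernel_initializer=init, activation='relu'),
        keras.layers.Dense(8, kernel_initializer=init, activation='relu'),
        keras.layers.Dense(1, kernel_initializer=init, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

clf = KerasClassifier(model=create_model, verbose=0)
param_grid = {
    'model__init': ['normal', 'uniform'],   # routed to create_model
    'epochs': [100, 150],
    'batch_size': [5, 10],
}
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
# grid_result = grid.fit(x_train, y_train)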
Example #2
    print("%f (%f) with: %r" % (mean, stdev, param))

#
#
#   RUN MODEL
#
#

model = Sequential()

model.add(Dense(20, activation='relu', input_dim=len(elo_data.columns) - 1))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_elo,
          Y_train_elo,
          epochs=10,
          batch_size=50,
          validation_split=0.2,
          verbose=1)
model.test_on_batch(X_test_elo, Y_test_elo, sample_weight=None)
model.evaluate(X_test_elo, Y_test_elo, verbose=1)
pred = model.predict_classes(X_test_elo, verbose=1)

plot_model(model, to_file='model.png', show_shapes=True)

SVG(model_to_dot(model).create(prog='dot', format='svg'))
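Note that `Sequential.predict_classes` (used above) was removed in newer TensorFlow releases; with a single sigmoid output the class labels can be recovered by thresholding `predict`. A small sketch, assuming the same `model` and `X_test_elo`:

probs = model.predict(X_test_elo)              # probabilities in [0, 1], shape (n, 1)
pred = (probs > 0.5).astype('int32').ravel()   # hard 0/1 class labels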
Example #3
print("-------------------------------------------------------------")
print("Best Parameters selected:",grid_result.best_params_)

"""**So we have got the best hyper paramters that we will use to build our final model**"""

model = keras.Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(128, activation='relu', kernel_initializer='he_uniform', input_dim=784))

# Adding the second hidden layer
model.add(Dense(units=64, kernel_initializer='he_uniform', activation='relu'))
# Adding the output layer
model.add(Dense(units=10, kernel_initializer='glorot_uniform', activation='softmax'))

# Compiling the ANN
model.compile(optimizer='Adamax', loss='categorical_crossentropy', metrics=['accuracy'])

from keras.utils import to_categorical
# Fitting the ANN to the Training set
model_history = model.fit(train_images,
                          to_categorical(train_labels),
                          batch_size=10,
                          epochs=20)

"""We have got an accuracy of **97.19** on our training dataset"""

# summarize history for loss
from matplotlib import pyplot as plt
plt.plot(model_history.history['loss'])

plt.title('model loss')
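Only the training loss is plotted above because `fit` was called without validation data. A sketch that also tracks a validation split and plots both curves, assuming the same compiled `model`, `train_images`, and `train_labels`:

from keras.utils import to_categorical
from matplotlib import pyplot as plt

history = model.fit(train_images, to_categorical(train_labels),
                    batch_size=10, epochs=20, validation_split=0.1)
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.title('model loss')
plt.legend()
plt.show()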
    if resp.lower().startswith("s"):
        print("- CONTINUANDO Programa")
        valid = True
    elif resp.lower().startswith("n"):
        valid = True
        print("- FINALIZANDO Programa")
        exit()

classifier = build_classifier()
print(("- COMPILANDO Rede neural artificial (epochs = {0} - batch_size = {1})"
       ).format(epoch, batch_size))
print(
    "- DEFINING Compiler: optimizer = adam, loss = categorical_crossentropy, metrics = [accuracy]"
)
classifier.compile(optimizer="adam",
                   loss="categorical_crossentropy",
                   metrics=["accuracy"])
print("- TREINANDO Rede neural artificial (fitting)")
classifier.fit(X_train, Y_train, batch_size=batch_size, epochs=epoch)

# ---------------------- PREDICTION ----------------------
print("\n- COMECANDO Fase de testes (previsoes)")


def prepare4pred(X):
    Y = np.array([])
    for line in X:
        if line[0] > line[1] and line[0] > line[2]:
            Y = np.append(Y, 0)
        elif line[1] > line[0] and line[1] > line[2]:
            Y = np.append(Y, 1)
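`prepare4pred` maps each row of class probabilities to a class index with explicit comparisons (and the snippet is cut off before the third class). `np.argmax` over each row gives the same result in one call; a sketch, assuming `X` has one column per class:

import numpy as np

def prepare4pred_argmax(X):
    # Index of the largest value in each row is the predicted class (0, 1, 2, ...)
    return np.argmax(np.asarray(X), axis=1)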
Example #5
    x = layers.Dense(300, activation='relu')(inputs)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    y = layers.Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=y)
    return model


model = create_model_1()
learning_rate = 1e-3
label_smoothing = 0.0
model.compile(optimizer=tfa.optimizers.SWA(
    tf.keras.optimizers.Adam(learning_rate=learning_rate)),
              loss=losses.BinaryCrossentropy(label_smoothing=label_smoothing),
              metrics=['accuracy'])
es = callbacks.EarlyStopping(monitor='val_accuracy',
                             min_delta=0.000001,
                             patience=10,
                             verbose=Verbose,
                             mode='max',
                             baseline=None,
                             restore_best_weights=True)
sb = callbacks.ModelCheckpoint('./nn_model.w8',
                               save_weights_only=True,
                               save_best_only=True,
                               verbose=Verbose,
                               monitor='val_accuracy',
                               mode='max')
plateau = callbacks.ReduceLROnPlateau(monitor='val_accuracy',
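The `ReduceLROnPlateau` call above is cut off. A typical completion consistent with the other callbacks (monitoring `val_accuracy` in `max` mode); the `factor` and `patience` values are assumptions, not from the original:

plateau = callbacks.ReduceLROnPlateau(monitor='val_accuracy',
                                      factor=0.5,
                                      patience=5,
                                      mode='max',
                                      verbose=Verbose)
# model.fit(..., callbacks=[es, sb, plateau], validation_data=(x_val, y_val))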
Example #6
grid.fit(sequence_matrix, y_train)

print(
    grid.best_score_,
    grid.best_params_)  #returns the best cross validation score and parameters
# gave the result 0.9844460397224216 {'batch_size': 128, 'epochs': 4}

#train the model with the best epoch and batch size

inputs = Input(name='inputs', shape=[maximum_length])
layer = Embedding(maximum_word, 50, input_length=maximum_length)(inputs)
layer = LSTM(64)(layer)
layer = Dense(units=256, activation='relu')(layer)
layer = Dropout(0.5)(layer)
layer = Dense(units=1, activation='sigmoid')(layer)
model = Model(inputs=inputs, outputs=layer)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
#run the model with batch_size:128 and epochs 4
model.fit(sequence_matrix, y_train, batch_size=128, epochs=4)

#test performance on test set
test_sequences = token.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,
                                               maxlen=maximum_length)

loss_and_accuracy = model.evaluate(test_sequences_matrix, y_test)

print(loss_and_accuracy)
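To score new raw texts with this model, the same fitted tokenizer and padding length from training must be reused; a short sketch, assuming the `token`, `maximum_length`, and `model` objects above:

new_texts = ['example message to classify']
new_matrix = sequence.pad_sequences(token.texts_to_sequences(new_texts),
                                    maxlen=maximum_length)
new_probs = model.predict(new_matrix)
new_labels = (new_probs > 0.5).astype('int32')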
Example #7
# model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(x_test, y_test))
# score = model.evaluate(x_test, y_test, verbose=0)
# print('Test loss: ', score[0])
# print('Test accuracy: ', score[1])

# Pass the defined model to sklearn for cross-validation
model = KerasClassifier(build_fn=create_model, nb_epoch=20, batch_size=128)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=41)
# results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
scores_history = []
for train_ind, test_ind in kfold.split(X, Y0):
    model = Sequential()
    model.add(Dense(units=512, input_dim=784, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=num_classes, activation='softmax'))

    model.summary()

    model.compile(optimizer=RMSprop(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X[train_ind],
              Y[train_ind],
              epochs=epochs,
              batch_size=batch_size,
              verbose=0)
    scores = model.evaluate(X[test_ind], Y[test_ind], verbose=0)
    scores_history.append(scores)
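Each fold appends a `[loss, accuracy]` pair to `scores_history`; the cross-validated estimate is usually reported as the mean and standard deviation across folds, as in this sketch:

import numpy as np

cv_scores = np.array(scores_history)
print('CV loss:     %.4f (+/- %.4f)' % (cv_scores[:, 0].mean(), cv_scores[:, 0].std()))
print('CV accuracy: %.4f (+/- %.4f)' % (cv_scores[:, 1].mean(), cv_scores[:, 1].std()))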
Example #8
seed = 7
np.random.seed(seed)
classifier = Sequential()
classifier.add(
    Dense(units=66,
          kernel_initializer='uniform',
          activation='relu',
          input_dim=22))
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(units=44, kernel_initializer='uniform',
                     activation='relu'))
classifier.add(Dropout(rate=0.3))
classifier.add(
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
classifier.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])
hist = classifier.fit(X_train, y_train, batch_size=1, epochs=100)

y_pred = classifier.predict(X_test)
y_pred = (y_pred >= 0.5)

cm = confusion_matrix(y_test, y_pred)
cm

###########################
# Tuning the ANN (GridSearch)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from time import gmtime, strftime
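The tuning section above stops right after its imports. A sketch of how they are typically used: wrap a builder in `KerasClassifier`, run `GridSearchCV`, and use `gmtime`/`strftime` to log the start and end times (the builder architecture and parameter values here are assumptions):

def build_tuned_classifier():
    clf = Sequential()
    clf.add(Dense(units=44, kernel_initializer='uniform', activation='relu', input_dim=22))
    clf.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return clf

tuned = KerasClassifier(build_fn=build_tuned_classifier)
grid = GridSearchCV(estimator=tuned,
                    param_grid={'batch_size': [16, 32], 'epochs': [50, 100]},
                    scoring='accuracy',
                    cv=5)
print('Grid search started:', strftime('%Y-%m-%d %H:%M:%S', gmtime()))
# grid_result = grid.fit(X_train, y_train)
print('Grid search finished:', strftime('%Y-%m-%d %H:%M:%S', gmtime()))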
model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(100, 800, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])

history = model.fit(X_train,
                    y_train,
                    epochs=35,
                    batch_size=150, 
                    validation_data=(X_val, y_val))

model.save('best_model_imag_3clases.h5')

Y_pred_proba=model.predict(X_test)
Y_pred=model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)
# Result: 75% accuracy on the test set
Example #10
## Adding the second hidden layer

classifier.add(Dense(output_dim = 16, init = 'uniform', activation = 'relu'))


classifier.add(Dense(output_dim = 16, init = 'uniform', activation = 'relu'))

classifier.add(Dense(output_dim = 16, init = 'uniform', activation = 'relu'))

## Adding the output layer  
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))  ## only one node (yes/no) and the activation function changed to sigmoid
                                                                                  ## if the output variable is one-hot encoded, then dim=3 and activation='softmax'

## Compiling the Artificial Neural Network - finding the best weight values with stochastic approach
classifier.compile(optimizer = 'nadam', loss = 'binary_crossentropy', metrics = ['accuracy'] )    ## logarithmic loss

## Fitting the ANN to the Training set. Two additional arguments - batch size & number of epochs
classifier.fit(X_train, y_train, batch_size=25, epochs=180)




y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

Example #11
class NeuralNetwork:
    def __init__(self, ds: Dataset):
        self.ds = ds

    def create_model(self, typ):
        clear_session()
        if typ == 'Sequential':
            self.model = Sequential()

    def add_final_layer(self):
        if self.ds.y_train.nunique() == 2:
            self.model.add(layers.Dense(1, activation='sigmoid'))
            self.model.compile(optimizer='adam',
                               loss='binary_crossentropy',
                               metrics=['accuracy'])

    def sequential(self, layers, activation='relu'):
        """Create a sequential Keras model"""
        self.create_model('Sequential')
        self.add_dense(layers, activation)
        self.add_final_layer()

    def add_dense(self, filters, activation='relu'):
        for x in filters:
            self.model.add(layers.Dense(x, activation=activation))

    def add_embedding_layer(self):
        if self.ds.weights is not None:
            self.model.add(
                layers.Embedding(input_dim=self.ds.vocab_size,
                                 output_dim=self.ds.weights.shape[1],
                                 input_length=self.ds.X_train.shape[1],
                                 weights=[self.ds.weights]))
        else:
            self.model.add(
                layers.Embedding(input_dim=self.ds.vocab_size,
                                 output_dim=self.ds.weights.shape[1],
                                 input_length=self.ds.X_train.shape[1]))

    def embedding_to_sequential(self, filters, activation='relu'):
        self.create_model('Sequential')
        self.add_embedding_layer()
        self.model.add(layers.GlobalMaxPool1D())
        self.add_dense(filters)
        self.add_final_layer()

    def cnn(self, cnn_filters, kernel_sizes, dense_filters, activation='relu'):
        self.create_model('Sequential')
        self.add_embedding_layer()
        self.add_cnn(cnn_filters, kernel_sizes)
        self.model.add(layers.GlobalMaxPool1D())
        self.add_dense(dense_filters)
        self.add_final_layer()

    def grid_search(self, param_grid, **kwargs):
        #kwargs: epoch=10, verbose=False
        #not sure if this works, might need to return model
        if 'kernel_size' in param_grid:
            self.model = KerasClassifier(build_fn=self.cnn, **kwargs)
        elif 'filters' in param_grid:
            self.model = KerasClassifier(build_fn=self.embedding_to_sequential,
                                         **kwargs)
        else:
            self.model = KerasClassifier(build_fn=self.sequential, **kwargs)
        grid = RandomizedSearchCV(estimator=self.model,
                                  param_distributions=param_grid,
                                  verbose=1)
        grid.fit(self.ds.X_train, self.ds.y_train)
        test_accuracy = grid.score(self.ds.X_test, self.ds.y_test)
        print(f'Test accuracy for best model is {test_accuracy}')
        return grid

    def add_cnn(self, filters, kernels):
        assert len(filters) == len(kernels)
        for i in range(len(filters)):
            self.model.add(layers.Conv1D(filters[i], kernels[i]))

    def train(self, **kwargs):
        self.model.fit(self.ds.X_train, self.ds.y_train, **kwargs)

    def evaluate(self):
        trainl, traina = self.model.evaluate(self.ds.X_train,
                                             self.ds.y_train,
                                             verbose=False)
        vall, vala = self.model.evaluate(self.ds.X_test,
                                         self.ds.y_test,
                                         verbose=False)
        print(f'Training loss {trainl}, training accuracy {traina}')
        print(f'Validation loss {vall}, validation accuracy {vala}')
        return vall, vala

    def plot_history(self, path):
        metrics = ['acc', 'val_acc', 'loss', 'val_loss']
        metric_dict = {
            'acc': ['Training Accuracy', 'b'],
            'val_acc': ['Validation Accuracy', 'r'],
            'loss': ['Training Loss', 'b'],
            'val_loss': ['Validation Loss', 'r']
        }

        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.title('Training and validation accuracy')
        for metric in metrics:
            if metric == 'loss':
                # Accuracy curves are done; add their legend and switch to the loss subplot
                plt.legend()
                plt.subplot(1, 2, 2)
                plt.title('Training and validation loss')
            met = self.model.history.history[metric]
            x = range(1, len(met) + 1)
            plt.plot(x, met, metric_dict[metric][1], label=metric_dict[metric][0])

        plt.legend()
        plt.savefig(path)
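A short usage sketch for the class above, assuming a `Dataset` object (`ds`) that exposes the attributes the class reads (`X_train`, `y_train`, `X_test`, `y_test`) and a binary target, since `add_final_layer` only compiles the model for two classes:

nn = NeuralNetwork(ds)
nn.sequential([64, 32])                      # two dense hidden layers + sigmoid output
nn.train(epochs=10, batch_size=32, verbose=False)
val_loss, val_acc = nn.evaluate()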
Example #12
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe = Pipeline([('minmax', MinMaxScaler()), ('model', model)])
search = RandomizedSearchCV(estimator=pipe, param_distributions=hyperparameters, n_iter=20, n_jobs=16, cv=5)

search.fit(x_train, y_train)
print(search.best_params_)
print('score :', search.score(x_test, y_test))
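Because the estimator is wrapped in a `Pipeline` under the step name `'model'`, the keys in `hyperparameters` must carry the `model__` prefix to reach the wrapped Keras estimator; a sketch of what such a dictionary might look like (the values are assumptions):

hyperparameters = {
    'model__batch_size': [32, 64, 128],
    'model__epochs': [50, 100],
}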
# Run the model

# Print the results
# print('\n Accuracy: %.4f'%(model.evaluate(x, y)[1]))


'''
inputs = Input(shape=(8, ), name='input')
x = Dense(512, activation='relu', name='hidden1')(inputs)
x = Dropout(keep_prob)(x)
x = Dense(256, activation='relu', name='hidden2')(x)
x = Dropout(keep_prob)(x)
x = Dense(128, activation='relu', name='hidden3')(x)
x = Dropout(keep_prob)(x)
x = Dense(128, activation='relu', name='hidden4')(x)
x = Dropout(keep_prob)(x)
x = Dense(256, activation='relu', name='hidden5')(x)
x = Dropout(keep_prob)(x)
x = Dense(512, activation='relu', name='hidden6')(x)
x = Dropout(keep_prob)(x)
prediction = Dense(1, activation='sigmoid', name='output')(x)
model = Model(inputs=inputs, outputs=prediction)
X_train = np.array(X_train)
X_test= np.array(X_test)
X_val= np.array(X_val)
y_train=np.asarray(pd.get_dummies(y_train))
y_val=np.asarray(pd.get_dummies(y_val))
y_test=np.asarray(pd.get_dummies(y_test))


model = models.Sequential()
model.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01), input_shape=(32,)))
model.add(layers.Dense(128, activation='relu', ))
model.add(layers.Dense(64, activation='relu',))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

history = model.fit(X_train,
                    y_train,
                    epochs=35,
                    batch_size=120, 
                    validation_data=(X_val, y_val))

model.save('best_model_feat_10classes.h5')

Y_pred_proba=model.predict(X_test)
Y_pred=model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
Example #14
L = GRU(n_x, 
        activation='tanh', 
        kernel_regularizer=regularizers.l2(0.001))(L)
print ('Post GRU is:', L)
L = Dense(2, 
          activation='softmax', 
          kernel_regularizer=regularizers.l2(0.001))(L)
print('Dense layer is:', L)

model = Model(inputs=sequence_input, outputs=L)

# Optimization and compile
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
print('Begin compiling...')
model.compile(loss='categorical_crossentropy', 
              optimizer=opt, 
              metrics=['accuracy'])
model.summary()

# Begin training
model.fit(data_train, 
          Y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          verbose=2,
          validation_data=(data_val, Y_val))
score = model.evaluate(data_test, Y_test, batch_size=batch_size)
print ('The evaluation is: ', score)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)
 
# adding Input layer and first hidden layer with dropout
classifier.add(Dense(output_dim=6, init="uniform", activation="relu", input_dim=11))
classifier.add(Dropout(p=0.1))
# adding second hidden layer
classifier.add(Dense(output_dim=6, init="uniform", activation="relu"))
classifier.add(Dropout(p=0.1))

# adding the output layer
# if the target has more than two classes, use more output nodes instead of 1
classifier.add(Dense(output_dim=1, init="uniform", activation="sigmoid"))

# compiling the ANN
# a binary dependent variable uses binary_crossentropy as the loss function
# a dependent variable with more than two outcomes uses categorical_crossentropy
classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


#Tuning the ann
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
def build_classifier(optimizer):
    classifier = Sequential()
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier
classifier = KerasClassifier(build_fn = build_classifier)
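The snippet ends just after wrapping `build_classifier`; the typical continuation defines the parameter grid (including the `optimizer` argument that `build_classifier` accepts) and fits the search. The specific values below are illustrative assumptions:

parameters = {'batch_size': [25, 32],
              'epochs': [100, 500],
              'optimizer': ['adam', 'rmsprop']}
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10)
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_, grid_search.best_score_)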
Example #16
test_X = X[split_value:, :]
test_Y = Y[split_value:]

train_X = X[:split_value, :]
train_Y = Y[:split_value]

opt = Adamax(lr=0.01)
opt = Adam(lr=0.01)
opt = SGD(lr=0.01, momentum=0.0)  # only this last assignment is actually used below

model = Sequential()
model.add(LSTM(40, return_sequences=True))
model.add(Dropout(0.0))
model.add(Dense(1))
model.add(Flatten())
model.compile(loss='mean_absolute_error', optimizer=opt, metrics=['accuracy'])

history = model.fit(train_X,
                    train_Y,
                    epochs=100,
                    batch_size=40,
                    validation_data=(test_X, test_Y),
                    verbose=1)

print(history.history.keys())
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
Example #17
def main():
    ########################
    ### Parse Input Args ###
    ########################
    parser = argparse.ArgumentParser(
        description='Predicting Traits Using Convolutional Neural Networks. '
        'See README.md for more information about the pipeline usage.\n'
        'Written by: Emily Bolger\nModified by: Christina Azodi',
        epilog='https://github.com/ShiuLab')

    ## Required
    req_group = parser.add_argument_group(title='REQUIRED INPUT')
    req_group.add_argument(
        '-x',
        help='Feature data. Format options available: '
        ' 1) Matrix with one row for every instance. 2) Directory with one '
        'matrix for every instance with file name matching instance name '
        'provided in -y',
        required=True)
    req_group.add_argument('-y',
                           help='Matrix with label to predict',
                           required=True)
    req_group.add_argument('-test',
                           help='List of instances to use as test set',
                           required=True)
    req_group.add_argument('-save', help='Name for Output File', required=True)

    ## Optional
    inp_group = parser.add_argument_group(title='OPTIONAL INPUT')
    inp_group.add_argument('-feat',
                           help="List of column names in -x to use",
                           default=False)
    inp_group.add_argument(
        '-x_sort',
        help='Method to sort feature (-x) data '
        'by column. Options: (False, alpha, file_with_order, cluster)',
        default=False)
    inp_group.add_argument('-shape',
                           help='Dimension of -x (e.g. for input '
                           'with 4 rows and 6 columns: -shape 4,6)',
                           default='default')
    inp_group.add_argument('-onehot',
                           help='T/F to convert 1xX data into matrix'
                           ' by one-hot encoding',
                           default='F')
    inp_group.add_argument('-onehot_order',
                           help='Order for 1-hot y axis. For '
                           'example: -onehot_order 1,0,-1)',
                           default=False)
    inp_group.add_argument('-y_name', help='Col name to predict', default='Y')
    inp_group.add_argument('-sep', help='Deliminator for X & Y', default='\t')

    # How to run CNN
    inp_group.add_argument('-run',
                           help='T/F to run the final model. If F, will '
                           'only run the grid search if -gs T',
                           default='f')
    inp_group.add_argument('-n',
                           help='Num replicates of model... Different '
                           'train/validation split each replicate',
                           type=int,
                           default=10)
    inp_group.add_argument('-n_jobs',
                           '-p',
                           help='Number of processors for '
                           'parallel computing (max for HPCC = 14)',
                           type=int,
                           default=1)
    inp_group.add_argument('-cv',
                           help='Number of cross validation folds',
                           type=int,
                           default=5)

    # Parameter Selection
    inp_group.add_argument(
        '-params',
        help='How to select parameters. Options: '
        'grid search (gs), default, from XXX_GridSearch.txt (provide path).',
        default='default')
    inp_group.add_argument('-gs_reps',
                           help='Number of combinations of '
                           'parameters to test in the grid search',
                           type=int,
                           default=100)

    # Default CNN parameters
    inp_group.add_argument('-cnn_type',
                           help='CNN architecture. Options: '
                           '(simple, DeepGS)',
                           default="simple")
    inp_group.add_argument('-filters',
                           help='Number of kernels/filters in each '
                           'CNN layer',
                           type=int,
                           default=32)
    inp_group.add_argument('-kernel_l',
                           help='Length of kernel (height '
                           'defaults to the full height of the dataset)',
                           type=int,
                           default=16)
    inp_group.add_argument('-stride_len',
                           help='Stride of Convolution kernels '
                           '(width defaults to 1)',
                           type=int,
                           default=1)
    inp_group.add_argument('-activation',
                           help='Activation function in all but '
                           'last dense layer, which is set to linear',
                           type=str,
                           default='relu')
    inp_group.add_argument('-pool_size',
                           help='Size of max pooling layer filter '
                           '(first number only, second defaults to 1)',
                           type=int,
                           default=8)
    inp_group.add_argument('-optimizer',
                           help='Optimization function to use',
                           type=str,
                           default='Adam')
    inp_group.add_argument('-dropout',
                           help='Value for Dropout Rate',
                           type=float,
                           default=0.5)
    inp_group.add_argument('-l2',
                           help='Value for L2 regularization',
                           type=float,
                           default=0.2)
    inp_group.add_argument('-learn_rate',
                           help='Value for Learning Rate',
                           type=float,
                           default=0.01)
    inp_group.add_argument('-clip_value',
                           help='Clip Value',
                           type=float,
                           default=0.5)
    inp_group.add_argument('-patience',
                           help='Patience for Early Stopping',
                           type=int,
                           default=10)
    inp_group.add_argument('-min_delta',
                           help='Minimum Delta Value for Early '
                           'Stopping',
                           type=float,
                           default=0)
    inp_group.add_argument('-num_epochs',
                           help='Max number of Epochs',
                           type=int,
                           default=1000)
    inp_group.add_argument('-n_channels',
                           help='Num channels',
                           type=int,
                           default=1)

    # Argument parsing
    args = parser.parse_args()

    if args.shape == 'default':
        tmp = pd.read_csv(args.x, sep=args.sep, index_col=0)
        shape_r, shape_c = 1, tmp.shape[1]
    else:
        shape_r, shape_c = args.shape.strip().split(',')
        shape_r = int(shape_r)
        shape_c = int(shape_c)

    ########################
    ### Parse Input Data ###
    ########################
    print("\n***** Loading Data ******\n")

    # Step 1: Read in x file, if feat file given only keep those features
    if os.path.isfile(args.x):
        x = pd.read_csv(args.x, sep=args.sep, index_col=0)
        x.index = x.index.astype('str')
        instance_order = list(x.index.values)
        with open(args.test) as test_file:
            test_instances = test_file.read().splitlines()
            test_instances = [str(i) for i in test_instances]
            train_val_instances = list(
                set(instance_order) - set(test_instances))
            test_index = [x.index.get_loc(i) for i in test_instances]
            train_val_index = [x.index.get_loc(i) for i in train_val_instances]
        if args.feat:
            with open(args.feat) as f:
                features = f.read().strip().splitlines()
            x = x.loc[:, features]
    elif os.path.isdir(args.x):
        x = ANN.fun.Image2Features(args.x, shape_r, shape_c)
    n_instances = x.shape[0]
    n_feats = x.shape[1]
    print("Total number of instances: %i" % n_instances)
    print("Number of features used: %i" % n_feats)

    # Step 2: Sort x data
    if args.x_sort == 'alpha':
        print('Sorting feature data by column alpha numerically...')
        x = x.reindex(sorted(x.columns), axis=1)
    elif args.x_sort == 'cluster':
        print('Sorting feature data by column using clustering...')
        print('\n\nNOT IMPLEMENTED YET... PROGRESSING WITHOUT SORTING...\n\n')
    else:
        if not args.x_sort:
            print('Using -x in the order provided in -x or in -feat')
        else:
            with open(args.x_sort) as order:
                order_list = order.read().strip().splitlines()
            x = x.loc[:, order_list]
    print('\nSnapshot of input feature data:')
    print(x.head())

    # Step 3: One-hot-encode X if required
    if args.onehot.lower() in ['t', 'true']:
        x_1hot_list = []
        x = x.round(0)
        labels = pd.unique(x.values.ravel())
        ohe = preprocessing.OneHotEncoder(categories='auto', sparse=False)
        for i in range(len(x)):
            x_row = np.array(x.iloc[i, ]).reshape(n_feats, -1)
            oh_matrix = ohe.fit_transform(x_row)
            if oh_matrix.shape[1] < len(labels):
                labels_present = pd.unique(x_row.ravel())
                missing = list(set(labels) - set(labels_present))
                print(
                    "Instance in row %i has no '%s', so adding it by hand..." %
                    (i, missing))
                x_row = np.append(x_row, np.array([missing]), axis=0)
                oh_matrix = ohe.fit_transform(x_row)
                oh_matrix = oh_matrix[:-1, :]
            x_1hot_list.append(oh_matrix)
        x = np.swapaxes(np.array(x_1hot_list), 1, 2)

    data_height = x.shape[1]
    data_width = x.shape[2]
    x = x.reshape((n_instances, data_height, data_width, args.n_channels))
    print("\nShape of feature data used for training/validation/testing:")
    print(x.shape)

    print("\nSnapshot of feature data for first instance in data set:")
    print(x[0, :, :, 0])

    # Step 4: Read in Y data and make sure sorted as in -x
    y = pd.read_csv(args.y, sep=args.sep, index_col=0)
    y.index = y.index.astype('str')
    y = y[[args.y_name]]

    print("\nShape of Label Data:")
    print(y.shape)

    # Step 5: Remove testing data
    x_test = x[test_index, :, :, :]
    x_train = x[train_val_index, :, :, :]
    y_test = y.loc[test_instances]
    y_train = y.loc[train_val_instances]

    ################################
    ### Define CNN architectures ###
    ################################

    def tfp_pearson(y_true, y_pred):
        return tfp.stats.correlation(y_pred, y_true, event_axis=None)

    def make_cnn_model(cnn_type=args.cnn_type,
                       learn_rate=args.learn_rate,
                       filters=args.filters,
                       pool_size=args.pool_size,
                       kernel_l=args.kernel_l,
                       kernel_h=data_height,
                       activation=args.activation,
                       optimizer=args.optimizer,
                       units=1):

        if optimizer.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(lr=learn_rate,
                                           clipvalue=args.clip_value)
        elif optimizer.lower() == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=learn_rate,
                                            clipvalue=args.clip_value)
        elif optimizer.lower() == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=learn_rate,
                                              clipvalue=args.clip_value)

        if cnn_type.lower() == 'simple':
            K.clear_session()
            model = models.Sequential()
            model.add(
                layers.Conv2D(
                    filters=filters,
                    kernel_size=tuple([kernel_h, kernel_l]),
                    kernel_regularizer=tf.keras.regularizers.l2(args.l2),
                    strides=tuple([args.stride_len, 1]),
                    activation=activation,
                    kernel_initializer='glorot_normal',
                    input_shape=(data_height, data_width, args.n_channels)))
            model.add(layers.MaxPooling2D(pool_size=tuple([1, pool_size])))
            model.add(layers.Flatten())
            model.add(layers.Dropout(args.dropout))
            model.add(layers.Dense(24, activation=activation))
            model.add(layers.BatchNormalization())
            model.add(layers.Dense(units=units, activation='linear'))
            model.compile(optimizer=opt, loss='mean_squared_error')

        elif cnn_type.lower() == 'deepgs':
            K.clear_session()
            model = models.Sequential()
            model.add(
                layers.Conv2D(filters=filters,
                              kernel_size=tuple([kernel_h, kernel_l]),
                              strides=tuple([args.stride_len, 1]),
                              activation=activation,
                              kernel_initializer='glorot_normal',
                              input_shape=(data_height, data_width,
                                           args.n_channels)))
            model.add(layers.MaxPooling2D(pool_size=tuple([1, pool_size])))
            model.add(
                layers.Conv2D(filters=filters,
                              kernel_size=tuple([1, kernel_l]),
                              strides=tuple([args.stride_len, 1]),
                              activation=activation))
            model.add(layers.MaxPooling2D(pool_size=tuple([1, pool_size])))
            model.add(layers.Dropout(args.dropout))
            model.add(layers.Flatten())
            model.add(layers.Dense(units=24, activation=activation))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(args.dropout))
            model.add(layers.Dense(units=units, activation='linear'))
            model.compile(optimizer=opt, loss='mean_squared_error')
        return model

    ####################
    ### Grid Search  ###
    ####################

    if args.params.lower() == 'gs':
        print('\n***** Starting Random Search with %i reps using %i training '
              'instances and %i fold cross-validation *****\n' %
              (args.gs_reps, x_train.shape[0], args.cv))
        scoring = {
            'neg_mse': 'neg_mean_squared_error',
            'exp_var': 'explained_variance'
        }
        param_grid = dict(learn_rate=[1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
                          filters=[8, 16, 32],
                          kernel_l=[8, 16, 32],
                          pool_size=[4, 8, 16],
                          activation=["relu", "selu", "elu"],
                          optimizer=['RMSprop', 'Adam', 'nadam'],
                          cnn_type=['simple', 'deepgs'])
        model = KerasClassifier(build_fn=make_cnn_model,
                                batch_size=100,
                                epochs=50,
                                verbose=1)
        rand_search = RandomizedSearchCV(estimator=model,
                                         param_distributions=param_grid,
                                         cv=args.cv,
                                         n_iter=args.gs_reps,
                                         n_jobs=args.n_jobs,
                                         verbose=1,
                                         scoring=scoring,
                                         refit='neg_mse')
        gs_result = rand_search.fit(x_train, y_train)
        gs_result_df = pd.DataFrame.from_dict(gs_result.cv_results_)

        print("Saving Grid Search Results....")
        print(gs_result_df.head())
        with open(args.save + "_GridSearch.txt", 'a') as out_gs:
            gs_result_df.to_csv(out_gs, header=out_gs.tell() == 0, sep='\t')

    ########################
    ### Run Final Models ###
    ########################

    if args.run.lower() in ['t', 'true']:

        # Step 1: Define the parameters from the Grid Search or use default
        if args.params.lower() != 'default':
            if args.params.lower() != 'gs':
                gs_result_df = pd.read_csv(args.params, sep='\t')
                gs_result_df.fillna(0, inplace=True)

            gs_mean = gs_result_df.groupby([
                'param_filters', 'param_optimizer', 'param_learn_rate',
                'param_kernel_l', 'param_pool_size', 'param_cnn_type',
                'param_activation'
            ]).agg({
                'mean_test_score': 'mean',
                'std_test_score': 'mean'
            }).reset_index()

            gs_mean = gs_mean.sort_values(by='mean_test_score',
                                          ascending=False)
            print('\nSnapshot of grid search results:')
            print(gs_mean.head())

            args.cnn_type = gs_mean['param_cnn_type'].iloc[0]
            args.pool_size = int(gs_mean['param_pool_size'].iloc[0])
            args.learn_rate = float(gs_mean['param_learn_rate'].iloc[0])
            args.kernel_l = int(gs_mean['param_kernel_l'].iloc[0])
            args.filters = int(gs_mean['param_filters'].iloc[0])
            args.activation = gs_mean['param_activation'].iloc[0]
            args.optimizer = gs_mean['param_optimizer'].iloc[0]

        print('\n***** Running CNN models ******')
        print('CNN Architecture: %s\nOptimizer: %s\nActivation function:'
              ' %s\nLearning Rate: %f\nNumber of kernels: '
              '%i\nKernel shape: [%i, %i]\nPooling Size: [%i, 1]\n' %
              (args.cnn_type, args.optimizer, args.activation, args.learn_rate,
               args.filters, args.kernel_l, data_height, args.pool_size))

        for i in range(args.n):
            print('Rep %i of %i' % (i, args.n))
            run = True

            while run:
                # Step 2: Creating CNN model using Tensorflow
                model = make_cnn_model(cnn_type=args.cnn_type,
                                       learn_rate=args.learn_rate,
                                       optimizer=args.optimizer,
                                       filters=args.filters,
                                       pool_size=args.pool_size,
                                       kernel_l=args.kernel_l,
                                       kernel_h=data_height,
                                       activation=args.activation,
                                       units=1)
                #print(model.summary())

                # Step 3: Split training into training2 and validation
                x_train2, x_val, y_train2, y_val = train_test_split(
                    x_train, y_train, test_size=0.2)
                print('Train on %i, validate on %i, test on %i' %
                      (x_train2.shape[0], x_val.shape[0], x_test.shape[0]))

                # Step 4: Define optimizer and early stopping criteria & train
                model.compile(optimizer=args.optimizer,
                              loss='mean_squared_error',
                              metrics=[tfp_pearson])

                earlystop_callback = EarlyStopping(monitor='val_loss',
                                                   mode='min',
                                                   min_delta=args.min_delta,
                                                   patience=args.patience,
                                                   restore_best_weights=True,
                                                   verbose=1)

                model.fit(x_train2,
                          y_train2,
                          batch_size=100,
                          epochs=args.num_epochs,
                          verbose=1,
                          callbacks=[earlystop_callback],
                          validation_data=(x_val, y_val))

                # Step 5: Apply best model to train, val, test, and report results
                train_mse, train_pcc = model.evaluate(x_train2, y_train2)
                val_mse, val_pcc = model.evaluate(x_val, y_val)
                if val_pcc > 0:
                    run = False
                else:
                    print('\nPCC was negative on valid data.. retraining...')

            test_mse, test_pcc = model.evaluate(x_test, y_test)
            if np.isnan(test_pcc):
                # Still don't know why this happens, but this fixes it...
                print('Recalculating PCC using Numpy...')
                pred = model.predict(x_test).tolist()
                pred2 = [i for sublist in pred for i in sublist]
                test_pcc = np.corrcoef(pred2, y_test[args.y_name].values)[0, 1]

            print('PCC: train, val, and test: %3f, %3f, %3f' %
                  (train_pcc, val_pcc, test_pcc))

            if not os.path.isfile('RESULTS.txt'):
                out = open('RESULTS.txt', 'w')
                out.write(
                    'ID\tX\tY\ty_name\ttest_set\t'
                    'CNN_Type\tLearn_Rate\tMin_Delta\tPatience\tActivation\t'
                    'Optimizer\tKernel_num\tKernel_len\tPooling_Size\tDropout'
                    '\tTrain_mse\tTrain_PCC\tVal_mse\tVal_PCC\tTest_mse\t'
                    'Test_PCC\n')
                out.close()

            out = open('RESULTS.txt', "a")
            out.write(
                '%s\t%s\t%s\t%s\t%s\t'
                '%s\t%f\t%f\t%i\t%s\t%s\t'
                '%i\t%i\t%i\t%f\t%f\t'
                '%f\t%f\t%f\t%f\t%f\n' %
                (args.save, args.x, args.y, args.y_name, args.test,
                 args.cnn_type, args.learn_rate, args.min_delta, args.patience,
                 args.activation, args.optimizer, args.filters, args.kernel_l,
                 args.pool_size, args.dropout, train_mse, train_pcc, val_mse,
                 val_pcc, test_mse, test_pcc))
            out.close()

        print('\nDone!')
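The excerpt ends inside `main()`; presumably the script closes with the standard entry-point guard:

if __name__ == '__main__':
    main()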
Example #18
def main():

    ########################
    ### Parse Input Args ###
    ########################
    parser = argparse.ArgumentParser(
        description='CNN classification code implemented using TensorFlow v2.0',
        epilog='https://github.com/azodichr')

    parser.add_argument('-x', help='Feature numpy dataset', required=True)
    parser.add_argument('-y', help='Class/Y numpy dataset', required=True)
    parser.add_argument('-run', help='T/F to run final models', default='t')
    parser.add_argument('-splits',
                        help='Values for train/val/test',
                        default='70,10,20')
    parser.add_argument('-y_name', help='Phenotype Trait')
    parser.add_argument('-f', help='Function: gs, run, full', default='full')
    parser.add_argument('-save', help='Name for Output File', default='test')
    parser.add_argument('-balance',
                        help='t/f to down-sample so that classes are balanced',
                        default='t')
    parser.add_argument('-n_channels',
                        help='Number of channels',
                        default=1,
                        type=int)
    parser.add_argument('-cv',
                        help='Number of cross validation folds',
                        type=int,
                        default=5)
    parser.add_argument('-n_jobs',
                        '-p',
                        help='Number of processors for '
                        'parallel computing (max for HPCC = 14)',
                        type=int,
                        default=1)
    parser.add_argument('-save_model',
                        help='T/F if want to save final models',
                        type=str,
                        default='f')
    parser.add_argument('-tag',
                        help='Identifier String to add to RESULTS',
                        type=str,
                        default='cnn')
    parser.add_argument('-save_detailed',
                        help='T/F Save detailed model performance',
                        type=str,
                        default='f')
    parser.add_argument('-original_df',
                        help='DF fed into input_converter.py',
                        type=str,
                        default='')
    parser.add_argument('-imp_m',
                        help='T/F to calculate importance of each motif',
                        type=str,
                        default='f')
    parser.add_argument('-imp_k',
                        help='T/F to calculate importance of each kernel',
                        type=str,
                        default='f')

    # Default Hyperparameters
    parser.add_argument('-params',
                        help='Output from -f gs (i.e. '
                        'SAVE_GridSearch.txt)',
                        default='default')
    parser.add_argument('-actfun',
                        help='Activation function. (relu, sigmoid)',
                        default='relu')
    parser.add_argument('-learn_rate',
                        help='Learning Rate',
                        default=0.01,
                        type=float)
    parser.add_argument('-dropout',
                        help='Dropout rate',
                        default=0.25,
                        type=float)
    parser.add_argument('-l2',
                        help='Shrinkage parameter for L2 regularization',
                        default=0.25,
                        type=float)
    parser.add_argument('-filters',
                        help='Number of Kernels/filters',
                        default=8,
                        type=int)
    parser.add_argument('-optimizer',
                        help='Optimization function to use',
                        type=str,
                        default='Adam')
    parser.add_argument('-dense',
                        help='Number of nodes in dense layer',
                        type=int,
                        default=16)
    parser.add_argument('-activation',
                        help='Activation function in all but '
                        'last dense layer, which is set to linear',
                        type=str,
                        default='relu')
    parser.add_argument('-n_reps',
                        '-n',
                        help='Number of replicates (unique '
                        'validation set/starting weights for each)',
                        default=100,
                        type=int)
    parser.add_argument('-clip_value',
                        help='Clip Value',
                        type=float,
                        default=0.5)
    parser.add_argument('-patience',
                        help='Patience for Early Stopping',
                        type=int,
                        default=5)
    parser.add_argument('-min_delta',
                        help='Minimum Delta Value for Early '
                        'Stopping',
                        type=float,
                        default=0)

    # Grid Search reps/space
    parser.add_argument('-gs_reps',
                        '-gs_n',
                        help='Number of Grid Search Reps '
                        '(will append results if SAVE_GridSearch.csv exists)',
                        type=int,
                        default=10)
    parser.add_argument('-actfun_gs',
                        help='Activation functions for Grid '
                        'Search',
                        nargs='*',
                        default=['relu', 'selu', 'elu'])
    parser.add_argument('-dropout_gs',
                        help='Dropout rates for Grid Search',
                        nargs='*',
                        type=float,
                        default=[0.0, 0.1, 0.25])
    parser.add_argument('-l2_gs',
                        help='Shrinkage parameters for L2 for Grid '
                        'Search',
                        nargs='*',
                        type=float,
                        default=[0.01, 0.1, 0.25])
    parser.add_argument('-lrate_gs',
                        help='Learning Rate',
                        nargs='*',
                        type=float,
                        default=[0.1, 0.01, 0.001, 0.0001])
    parser.add_argument('-kernels_gs',
                        help='Number of Kernels for Grid Search',
                        default=[4, 8, 16, 24],
                        type=int)

    args = parser.parse_args()
    k_height = 'tmp'
    args.k_len = 'tmp'

    def downsample(x, y):
        unique, counts = np.unique(y, return_counts=True)
        smaller_index = list(counts).index(min(counts))
        bigger_index = list(counts).index(max(counts))

        i_smaller = np.where(y == unique[smaller_index])[0]
        i_bigger = np.where(y == unique[bigger_index])[0]
        downsample_n = len(i_smaller)
        i_bigger_downsampled = np.random.choice(i_bigger,
                                                size=downsample_n,
                                                replace=False)

        i_keep = list(i_smaller) + list(i_bigger_downsampled)
        return x[i_keep], y[i_keep]

    def make_cnn_model(learn_rate=args.learn_rate,
                       filters=args.filters,
                       dropout=args.dropout,
                       dense=args.dense,
                       l2=args.l2,
                       activation=args.activation,
                       optimizer=args.optimizer,
                       units=1):

        if optimizer.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(lr=learn_rate,
                                           clipvalue=args.clip_value)
        elif optimizer.lower() == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=learn_rate,
                                            clipvalue=args.clip_value)
        elif optimizer.lower() == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=learn_rate,
                                              clipvalue=args.clip_value)
        elif optimizer.lower() == 'sgdm':
            opt = tf.keras.optimizers.SGD(lr=learn_rate,
                                          decay=1e-6,
                                          clipvalue=args.clip_value,
                                          momentum=0.9,
                                          nesterov=True)

        conv2d_layer = layers.Conv2D(
            filters=filters,
            kernel_size=tuple([k_height, 1]),
            kernel_regularizer=tf.keras.regularizers.l2(l2),
            activation=activation,
            kernel_initializer='glorot_normal',
            input_shape=(n_rows, n_columns, args.n_channels),
            name='conv2d_layer')
        K.clear_session()
        model = models.Sequential()
        model.add(conv2d_layer)
        model.add(layers.Flatten())
        model.add(layers.Dense(dense, activation=activation))
        model.add(layers.Dropout(dropout))
        model.add(layers.Dense(units=1, activation='sigmoid'))
        model.compile(optimizer=opt, loss='binary_crossentropy')

        return model, conv2d_layer

    ##########################
    ### Data preprocessing ###
    ##########################
    x_all = np.load(args.x)
    y_all = np.load(args.y)
    x_all = x_all.reshape(x_all.shape + (args.n_channels, ))

    if args.balance.lower() in ['t', 'true']:
        x, y = downsample(x_all, y_all)

        print('Y shape (down-sampled): %s' % str(y.shape))
        print('X shape (down-sampled): %s' % str(x.shape))
    else:
        y = y_all
        x = x_all

    print("\nSnapshot of feature data for first instance in data set:")
    print(x[0, :, 0:5, 0])
    n = y.shape[0]
    n_rows = x.shape[1]
    n_columns = x.shape[2]

    k_height = x.shape[1]
    args.k_len = 1
    print('Kernel dimensions: ', k_height, args.k_len)

    ###################
    ### Grid Search ###
    ###################

    if args.params.lower() == 'gs':
        print('\n***** Starting Random Search with %i reps using %i training '
              'instances and %i fold cross-validation *****\n' %
              (args.gs_reps, x.shape[0], args.cv))
        scoring = {'acc': 'accuracy', 'f1': 'f1'}
        param_grid = dict(
            learn_rate=[0.1, 0.01, 0.001],
            filters=[8, 16],
            dense=[8, 16, 32],
            l2=[0.1, 0.25],  #, 0.5],
            dropout=[0.1, 0.25],  #, 0.5],
            activation=["relu"],  #, 'selu', 'elu'],
            optimizer=['RMSprop', 'Adam', 'nadam'])
        model = KerasClassifier(build_fn=make_cnn_model,
                                batch_size=100,
                                epochs=30,
                                verbose=0)
        rand_search = RandomizedSearchCV(estimator=model,
                                         param_distributions=param_grid,
                                         cv=args.cv,
                                         n_iter=args.gs_reps,
                                         n_jobs=args.n_jobs,
                                         scoring=scoring,
                                         refit='acc',
                                         verbose=0)
        gs_result = rand_search.fit(x, y)
        gs_result_df = pd.DataFrame.from_dict(gs_result.cv_results_)

        print("Saving Grid Search Results....")
        print(gs_result_df.head())
        with open(args.save + "_GridSearch.txt", 'a') as out_gs:
            gs_result_df.to_csv(out_gs, header=out_gs.tell() == 0, sep='\t')

    print('\n\n Grid Search results saved to: %s_GridSearch.txt\n' % args.save)

    ################
    ### Run final model
    ################

    if args.run.lower() in ['t', 'true']:
        print('####### Running Final Model(s) ###########')

        # Step 1: Define the parameters from the Grid Search or use default
        if args.params.lower() != 'default':
            if args.params.lower() != 'gs':
                gs_result_df = pd.read_csv(args.params, sep='\t')
                gs_result_df.fillna(0, inplace=True)

            gs_mean = gs_result_df.groupby([
                'param_filters', 'param_optimizer', 'param_learn_rate',
                'param_dropout', 'param_l2', 'param_dense', 'param_activation'
            ]).agg({
                'mean_test_acc': 'mean',
                'std_test_acc': 'mean',
                'mean_fit_time': 'count'
            }).reset_index()

            print('Parameter Search Coverage:\nMin: %i\nMean: %.3f\nMax: %i' %
                  (gs_mean['mean_fit_time'].min(),
                   gs_mean['mean_fit_time'].mean(),
                   gs_mean['mean_fit_time'].max()))

            if gs_mean['mean_fit_time'].min() == 1:
                print('Dropping parameter combinations with < 2 replicates...')
                gs_mean = gs_mean[gs_mean['mean_fit_time'] >= 2]

            gs_mean = gs_mean.sort_values(by='mean_test_acc', ascending=False)
            print('\nSnapshot of grid search results:')
            print(gs_mean.head())

            args.learn_rate = float(gs_mean['param_learn_rate'].iloc[0])
            args.l2 = float(gs_mean['param_l2'].iloc[0])
            args.dropout = float(gs_mean['param_dropout'].iloc[0])
            args.filters = int(gs_mean['param_filters'].iloc[0])
            args.dense = int(gs_mean['param_dense'].iloc[0])
            args.activation = gs_mean['param_activation'].iloc[0]
            args.optimizer = gs_mean['param_optimizer'].iloc[0]

        print('\n***** Running CNN models ******')
        print('Optimizer: %s\nActivation function:'
              ' %s\nLearning Rate: %.4f\nNumber of kernels: '
              '%i\nL2: %.4f\nDropout: %.4f\nDense nodes: %s\n' %
              (args.optimizer, args.activation, args.learn_rate, args.filters,
               args.l2, args.dropout, args.dense))

        final_results = pd.DataFrame()
        motif_imps = pd.DataFrame()
        kern_imp = []

        for n in range(args.n_reps):
            print("\nReplicate %i/%i" % (n, args.n_reps))
            x, y = downsample(x_all, y_all)
            print(x.shape)

            model, conv2d_layer = make_cnn_model(learn_rate=args.learn_rate,
                                                 optimizer=args.optimizer,
                                                 filters=args.filters,
                                                 dense=args.dense,
                                                 l2=args.l2,
                                                 dropout=args.dropout,
                                                 activation=args.activation)
            #print(model.summary())

            # Step 3: Split training into training2 and validation
            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                stratify=y,
                                                                test_size=0.1)
            x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                              y_train,
                                                              stratify=y_train,
                                                              test_size=0.111)
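            # 10% of the data is held out for testing, then ~11.1% of the rest
            # for validation, giving roughly an 80/10/10 train/val/test split.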
            print('Train on %i, validate on %i, test on %i' %
                  (x_train.shape[0], x_val.shape[0], x_test.shape[0]))

            # Step 4: Define optimizer and early stopping criteria & train
            model.compile(optimizer=args.optimizer,
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            earlystop_callback = EarlyStopping(monitor='val_loss',
                                               mode='min',
                                               min_delta=args.min_delta,
                                               patience=args.patience,
                                               restore_best_weights=True,
                                               verbose=0)

            model.fit(x_train,
                      y_train,
                      batch_size=50,
                      epochs=1000,
                      verbose=0,
                      callbacks=[earlystop_callback],
                      validation_data=(x_val, y_val))

            train_loss, train_acc = model.evaluate(x_train, y_train)
            val_loss, val_acc = model.evaluate(x_val, y_val)
            test_loss, test_acc = model.evaluate(x_test, y_test)

            val_yhat = model.predict(x_val)
            max_f1 = 0
            best_thresh = 0
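            # Sweep decision thresholds on the validation set and keep the one
            # that maximizes F1 for the positive class.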
            for thr in np.arange(0.01, 1, 0.01):
                thr_pred = val_yhat.copy()
                thr_pred[thr_pred >= thr] = 1
                thr_pred[thr_pred < thr] = 0
                # Skip thresholds where (almost) all predictions are negative,
                # since F1 and auROC are undefined in that case.
                if sum(thr_pred) > 1:
                    # F1 for the positive class
                    f1 = f1_score(y_val, thr_pred, pos_label=1)
                    if f1 >= max_f1:
                        max_f1 = f1
                        best_thresh = thr
            print('Threshold for F1 measure: %.3f' % best_thresh)

            # Calculate AUC-ROC and F-measure from train, val, and test.
            yhat_train = model.predict(x_train)
            train_auroc = roc_auc_score(y_train, yhat_train)
            yhat_train[yhat_train >= best_thresh] = 1
            yhat_train[yhat_train < best_thresh] = 0
            train_f1 = f1_score(y_train, yhat_train, pos_label=1)

            yhat_val = model.predict(x_val)
            val_auroc = roc_auc_score(y_val, yhat_val)
            yhat_val[yhat_val >= best_thresh] = 1
            yhat_val[yhat_val < best_thresh] = 0
            val_f1 = f1_score(y_val, yhat_val, pos_label=1)

            yhat_test = model.predict(x_test)
            test_auroc = roc_auc_score(y_test, yhat_test)
            yhat_test[yhat_test >= best_thresh] = 1
            yhat_test[yhat_test < best_thresh] = 0
            test_f1 = f1_score(y_test, yhat_test, pos_label=1)

            if args.save_model.lower() in ['t', 'true']:
                model.save(args.save + '_model_' + str(n) + '.h5')

            final_results = final_results.append(
                {
                    'ID': args.save,
                    'Tag': args.tag,
                    'Rep': n,
                    'X_file': args.x,
                    'Y_file': args.y,
                    'ActFun': args.activation,
                    'dropout': args.dropout,
                    'L2': args.l2,
                    'LearnRate': args.learn_rate,
                    'Optimizer': args.optimizer,
                    'n_Kernels': args.filters,
                    'F1_threshold': best_thresh,
                    'n_Dense': args.dense,
                    'Acc_train': train_acc,
                    'Loss_train': train_loss,
                    'auROC_train': train_auroc,
                    'F1_train': train_f1,
                    'Acc_val': val_acc,
                    'Loss_val': val_loss,
                    'auROC_val': val_auroc,
                    'F1_val': val_f1,
                    'Acc_test': test_acc,
                    'Loss_test': test_loss,
                    'auROC_test': test_auroc,
                    'F1_test': test_f1
                },
                ignore_index=True)

            ##########################
            ## Model Interpretation ##
            ##########################

            if (args.imp_m.lower() in ['t', 'true']
                    or args.imp_k.lower() in ['t', 'true']):
                # Step 1: Read in x data meta data
                key = pd.read_csv(
                    args.original_df,
                    sep='\t',
                    index_col=0,
                )
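                # Column names presumably follow an '<omic>_<motif>' pattern;
                # split them into a (motif, omic) MultiIndex grouped by motif.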
                key_index_list = key.columns.str.split('_', expand=True).values
                key.columns = pd.MultiIndex.from_tuples([
                    (x[1], x[0]) for x in key_index_list
                ])
                key = key.sort_index(axis=1)
                motifs = key.columns.levels[0].values
                omic_stack = list(key[list(key.columns.levels[0])[0]])
                omic_stack.append('PA')

                # Calculate Motif importance (zero-out-each-feature)
                if args.imp_m.lower() in ['t', 'true']:
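                    # Motif importance: zero out one input column (motif) at a
                    # time and record the drop in test-set auROC.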
                    motif_imp = np.empty((0, 2))
                    model_mot_imp = model
                    for mx in range(0, x_test.shape[2] - 1):
                        x_test_tmp = np.copy(x_test)
                        x_test_tmp[:, ..., mx, :] = 0
                        yhat_m_imp = model_mot_imp.predict(x_test_tmp)
                        auroc_m_imp = roc_auc_score(y_test, yhat_m_imp)
                        imp_m_auc = test_auroc - auroc_m_imp
                        motif_imp = np.vstack(
                            (motif_imp, np.array([motifs[mx], imp_m_auc])))
                    motif_imp = pd.DataFrame(
                        motif_imp, columns=['motif', 'auROC_test_decrease'])
                    if n == 0:
                        motif_imps = motif_imp
                    else:
                        motif_imps = pd.merge(motif_imps,
                                              motif_imp,
                                              on='motif')

                # Calculate Kernel Importance (zero-out-weights)
                if args.imp_k.lower() in ['t', 'true']:
                    all_weights = model.get_weights()
                    all_weights_2 = all_weights.copy()
                    print(
                        'Performing Leave-One-Kernel-Out importance analysis...'
                    )
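                    # Kernel importance: clone the model, zero out one kernel's
                    # weights, and record the drop in test-set auROC.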
                    for kx in range(0, args.filters):
                        orig_weights = all_weights[0][:, :, 0, kx].copy()
                        orig_weights = orig_weights.tolist()
                        orig_weights = [i for l in orig_weights for i in l]
                        conv2d_drop = copy.deepcopy(all_weights)
                        conv2d_drop[0][:, :, 0, kx] = 0.0
                        print(conv2d_drop[0][1, :, 0, 0:10])
                        model_LOKO = tf.keras.models.clone_model(model)
                        model_LOKO.set_weights(weights=conv2d_drop)
                        yhat_k_imp = model_LOKO.predict(x_test)
                        auroc_k_imp = roc_auc_score(y_test, yhat_k_imp)
                        imp_k_auc = test_auroc - auroc_k_imp
                        old = roc_auc_score(y_test, model.predict(x_test))
                        print(old, imp_k_auc)
                        kern_imp.append([n, imp_k_auc, orig_weights])

        if args.imp_m.lower() in ['t', 'true']:
            print('Snapshot of motif importance scores...')
            motif_imps = motif_imps.set_index('motif')
            motif_imps = motif_imps.apply(pd.to_numeric, errors='coerce')
            motif_imps['mean_imp'] = motif_imps.mean(axis=1)
            motif_imps = motif_imps.sort_values('mean_imp', ascending=False)
            print(motif_imps['mean_imp'].head())
            motif_imps['mean_imp'].to_csv(args.save + "_Motif_imp",
                                          sep="\t",
                                          index=True)

        if args.imp_k.lower() in ['t', 'true']:
            print('\nSnapshot of kernel importance scores:')
            kern_imp = pd.DataFrame(
                kern_imp, columns=['rep', 'auROC_test_decrease', 'kernel'])
            print(kern_imp.head())
            kern_imp.to_csv(args.save + "_Kernel_imp", sep="\t", index=True)

        final_results.to_csv(args.save + "_results.txt", header=True, sep='\t')

        # Save summary of results to RESULTS.txt
        calc_cols = [
            'F1_threshold', 'Acc_train', 'Acc_val', 'Acc_test', 'Loss_train',
            'Loss_val', 'Loss_test', 'auROC_train', 'auROC_val', 'auROC_test',
            'F1_train', 'F1_val', 'F1_test'
        ]
        final_results = final_results.drop(['Rep'], axis=1)
        std = final_results[calc_cols].std(axis=0, skipna=True)
        std = std.add_suffix('_std')
        mean = final_results[calc_cols].mean(axis=0, skipna=True)
        mean = mean.add_suffix('_mean')
        str_cols = final_results.drop(calc_cols, axis=1).iloc[0]
        str_cols = str_cols.append(pd.Series([args.n_reps], index=['Reps']))
        summary = pd.concat([str_cols, mean, std])

        #summary.set_index('index', inplace=True)
        print('\n### Summary of results on test set ###')
        print(summary.filter(like='test_mean', axis=0))
        with open("RESULTS.txt", 'a') as f:
            summary.to_frame().transpose().to_csv(f,
                                                  header=f.tell() == 0,
                                                  sep='\t')

    print('Done!')
Ejemplo n.º 19
0
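# 'best' holds the winning hyperparameter combination from the preceding grid
# search (neurons, activation, dropout_rate, hidden_layers, optimizer,
# batch_size, epochs).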
best = grid_result.best_params_

model = Sequential()
model.add(Dense(best['neurons'], input_dim=1000,
                activation=best['activation']))
model.add(Dropout(best['dropout_rate']))
# Add any additional hidden layers chosen by the search (range(0) adds none)
for i in range(best['hidden_layers']):
    model.add(Dense(best['neurons'], activation=best['activation']))
    model.add(Dropout(best['dropout_rate']))
model.add(Dense(2, kernel_initializer='uniform', activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=best['optimizer'],
              metrics=['accuracy'])

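# Train with the tuned batch size and epoch count, monitoring the held-out
# (x_test, y_test) split after each epoch.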
hist = model.fit(x_train,
                 y_train,
                 batch_size=best['batch_size'],
                 epochs=best['epochs'],
                 validation_data=(x_test, y_test),
                 verbose=2)

## print the final results from unseen data
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: ", score[1])
f.write(20*'-' + '\nAccuracy of best model on unseen data: ' + str(score[1]) + \
    '\n\nParameters used: ' + str(best))
f.close()