Example #1
def build_model(**parameters):
    sequence_len = parameters['sequence_len']
    dropout_01 = parameters['dropout_01']
    dropout_02 = parameters['dropout_02']
    dev_df, test_df, val_df = build_test_validation_df()

    if VERBOSE:
        print(dev_df.head())

    train_x, train_y = process_sb_df(dev_df, sequence_len)
    test_x, test_y = process_sb_df(test_df, sequence_len)

    print("train sells: ", train_y.count(0), ", holds: ", train_y.count(1),
          ", buys: ", train_y.count(2))
    print("test sells: ", test_y.count(0), ", holds:", test_y.count(1),
          ", buys: ", test_y.count(2))
    print("sequence_len: ", sequence_len, " ,dropout_01: ", dropout_01,
          " ,dropout_02: ", dropout_02)

    input_shape = (train_x.shape[1:])
    # Train model
    from keras.wrappers.scikit_learn import KerasClassifier
    model = KerasClassifier(build_fn=create_model,
                            input_shape=input_shape,
                            dropout_01=dropout_01,
                            dropout_02=dropout_02,
                            epochs=10,
                            batch_size=128,
                            verbose=1)

    score = -999.0
    if RUN_WITH_BEST_PARAMETERS:
        model.fit(train_x, train_y)
        # the scikit-learn wrapper has no save(); persist the underlying Keras model
        model.model.save(MODEL_PATH)
    else:
        from sklearn.model_selection import cross_val_score
        score = np.mean(cross_val_score(model, train_x, train_y, cv=3))

    print("average score: ", score)

    return -score
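
# Hypothetical usage sketch (not part of the original snippet): build_model returns the negated
# mean CV score, so a simple minimiser can search over candidate parameter dicts.
# The parameter values below are purely illustrative.
candidate_params = [
    {'sequence_len': 30, 'dropout_01': 0.2, 'dropout_02': 0.2},
    {'sequence_len': 60, 'dropout_01': 0.3, 'dropout_02': 0.1},
]
best_params = min(candidate_params, key=lambda p: build_model(**p))
print("best parameters found:", best_params)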
Example #2
	grid_result = grid.fit(base_model, outputs)

	# summarize results
	print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
	# grid_scores_ was removed from scikit-learn; read cv_results_ instead
	cvres = grid_result.cv_results_
	for mean, stdev, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
	    print("%f (%f) with: %r" % (mean, stdev, params))

    


    early_stopping = EarlyStopping(patience=20)
    checkpointer = ModelCheckpoint('inception_resnet_bottleneck_drug_best.h5', verbose=1, save_best_only=True)

    ImageFile.LOAD_TRUNCATED_IMAGES = True

    model.fit_generator(batches, steps_per_epoch=num_train_steps, epochs=1000, callbacks=[early_stopping, checkpointer], validation_data=val_batches, validation_steps=num_valid_steps)
    model.save_weights('inception_resnet_bottleneck_drug_weights.h5')
    model.save('inception_resnet_bottleneck_drug.h5')

#     for layer in model.layers[-31:]:
#         layer.trainable=True
#     for layer in model.layers[:-31]:
#         layer.trainable=False

#     model.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), metrics=['accuracy'])

#     checkpointer = ModelCheckpoint('./resnet50_best_safety.h5', verbose=1, save_best_only=True)

#     model.fit_generator(batches, steps_per_epoch=num_train_steps, epochs=1000, callbacks=[early_stopping, checkpointer], validation_data=val_batches, validation_steps=num_valid_steps)
#     model.save('resnet50_safety.h5')
Example #3
# print(model.summary())

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)
model.save('spam_model.h5')
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score)
print(acc)
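
# Reuse sketch (an assumption, not in the original): reload the saved spam model and score a few
# held-out sequences; this presumes the same Tokenizer/pad_sequences preprocessing that built X.
from keras.models import load_model
reloaded = load_model('spam_model.h5')
print(reloaded.predict(X_test[:5]))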

import pandas as pd

import keras
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
Example #4
    classifier = KerasClassifier(build_fn=build_classifier)
    parameters = {
        'batch_size': [40, 80],
        'epochs': [300, 600],
        'optimizer': ['adam', 'rmsprop'],
        'hidden_neurons': [9, 10]
    }
    grid = GridSearchCV(estimator=classifier,
                        param_grid=parameters,
                        scoring='accuracy',
                        cv=10)

    grid_search = grid.fit(X_train, y_train)
    best_parameters = grid_search.best_params_

    classifier = build_classifier(best_parameters['optimizer'],
                                  best_parameters['hidden_neurons'])
    classifier.fit(X_train,
                   y_train,
                   epochs=best_parameters['epochs'],
                   batch_size=best_parameters['batch_size'])

    classifier.save("classifier.h5")
    print("Saved model to disk")

    print("Best parameters: {}".format(best_parameters))
    best_accuracy = grid_search.best_score_
    print("Best accuracy: {}".format(best_accuracy))

    # Can load model with model = load_model(filename)
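
    # Reload sketch (an assumption, not in the original): check that the saved classifier
    # round-trips and still predicts on data prepared like X_train.
    from keras.models import load_model
    reloaded_classifier = load_model("classifier.h5")
    print(reloaded_classifier.predict(X_train[:5]))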
Example #5
# summarize results
#print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# create model
model = build_model(x_train.shape[1:], y_train.shape[-1], activation[0],
                    learn_rate[0], loss[0], optimizer[0], drop_rate[0])

print('built model..')

# Data Augmentation
datagen = ImageDataGenerator(featurewise_center=True,
                             featurewise_std_normalization=True,
                             rotation_range=0.0,
                             fill_mode='nearest',
                             horizontal_flip=True,
                             vertical_flip=True,
                             rescale=1. / 255,
                             preprocessing_function=None,
                             validation_split=0.25)

datagen.fit(x_train)

model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size[0]),
                    epochs=epochs[0])

# Save model with weights
model.save(data_dir + '/models/yelp_model.h5')

score = model.evaluate(x_test, y_test)
print('Model loss:', score)
#print('Model accuracy:', score)
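
# Note (sketch, not from the original): validation_split=0.25 on the generator above only takes
# effect when flow() is asked for a subset, e.g.
#   train_flow = datagen.flow(x_train, y_train, batch_size=batch_size[0], subset='training')
#   val_flow = datagen.flow(x_train, y_train, batch_size=batch_size[0], subset='validation')
#   model.fit_generator(train_flow, epochs=epochs[0], validation_data=val_flow)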
Example #6
                             epochs=runEpoch)
 

if(os.access(modelName, os.F_OK)):
    classifier=load_model(modelName)

classifier.fit(X_train, y_train, batch_size=BS, epochs=runEpoch, class_weight=class_weights, validation_data=(X_test, y_test), verbose=2)
  
y_predict=classifier.predict(X_test,batch_size=BS)
y_predict =  [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict)<0.5,0,1)
 
precision = precision_score(y_test, y_predict, average='macro') 
recall = recall_score(y_test,y_predict, average='macro') 
print ("Precision:", precision) 
print ("Recall:", recall) 

# note: this rebinds the name `confusion_matrix` from the sklearn function to the result array
confusion_matrix = confusion_matrix(y_test, y_predict)
print(confusion_matrix)
 

precision_p = float(confusion_matrix[1][1])/float((confusion_matrix[0][1] + confusion_matrix[1][1]))
recall_p = float(confusion_matrix[1][1])/float((confusion_matrix[1][0] + confusion_matrix[1][1]))
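
# With sklearn's confusion_matrix (rows = true class, columns = predicted class),
# confusion_matrix[1][1] is TP, confusion_matrix[0][1] is FP and confusion_matrix[1][0] is FN,
# so the two lines above compute precision and recall for the positive class.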

if(os.access(modelName, os.F_OK)):
    print(classifier.summary()) 
    classifier.save(modelName)
else:
    print(classifier.model.summary())
    classifier.model.save(modelName)
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])

history = model.fit(X_train,
                    y_train,
                    epochs=35,
                    batch_size=150, 
                    validation_data=(X_val, y_val))

model.save('best_model_imag_3clases.h5')

Y_pred_proba=model.predict(X_test)
Y_pred=model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)
# Result: 75% accuracy on the test set

matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)

labels_grouped=np.array(["Musica popular", "Musica melodica", "Musica ritmica"])
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels_grouped,
                      title='Confusion matrix, without normalization')


# Plot the loss (error) as a function of epochs
Example #8
    classifier = Sequential()
    classifier.add(Embedding(max_features, output_dim=256))
    classifier.add(LSTM(128))
    classifier.add(Dropout(0.3))
    classifier.add(Dense(1, activation='sigmoid'))

    classifier.compile(loss='binary_crossentropy',
              optimizer='Adam', #'rmsprop',
              metrics=['accuracy'])

    return classifier


# Now create the scikit-learn classifier object that wraps the builder function defined above
classifier = KerasClassifier(build_fn= classifier_builder,
                             batch_size = 1024,
                             epochs=runEpoch)  #10)

if(os.access("lstm_model.h5", os.F_OK)):
    classifier=load_model('lstm_model.h5')
hist=classifier.fit(X_train, y_train, batch_size=1024, epochs=runEpoch)
print(hist.history)

if(os.access("lstm_model.h5", os.F_OK)):
    print(classifier.summary()) 
    classifier.save('lstm_model.h5')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_model.h5')
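
# Sketch (an assumption, not in the original): both branches above end up calling summary()/save()
# on a plain Keras model, so a compact equivalent would be:
#   keras_model = getattr(classifier, 'model', classifier)
#   print(keras_model.summary())
#   keras_model.save('lstm_model.h5')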

    print(valid_x_data.shape, valid_y_data.shape)

    if args.tunning:
        model = KerasClassifier(build_fn=tunnel_model,
                                input_shape=(50, maxlen),
                                clear_session=True)
        parameter_tunning(model, train_x_data, train_y_data)
        sys.exit(0)

    model = build_model_graph(input_shape=(charmap_size, maxlen),
                              model='lstm_model_endgame')

    #checkpointer = ModelCheckpoint(filepath='/tmp/weights.model',
    #                               verbose=1, monitor='val_acc',
    #                               save_best_only=True)

    train_model(model,
                train_x_data,
                train_y_data,
                validation_data=(valid_x_data, valid_y_data),
                batch_size=256,
                epochs=30,
                with_weights=False)  #, checkpointer=checkpointer)

    if args.save_model:
        model.save(args.save_model)

    test_x_data, test_y_data = pickle.load(open('test_data.pkl', 'rb'))
    # test_x_data, test_y_data = pickle.load(open('train_data.pkl', 'rb'))
    test_binary_model(model, test_x_data, test_y_data, threshold)
data = pd.read_csv("KDDTrain+.txt")
data.drop(data.columns[[
    0, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 32, 34,
    39, 40, 42
]],
          axis=1,
          inplace=True)

le = LabelEncoder()

data['protocol_type'] = le.fit_transform(data['protocol_type'])
data['service'] = le.fit_transform(data['service'])
data['flag'] = le.fit_transform(data['flag'])
data['a_class'] = le.fit_transform(data['a_class'])

values = data.iloc[:, 0:20].values
labels = data.iloc[:, 20].values

scaler = MinMaxScaler(feature_range=(0, 1))
rescaledValues = scaler.fit_transform(values)

model = KerasClassifier(build_fn=create_model,
                        epochs=100,
                        batch_size=10,
                        verbose=1)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, rescaledValues, labels, cv=kfold)
# cross_val_score only fits clones, and the KerasClassifier wrapper has no save();
# fit the wrapper once and save the underlying Keras model instead
model.fit(rescaledValues, labels)
model.model.save('my_model.h5')
print(results.mean())
Example #11
                             epochs=runEpoch)

if (os.access("lstm_reshape_5_256.md", os.F_OK)):
    classifier = load_model('lstm_reshape_5_256.md')

classifier.fit(X_train,
               y_train,
               batch_size=BS,
               epochs=runEpoch,
               class_weight=class_weights,
               validation_data=(X_test, y_test))

y_predict = classifier.predict(X_test, batch_size=BS)
y_predict = [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)

precision = precision_score(y_test, y_predict, average='macro')
recall = recall_score(y_test, y_predict, average='macro')
print("Precision:", precision)
print("Recall:", recall)

# note: this rebinds the name `confusion_matrix` from the sklearn function to the result array
confusion_matrix = confusion_matrix(y_test, y_predict)
print(confusion_matrix)

if (os.access("lstm_reshape_5.md", os.F_OK)):
    print(classifier.summary())
    classifier.save('lstm_reshape_5_256.md')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_reshape_5_256.md')
Example #12
        
        file_name_base="lstm_model.h5"
        #num to string 
        str_runLoop='%d' %runLoop     
        str_batch_size='%d' %batch_size
        file_name=str_runLoop+"_"+str_batch_size+"_"+file_name_base

        if(os.access(file_name, os.F_OK)):
            classifier=load_model(file_name)

        for i in range(0, runLoop):
            hist=classifier.fit(X_train, y_train, batch_size=batch_size, epochs=runEpoch, class_weight=class_weights, validation_data=(X_test, y_test))
            print "loop i=", i, "hist:", hist.history
            y_predict=classifier.predict(X_test,batch_size=batch_size)
            y_predict =  [j[0] for j in y_predict]
            y_predict = np.where(np.array(y_predict)<0.5,0,1)
        	
            precision = precision_score(y_test, y_predict, average='macro') 
            recall = recall_score(y_test,y_predict, average='macro') 
            print ("Precision:", precision) 
            print ("Recall:", recall) 

        if(os.access(file_name, os.F_OK)):
            print(classifier.summary()) 
            classifier.save(file_name)
        else:
            print(classifier.model.summary())
            classifier.model.save(file_name)
        print("End----------")
        print()
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

history = model.fit(X_train,
                    y_train,
                    epochs=35,
                    batch_size=120, 
                    validation_data=(X_val, y_val))

model.save('best_model_feat_10classes.h5')

Y_pred_proba=model.predict(X_test)
Y_pred=model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)
# Result: 69% accuracy on the test set

matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)

labels=np.array(clases)
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels,
                      title='Confusion matrix, without normalization')


# Plot the loss (error) as a function of epochs
Example #14
# encoder = Model(input = input_dim, output = encoded)
# encoded_input = Input(shape = (encoding_dim, ))
# X_test_encoded = encoder.predict(sX)
# encoder.save('my_encoder.h5')

# Dimensionality reduction (TruncatedSVD / latent semantic analysis; close to, but not exactly, PCA)
print('PCA start')
svd = TruncatedSVD(n_components=500, n_iter=7, random_state=42)
svd.fit(X_train_tfidf)
X_train_tfidf = svd.transform(X_train_tfidf)
print('PCA done')

# split data into training and testing

# from sklearn.cross_validation import train_test_split
# from sklearn.model_selection import train_test_split
# X_train , X_test , y_train ,y_test= train_test_split(X_train_tfidf,y,test_size=0.5)

estimator = KerasClassifier(build_fn=baseline_model,
                            epochs=200,
                            batch_size=5,
                            verbose=0)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_val_score(estimator, X_train_tfidf, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))

# cross_val_score only fits clones, and the KerasClassifier wrapper has no save();
# fit the wrapper once and save the underlying Keras model instead
estimator.fit(X_train_tfidf, y)
estimator.model.save('my_dnn_model.h5')
Example #15
          kernel_regularizer=regularizers.l2(0.001))(L)
print('Dense layer is:', L)

model = Model(inputs=sequence_input, outputs=L)

# Optimization and compile
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
print('Begin compiling...')
model.compile(loss='categorical_crossentropy', 
              optimizer=opt, 
              metrics=['accuracy'])
model.summary()

# Begin training
model.fit(data_train, 
          Y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          verbose=2,
          validation_data=(data_val, Y_val))
score = model.evaluate(data_test, Y_test, batch_size=batch_size)
print ('The evaluation is: ', score)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)


# Save model
print ('Saving model...')
model.save('CNN-LSTM-Turkish corpus-200d')
Example #16
               validation_data=(X_test, y_test),
               verbose=2)

bk.set_learning_phase(0)  # switch to test/inference phase
print("After set ", bk.learning_phase())

y_predict = classifier.predict(X_test, batch_size=BS)
y_predict = [j[0] for j in y_predict]
y_predict = np.where(np.array(y_predict) < 0.5, 0, 1)

# note: this rebinds the name `confusion_matrix` from the sklearn function to the result array
confusion_matrix = confusion_matrix(y_test, y_predict)
print(confusion_matrix)

precision_p = float(confusion_matrix[1][1]) / float(
    (confusion_matrix[0][1] + confusion_matrix[1][1]))
recall_p = float(confusion_matrix[1][1]) / float(
    (confusion_matrix[1][0] + confusion_matrix[1][1]))

print("Precision:", precision_p)
print("Recall:", recall_p)

if (os.access("lstm_lxr.md", os.F_OK)):
    print(classifier.summary())
    classifier.save('lstm_lxr.md')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_lxr.md')

#('Precision:', 0.9660511363636364)
#('Recall:', 0.9601863617111394)
Example #17
def main():
    print('Using Keras version: ', keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path', help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'

    hyp_param_scan=args.hyp_param_scan
    # Set model hyper-parameters
    weights='BalanceYields'# 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split=0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size=200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size=400

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix,weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file,'w')
    additional_hyperparams.write("optimizer: "+optimizer+"\n")
    additional_hyperparams.write("learn_rate: "+str(learn_rate)+"\n")
    additional_hyperparams.write("epochs: "+str(epochs)+"\n")
    additional_hyperparams.write("validation_split: "+str(validation_split)+"\n")
    additional_hyperparams.write("weights: "+weights+"\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory,'plots/')
    input_var_jsonFile = open('input_variables.json','r')
    selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'
    # selection_criteria = '(AtLeast4GoodJets0Lep==1)'
    # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)'
    #selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile,encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key,var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Create instance of the input files directory
    #inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars'
    inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples'
    #inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/'
    #inputs_file_path = 'PromptPromptApplied/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' %(output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path,column_headers,selection_criteria)
        # Change sentinel value to speed up training.
        data = data.replace(to_replace=-999.000000,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independant training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory+'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)


    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'weight']
    weights_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'weight']
    weights_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'weight']
    weights_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'weight']
    weights_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'weight']
    weights_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'weight']

    HHsum_weighted= sum(weights_for_HH)
    GJetsum_weighted= sum(weights_for_GJet)
    DiPhotonsum_weighted= sum(weights_for_DiPhoton)
    TTGGsum_weighted= sum(weights_for_TTGG)
    TTGJetssum_weighted= sum(weights_for_TTGJets)
    TTJetssum_weighted= sum(weights_for_TTJets)
    WJetssum_weighted= sum(weights_for_WJets)
    ttHsum_weighted= sum(weights_for_ttH)
    DYsum_weighted= sum(weights_for_DY)
    #bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted+ttHsum_weighted
    bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'unweighted']
    nevents_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'unweighted']
    nevents_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'unweighted']
    nevents_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'unweighted']
    nevents_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'unweighted']
    nevents_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'unweighted']

    HHsum_unweighted= sum(nevents_for_HH)
    GJetsum_unweighted= sum(nevents_for_GJet)
    DiPhotonsum_unweighted= sum(nevents_for_DiPhoton)
    TTGGsum_unweighted= sum(nevents_for_TTGG)
    TTGJetssum_unweighted= sum(nevents_for_TTGJets)
    TTJetssum_unweighted= sum(nevents_for_TTJets)
    WJetssum_unweighted= sum(nevents_for_WJets)
    ttHsum_unweighted= sum(nevents_for_ttH)
    DYsum_unweighted= sum(nevents_for_DY)

    #bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted+ttHsum_unweighted
    bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted


    if weights=='BalanceYields':
        print('HHsum_weighted= ' , HHsum_weighted)
        print('ttHsum_weighted= ' , ttHsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('WJetssum_weighted= ', WJetssum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_weighted/bckgsum_weighted)

    if weights=='BalanceNonWeighted':
        print('HHsum_unweighted= ' , HHsum_unweighted)
        print('ttHsum_unweighted= ' , ttHsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('WJetssum_unweighted= ', WJetssum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' %(output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i+"\n"
        pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #trainingweights = traindataset.loc[:,'classbalance']#*traindataset.loc[:,'weight']
    #trainingweights = np.array(trainingweights)

    # Temp hack to be able to change class weights without remaking dataframe
    #for inde in xrange(len(trainingweights)):
    #    newweight = 13243.0/6306.0
    #    trainingweights[inde]= newweight
    #print 'training event weight = ', trainingweights[0]

    # Event weight calculation so we can correctly apply event weights to diagnostic plots.
    # Use a separate list because we don't want to apply class weights in the plots.
    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights=='BalanceYields':
        trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight']
    if weights=='BalanceNonWeighted':
        trainingweights = traindataset.loc[:,'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.pdf'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name,'a')
            time_str = str(time.localtime())+'\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates=[0.00001, 0.0001]
            epochs = [150,200]
            batch_size = [400,500]
            param_grid = dict(learn_rate=learn_rates,epochs=epochs,batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model,verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train,Y_train,shuffle=True,sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_,grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n" %(grid_result.best_score_,grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean,stdev,param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean,stdev,param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1)
            model = baseline_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,Y_train,validation_split=validation_split,epochs=epochs,batch_size=batch_size,verbose=1,shuffle=True,sample_weight=trainingweights,callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch.png'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
    else:
        model_name = os.path.join(output_directory,'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory,'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory,'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory,'model_serialised.json')
    with open(model_json_name,'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,'model_schematic.eps')
    print "DEBUG: ",model_schematic_name
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    '''
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')
    '''

    # Make overfitting plots of output nodes
    Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)
    print "DEBUG: Y_train shape: ",Y_train.shape

    # # Get true process integers for training dataset
    # original_encoded_train_Y = []
    # for i in xrange(len(result_probs)):
    #     if Y_train[i][0] == 1:
    #         original_encoded_train_Y.append(0)
    #     if Y_train[i][1] == 1:
    #         original_encoded_train_Y.append(1)
    #     if Y_train[i][2] == 1:
    #         original_encoded_train_Y.append(2)
    #     if Y_train[i][3] == 1:
    #         original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    # result_classes_test = newencoder.inverse_transform(result_classes_test)
    # result_classes_train = newencoder.inverse_transform(result_classes)
    e = shap.DeepExplainer(model, X_train[:400, ])
    shap_values = e.shap_values(X_test[:400, ])
    Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ], shap_values=shap_values, column_headers=column_headers)
    Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400,], shap_values=shap_values, column_headers=column_headers)
    #e = shap.GradientExplainer(model, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #e = shap.KernelExplainer(model.predict, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ],shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)

    # Create confusion matrices for training and testing performance
    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1 , 'BinaryClassifierROC',train_weights, test_weights)
Example #18
    'optimizer': ['rmsprop'],
    'neurons': [5, 6, 7],
    'n_layer': [3]
}
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

# evaluation (k-fold crossvalidation included in grid search)
#classifier = KerasClassifier(build_fn = make_my_classifier, batch_size = 32, epochs = 250)
#accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
#mean = accuracies.mean()
#variance = accuracies.std()

#------------------------------------------------------------------------------
# Predictions
#------------------------------------------------------------------------------
prediction = classifier.predict(X_test)
prediction = (prediction > 0.5)
cm = confusion_matrix(y_test, prediction)

#------------------------------------------------------------------------------
# Save and/or load model
#------------------------------------------------------------------------------
classifier.save('wine_good_or_nah.h5', overwrite=True)
classifier = km.load_model('wine_good_or_nah.h5')
Example #19
    ann_classifier.add(
        Dense(
            units=1,
            kernel_initializer="uniform",  # output layer
            activation="sigmoid"))
    # pass decay by keyword so it is not mistaken for beta_1
    optimizer = Adam(lr=lr, decay=decay)
    ann_classifier.compile(optimizer=optimizer,
                           loss="binary_crossentropy",
                           metrics=["acc"])
    return ann_classifier


ann_classifier = KerasClassifier(build_fn=create_classifier)

# visualization of the model
# (the KerasClassifier wrapper has no summary(); build one instance of the network directly
#  for inspection -- this assumes create_classifier can be called with default hyper-parameters)
ann_network = create_classifier()
print(ann_network.summary())
plot_model(ann_network,
           to_file='ann_classifier_plot.png',
           show_shapes=True,
           show_layer_names=True)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=ann_classifier,
                    param_grid=param_grid,
                    scoring="accuracy",
                    cv=kfold,
                    verbose=1)

grid_results = grid.fit(X=x_train_scaled, y=y_train)
# the KerasClassifier wrapper has no save(); persist the best fitted Keras model from the search
grid_results.best_estimator_.model.save("ann_adam.h5")
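# Sketch: report what the scan found using standard GridSearchCV attributes
print("Best: %f using %s" % (grid_results.best_score_, grid_results.best_params_))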
Example #20
                           param_grid=parameters,
                           scoring="accuracy",
                           cv=10,
                           n_jobs=-1)

grid_search = grid_search.fit(X_train, y_train)

## Getting best parameters from the GridSearchCV
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

## Building and Fitting the ANN with bests parameters
classifier = build_classifier(optimizer=best_parameters.get("optimizer"),
                              nb_layers=best_parameters.get("nb_layers"),
                              dropout=best_parameters.get("dropout"))
classifier.fit(X_train,
               y_train,
               batch_size=best_parameters.get("batch_size"),
               epochs=best_parameters.get("epochs"))

## Saving the model for reuse
classifier.save("churn.h5")

## Making the predictions and evaluating the model
y_pred = classifier.predict(X_test)
y_pred = (y_pred > .5)

cm = confusion_matrix(y_test, y_pred)

accuracy = (cm[0, 0] + cm[1, 1]) / X_test.shape[0]
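# Equivalent check (sketch): sklearn.metrics.accuracy_score(y_test, y_pred) gives the same value,
# since cm[0, 0] + cm[1, 1] is the number of correct predictions.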
Example #21
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


## Accuracy of 86.5% confirmed



#### SAVING/LOADING MODEL:
        
classifier.save('my_classifier.h5')        


#classifier = load_model('my_classifier.h5')



### SINGLE PREDICTION

# two pairs of square brackets + feature scaling (transform only - do NOT re-fit the scaler)
# to avoid a dtype warning, make one element a float

#john = np.array([[0.0,1,555,1, 51,5, 1550000, 5, 1, 1, 120000]])
#john_transform = sc.transform(john)
#
#
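
# Completing the commented sketch above (a hypothetical example: feature values are illustrative,
# `sc` is the scaler already fitted on the training data and numpy is imported as np):
john = np.array([[0.0, 1, 555, 1, 51, 5, 1550000, 5, 1, 1, 120000]])
john_scaled = sc.transform(john)  # transform only - never re-fit the scaler on new data
print(classifier.predict(john_scaled) > 0.5)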
Example #22
          kernel_regularizer=regularizers.l2(0.001))(L)
print('Dense layer is:', L)

model = Model(inputs=sequence_input, outputs=L)

# Optimization and compile
opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
print('Begin compiling...')
model.compile(loss='categorical_crossentropy', 
              optimizer=opt, 
              metrics=['accuracy'])
model.summary()

# Begin training
model.fit(data_train, 
          Y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          verbose=2,
          validation_data=(data_val, Y_val))
score = model.evaluate(data_test, Y_test, batch_size=batch_size)
print ('The evaluation is: ', score)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)


# Save model
print ('Saving model...')
model.save('CNN-GRU-Turkish corpus-200d')
Example #23
def main():
    print('Using Keras version: ', keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=1,
        type=int)
    parser.add_argument('-s',
                        '--suff',
                        dest='suffix',
                        help='Option to choose suffix for training',
                        default='',
                        type=str)
    parser.add_argument('-p',
                        '--para',
                        dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan',
                        default=0,
                        type=int)
    parser.add_argument(
        '-i',
        '--inputs_file_path',
        dest='inputs_file_path',
        help=
        'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
        default='',
        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/'

    hyp_param_scan = args.hyp_param_scan
    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 32
        #epochs = 10
        #batch_size=200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,
                                   'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) +
                                 "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' %
              (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' %
              (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.mask(data < -25., -9.)
        #data = data.replace(to_replace=-99.,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" %
          (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independant training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'weight']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'weight']

    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted
    #bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'unweighted']

    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted
    #bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted

    HHsum_weighted = 2 * HHsum_weighted
    HHsum_unweighted = 2 * HHsum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = 1.
        for bkg_process in ['Hgg', 'DiPhoton', 'GJet', 'QCD', 'DY',
                            'TTGsJets', 'WGsJets', 'WW']:
            traindataset.loc[traindataset['process_ID'] == bkg_process,
                             ['classweight']] = (HHsum_unweighted /
                                                 bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    # Record the order of the training features so it can be reproduced later.
    column_order_txt = '%s/column_order.txt' % (output_directory)
    with open(column_order_txt, "wb") as column_order_file:
        for tc_i in training_columns:
            line = tc_i + "\n"
            pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:,
                                           'classweight'] * traindataset.loc[:,
                                                                             'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)
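    # Worked illustration (hypothetical numbers): under 'BalanceYields' an HH
    # event with MC weight 0.4 and classweight HHsum_unweighted/HHsum_weighted
    # = 2.5 enters training with sample weight 0.4 * 2.5 = 1.0, while a QCD
    # event with weight 0.1 and classweight HHsum_unweighted/bckgsum_weighted
    # = 0.05 contributes 0.1 * 0.05 = 0.005; summed per class, the total
    # background weight then matches the total signal weight.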

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates,
                              epochs=epochs,
                              batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1)
            grid_result = grid.fit(X_train,
                                   Y_train,
                                   shuffle=True,
                                   sample_weight=trainingweights)
            print("Best score: %f , best params: %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write(
                "Best score: %f , best params: %s\n" %
                (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" %
                      (mean, stdev, param))
                hyp_param_scan_results.write(
                    "Mean (stdev) test score: %f (%f) with parameters: %r\n" %
                    (mean, stdev, param))
            hyp_param_scan_results.close()
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100,
                                                   monitor='val_loss',
                                                   min_delta=0.01,
                                                   verbose=1)
            #model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = number of examples processed before each weight update
            # (larger batches mean fewer, larger updates per epoch).
            # Epoch = one full pass over the training data (a convenient unit
            # for periodic logging and evaluation).
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,
                                Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch'
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.png')
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.pdf')

            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
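
A minimal, self-contained sketch (not part of the snippet above; process names
and numbers are purely illustrative) of the 'BalanceYields' class-weighting
idea: each background process is scaled so that the summed background weight
matches the unweighted signal yield once the per-event weights are applied.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'process_ID': ['HH', 'HH', 'DiPhoton', 'QCD', 'QCD'],
    'weight':     [0.5,  0.5,  2.0,        1.0,   1.0],
})
hh_mask = df['process_ID'] == 'HH'
HHsum_unweighted = hh_mask.sum()                     # 2 signal events
HHsum_weighted = df.loc[hh_mask, 'weight'].sum()     # 1.0
bckgsum_weighted = df.loc[~hh_mask, 'weight'].sum()  # 4.0

df['classweight'] = np.where(hh_mask,
                             HHsum_unweighted / HHsum_weighted,
                             HHsum_unweighted / bckgsum_weighted)
sample_weight = df['classweight'] * df['weight']
# Signal and background now carry equal total training weight.
assert np.isclose(sample_weight[hh_mask].sum(), sample_weight[~hh_mask].sum())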
Example #24
0
def main():

    ########################
    ### Parse Input Args ###
    ########################
    parser = argparse.ArgumentParser(
        description='CNN classification code implemented using TensorFlow v2.0',
        epilog='https://github.com/azodichr')

    parser.add_argument('-x', help='Feature numpy dataset', required=True)
    parser.add_argument('-y', help='Class/Y numpy dataset', required=True)
    parser.add_argument('-run', help='T/F to run final models', default='t')
    parser.add_argument('-splits',
                        help='Values for train/val/test',
                        default='70,10,20')
    parser.add_argument('-y_name', help='Phenotype Trait')
    parser.add_argument('-f', help='Function: gs, run, full', default='full')
    parser.add_argument('-save', help='Name for Output File', default='test')
    parser.add_argument('-balance',
                        help='t/f to downsample to balance classes',
                        default='t')
    parser.add_argument('-n_channels',
                        help='Number of channels',
                        default=1,
                        type=int)
    parser.add_argument('-cv',
                        help='Number of cross validation folds',
                        type=int,
                        default=5)
    parser.add_argument('-n_jobs',
                        '-p',
                        help='Number of processors for '
                        'parallel computing (max for HPCC = 14)',
                        type=int,
                        default=1)
    parser.add_argument('-save_model',
                        help='T/F if want to save final models',
                        type=str,
                        default='f')
    parser.add_argument('-tag',
                        help='Identifier String to add to RESULTS',
                        type=str,
                        default='cnn')
    parser.add_argument('-save_detailed',
                        help='T/F Save detailed model performance',
                        type=str,
                        default='f')
    parser.add_argument('-original_df',
                        help='DF fed into input_converter.py',
                        type=str,
                        default='')
    parser.add_argument('-imp_m',
                        help='T/F to calculate importance of each motif',
                        type=str,
                        default='f')
    parser.add_argument('-imp_k',
                        help='T/F to calculate importance of each kernel',
                        type=str,
                        default='f')

    # Default Hyperparameters
    parser.add_argument('-params',
                        help='Output from -f gs (i.e. '
                        'SAVE_GridSearch.txt)',
                        default='default')
    parser.add_argument('-actfun',
                        help='Activation function. (relu, sigmoid)',
                        default='relu')
    parser.add_argument('-learn_rate',
                        help='Learning Rate',
                        default=0.01,
                        type=float)
    parser.add_argument('-dropout',
                        help='Dropout rate',
                        default=0.25,
                        type=float)
    parser.add_argument('-l2',
                        help='Shrinkage parameter for L2 regularization',
                        default=0.25,
                        type=float)
    parser.add_argument('-filters',
                        help='Number of Kernels/filters',
                        default=8,
                        type=int)
    parser.add_argument('-optimizer',
                        help='Optimization function to use',
                        type=str,
                        default='Adam')
    parser.add_argument('-dense',
                        help='Number of nodes in dense layer',
                        type=int,
                        default=16)
    parser.add_argument('-activation',
                        help='Activation function in all but '
                        'last dense layer, which is set to linear',
                        type=str,
                        default='relu')
    parser.add_argument('-n_reps',
                        '-n',
                        help='Number of replicates (unique '
                        'validation set/starting weights for each)',
                        default=100,
                        type=int)
    parser.add_argument('-clip_value',
                        help='Clip Value',
                        type=float,
                        default=0.5)
    parser.add_argument('-patience',
                        help='Patience for Early Stopping',
                        type=int,
                        default=5)
    parser.add_argument('-min_delta',
                        help='Minimum Delta Value for Early '
                        'Stopping',
                        type=float,
                        default=0)

    # Grid Search reps/space
    parser.add_argument('-gs_reps',
                        '-gs_n',
                        help='Number of Grid Search Reps'
                        '(will append results if SAVE_GridSearch.csv exists)',
                        type=int,
                        default=10)
    parser.add_argument('-actfun_gs',
                        help='Activation functions for Grid '
                        'Search',
                        nargs='*',
                        default=['relu', 'selu', 'elu'])
    parser.add_argument('-dropout_gs',
                        help='Dropout rates for Grid Search',
                        nargs='*',
                        type=float,
                        default=[0.0, 0.1, 0.25])
    parser.add_argument('-l2_gs',
                        help='Shrinkage parameters for L2 for Grid '
                        'Search',
                        nargs='*',
                        type=float,
                        default=[0.01, 0.1, 0.25])
    parser.add_argument('-lrate_gs',
                        help='Learning Rate',
                        nargs='*',
                        type=float,
                        default=[0.1, 0.01, 0.001, 0.0001])
    parser.add_argument('-kernels_gs',
                        help='Number of Kernels for Grid Search',
                        nargs='*',
                        type=int,
                        default=[4, 8, 16, 24])

    args = parser.parse_args()
    k_height = 'tmp'
    args.k_len = 'tmp'

    def downsample(x, y):
        # Balance the two classes by randomly subsampling the majority class
        # down to the size of the minority class.
        unique, counts = np.unique(y, return_counts=True)
        smaller_index = list(counts).index(min(counts))
        bigger_index = list(counts).index(max(counts))

        i_smaller = np.where(y == unique[smaller_index])[0]
        i_bigger = np.where(y == unique[bigger_index])[0]
        downsample_n = len(i_smaller)
        i_bigger_downsampled = np.random.choice(i_bigger,
                                                size=downsample_n,
                                                replace=False)

        i_keep = list(i_smaller) + list(i_bigger_downsampled)
        y = y[i_keep]
        x = x[i_keep]

        return x, y

    def make_cnn_model(learn_rate=args.learn_rate,
                       filters=args.filters,
                       dropout=args.dropout,
                       dense=args.dense,
                       l2=args.l2,
                       activation=args.activation,
                       optimizer=args.optimizer,
                       units=1):

        if optimizer.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(lr=learn_rate,
                                           clipvalue=args.clip_value)
        elif optimizer.lower() == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=learn_rate,
                                            clipvalue=args.clip_value)
        elif optimizer.lower() == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=learn_rate,
                                              clipvalue=args.clip_value)
        elif optimizer.lower() == 'sgdm':
            opt = tf.keras.optimizers.SGD(lr=learn_rate,
                                          decay=1e-6,
                                          clipvalue=args.clip_value,
                                          momentum=0.9,
                                          nesterov=True)
        else:
            raise ValueError('Unsupported optimizer: %s' % optimizer)

        conv2d_layer = layers.Conv2D(
            filters=filters,
            kernel_size=tuple([k_height, 1]),
            kernel_regularizer=tf.keras.regularizers.l2(l2),
            activation=activation,
            kernel_initializer='glorot_normal',
            input_shape=(n_rows, n_columns, args.n_channels),
            name='conv2d_layer')
        K.clear_session()
        model = models.Sequential()
        model.add(conv2d_layer)
        model.add(layers.Flatten())
        model.add(layers.Dense(dense, activation=activation))
        model.add(layers.Dropout(dropout))
        model.add(layers.Dense(units=1, activation='sigmoid'))
        model.compile(optimizer=opt, loss='binary_crossentropy')

        return model

    ##########################
    ### Data preprocessing ###
    ##########################
    x_all = np.load(args.x)
    y_all = np.load(args.y)
    x_all = x_all.reshape(x_all.shape + (args.n_channels, ))

    if args.balance.lower() in ['t', 'true']:
        x, y = downsample(x_all, y_all)

        print('Y shape (down-sampled): %s' % str(y.shape))
        print('X shape (down-sampled): %s' % str(x.shape))
    else:
        y = y_all
        x = x_all

    print("\nSnapshot of feature data for first instance in data set:")
    print(x[0, :, 0:5, 0])
    n = y.shape[0]
    n_rows = x.shape[1]
    n_columns = x.shape[2]

    k_height = x.shape[1]
    args.k_len = 1
    print('Kernel dimensions: ', k_height, args.k_len)

    ###################
    ### Grid Search ###
    ###################

    if args.params.lower() == 'gs':
        print('\n***** Starting Random Search with %i parameter draws using '
              '%i instances and %i-fold cross-validation *****\n' %
              (args.gs_reps, x.shape[0], args.cv))
        scoring = {'acc': 'accuracy', 'f1': 'f1'}
        param_grid = dict(
            learn_rate=[0.1, 0.01, 0.001],
            filters=[8, 16],
            dense=[8, 16, 32],
            l2=[0.1, 0.25],  #, 0.5],
            dropout=[0.1, 0.25],  #, 0.5],
            activation=["relu"],  #, 'selu', 'elu'],
            optimizer=['RMSprop', 'Adam', 'nadam'])
        # build_fn must return the compiled model only.
        model = KerasClassifier(build_fn=make_cnn_model,
                                batch_size=100,
                                epochs=30,
                                verbose=0)
        rand_search = RandomizedSearchCV(estimator=model,
                                         param_distributions=param_grid,
                                         cv=args.cv,
                                         n_iter=args.gs_reps,
                                         n_jobs=args.n_jobs,
                                         scoring=scoring,
                                         refit='acc',
                                         verbose=0)
        gs_result = rand_search.fit(x, y)
        gs_result_df = pd.DataFrame.from_dict(gs_result.cv_results_)

        print("Saving Grid Search Results....")
        print(gs_result_df.head())
        with open(args.save + "_GridSearch.txt", 'a') as out_gs:
            gs_result_df.to_csv(out_gs, header=out_gs.tell() == 0, sep='\t')

        print('\n\n Grid Search results saved to: %s_GridSearch.txt\n' %
              args.save)

    ################
    ### Run final model
    ################

    if args.run.lower() in ['t', 'true']:
        print('####### Running Final Model(s) ###########')

        # Step 1: Define the parameters from the Grid Search or use default
        if args.params.lower() != 'default':
            if args.params.lower() != 'gs':
                gs_result_df = pd.read_csv(args.params, sep='\t')
                gs_result_df.fillna(0, inplace=True)

            gs_mean = gs_result_df.groupby([
                'param_filters', 'param_optimizer', 'param_learn_rate',
                'param_dropout', 'param_l2', 'param_dense', 'param_activation'
            ]).agg({
                'mean_test_acc': 'mean',
                'std_test_acc': 'mean',
                'mean_fit_time': 'count'
            }).reset_index()

            print('Parameter Search Coverage: \nMin: %i\nMean: %3f\nMax: %i' %
                  (gs_mean['mean_fit_time'].min(),
                   gs_mean['mean_fit_time'].mean(),
                   gs_mean['mean_fit_time'].max()))

            if gs_mean['mean_fit_time'].min() == 1:
                print('Dropping parameter combinations with < 2 replicates...')
                gs_mean = gs_mean[gs_mean['mean_fit_time'] >= 2]

            gs_mean = gs_mean.sort_values(by='mean_test_acc', ascending=False)
            print('\nSnapshot of grid search results:')
            print(gs_mean.head())

            args.learn_rate = float(gs_mean['param_learn_rate'].iloc[0])
            args.l2 = float(gs_mean['param_l2'].iloc[0])
            args.dropout = float(gs_mean['param_dropout'].iloc[0])
            args.filters = int(gs_mean['param_filters'].iloc[0])
            args.dense = int(gs_mean['param_dense'].iloc[0])
            args.activation = gs_mean['param_activation'].iloc[0]
            args.optimizer = gs_mean['param_optimizer'].iloc[0]

        print('\n***** Running CNN models ******')
        print('Optimizer: %s\nActivation function:'
              ' %s\nLearning Rate: %4f\nNumber of kernels: '
              '%i\nL2: %4f\nDropout: %4f\nDense nodes: %s\n' %
              (args.optimizer, args.activation, args.learn_rate, args.filters,
               args.l2, args.dropout, args.dense))

        final_results = pd.DataFrame()
        motif_imps = pd.DataFrame()
        kern_imp = []

        for n in range(args.n_reps):
            print("\nReplicate %i/%i" % (n, args.n_reps))
            x, y = downsample(x_all, y_all)
            print(x.shape)

            model = make_cnn_model(learn_rate=args.learn_rate,
                                   optimizer='sgdm',
                                   filters=args.filters,
                                   dense=args.dense,
                                   l2=args.l2,
                                   dropout=args.dropout,
                                   activation=args.activation)
            #print(model.summary())

            # Step 3: Split training into training2 and validation
            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                stratify=y,
                                                                test_size=0.1)
            x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                              y_train,
                                                              stratify=y_train,
                                                              test_size=0.111)
            print('Train on %i, validate on %i, test on %i' %
                  (x_train.shape[0], x_val.shape[0], x_test.shape[0]))

            # Step 4: Define optimizer and early stopping criteria & train
            model.compile(optimizer=args.optimizer,
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            earlystop_callback = EarlyStopping(monitor='val_loss',
                                               mode='min',
                                               min_delta=args.min_delta,
                                               patience=args.patience,
                                               restore_best_weights=True,
                                               verbose=0)

            model.fit(x_train,
                      y_train,
                      batch_size=50,
                      epochs=1000,
                      verbose=0,
                      callbacks=[earlystop_callback],
                      validation_data=(x_val, y_val))

            train_loss, train_acc = model.evaluate(x_train, y_train)
            val_loss, val_acc = model.evaluate(x_val, y_val)
            test_loss, test_acc = model.evaluate(x_test, y_test)

            val_yhat = model.predict(x_val)
            max_f1 = 0
            best_thresh = 0
            for thr in np.arange(0.01, 1, 0.01):
                thr_pred = val_yhat.copy()
                thr_pred[thr_pred >= thr] = 1
                thr_pred[thr_pred < thr] = 0
                # Skip thresholds where essentially all predictions are
                # negative, since F1 (and auROC) would be undefined there.
                if sum(thr_pred) > 1:
                    f1 = f1_score(y_val, thr_pred,
                                  pos_label=1)  # F1 for the positive class
                    if f1 >= max_f1:
                        max_f1 = f1
                        best_thresh = thr
            print('Threshold for F1 measure: %3f' % best_thresh)

            # Calculate AUC-ROC and F-measure from train, val, and test.
            yhat_train = model.predict(x_train)
            train_auroc = roc_auc_score(y_train, yhat_train)
            yhat_train[yhat_train >= best_thresh] = 1
            yhat_train[yhat_train < best_thresh] = 0
            train_f1 = f1_score(y_train, yhat_train, pos_label=1)

            yhat_val = model.predict(x_val)
            val_auroc = roc_auc_score(y_val, yhat_val)
            yhat_val[yhat_val >= best_thresh] = 1
            yhat_val[yhat_val < best_thresh] = 0
            val_f1 = f1_score(y_val, yhat_val, pos_label=1)

            yhat_test = model.predict(x_test)
            test_auroc = roc_auc_score(y_test, yhat_test)
            yhat_test[yhat_test >= best_thresh] = 1
            yhat_test[yhat_test < best_thresh] = 0
            test_f1 = f1_score(y_test, yhat_test, pos_label=1)

            if args.save_model.lower() in ['t', 'true']:
                model.save(args.save + '_model_' + str(n) + '.h5')

            final_results = final_results.append(
                {
                    'ID': args.save,
                    'Tag': args.tag,
                    'Rep': n,
                    'X_file': args.x,
                    'Y_file': args.y,
                    'ActFun': args.activation,
                    'dropout': args.dropout,
                    'L2': args.l2,
                    'LearnRate': args.learn_rate,
                    'Optimizer': args.optimizer,
                    'n_Kernels': args.filters,
                    'F1_threshold': best_thresh,
                    'n_Dense': args.dense,
                    'Acc_train': train_acc,
                    'Loss_train': train_loss,
                    'auROC_train': train_auroc,
                    'F1_train': train_f1,
                    'Acc_val': val_acc,
                    'Loss_val': val_loss,
                    'auROC_val': val_auroc,
                    'F1_val': val_f1,
                    'Acc_test': test_acc,
                    'Loss_test': test_loss,
                    'auROC_test': test_auroc,
                    'F1_test': test_f1
                },
                ignore_index=True)

            ##########################
            ## Model Interpretation ##
            ##########################

            if (args.imp_m.lower() in ['t', 'true']
                    or args.imp_k.lower() in ['t', 'true']):
                # Step 1: Read in x data meta data
                key = pd.read_csv(
                    args.original_df,
                    sep='\t',
                    index_col=0,
                )
                key_index_list = key.columns.str.split('_', expand=True).values
                key.columns = pd.MultiIndex.from_tuples([
                    (x[1], x[0]) for x in key_index_list
                ])
                key = key.sort_index(axis=1)
                motifs = key.columns.levels[0].values
                omic_stack = list(key[list(key.columns.levels[0])[0]])
                omic_stack.append('PA')

                # Calculate Motif importance (zero-out-each-feature)
                if args.imp_m.lower() in ['t', 'true']:
                    motif_imp = np.empty((0, 2))
                    model_mot_imp = model
                    for mx in range(0, x_test.shape[2] - 1):
                        x_test_tmp = np.copy(x_test)
                        x_test_tmp[:, ..., mx, :] = 0
                        yhat_m_imp = model_mot_imp.predict(x_test_tmp)
                        auroc_m_imp = roc_auc_score(y_test, yhat_m_imp)
                        imp_m_auc = test_auroc - auroc_m_imp
                        motif_imp = np.vstack(
                            (motif_imp, np.array([motifs[mx], imp_m_auc])))
                    motif_imp = pd.DataFrame(
                        motif_imp, columns=['motif', 'auROC_test_decrease'])
                    if n == 0:
                        motif_imps = motif_imp
                    else:
                        motif_imps = pd.merge(motif_imps,
                                              motif_imp,
                                              on='motif')

                # Calculate Kernel Importance (zero-out-weights)
                if args.imp_k.lower() in ['t', 'true']:
                    all_weights = model.get_weights()
                    all_weights_2 = all_weights.copy()
                    print(
                        'Performing Leave-One-Kernel-Out importance analysis...'
                    )
                    for kx in range(0, args.filters):
                        orig_weights = all_weights[0][:, :, 0, kx].copy()
                        orig_weights = orig_weights.tolist()
                        orig_weights = [i for l in orig_weights for i in l]
                        conv2d_drop = copy.deepcopy(all_weights)
                        conv2d_drop[0][:, :, 0, kx] = 0.0
                        print(conv2d_drop[0][1, :, 0, 0:10])
                        model_LOKO = tf.keras.models.clone_model(model)
                        model_LOKO.set_weights(weights=conv2d_drop)
                        yhat_k_imp = model_LOKO.predict(x_test)
                        auroc_k_imp = roc_auc_score(y_test, yhat_k_imp)
                        imp_k_auc = test_auroc - auroc_k_imp
                        old = roc_auc_score(y_test, model.predict(x_test))
                        print(old, imp_k_auc)
                        kern_imp.append([n, imp_k_auc, orig_weights])

        if args.imp_m.lower() in ['t', 'true']:
            print('Snapshot of motif importance scores...')
            motif_imps = motif_imps.set_index('motif')
            motif_imps = motif_imps.apply(pd.to_numeric, errors='coerce')
            motif_imps['mean_imp'] = motif_imps.mean(axis=1)
            motif_imps = motif_imps.sort_values('mean_imp', ascending=False)
            print(motif_imps['mean_imp'].head())
            motif_imps['mean_imp'].to_csv(args.save + "_Motif_imp",
                                          sep="\t",
                                          index=True)

        if args.imp_k.lower() in ['t', 'true']:
            print('\nSnapshot of kernel importance scores:')
            kern_imp = pd.DataFrame(
                kern_imp, columns=['rep', 'auROC_test_decrease', 'kernel'])
            print(kern_imp.head())
            kern_imp.to_csv(args.save + "_Kernel_imp", sep="\t", index=True)

        final_results.to_csv(args.save + "_results.txt", header=True, sep='\t')

        # Save summary of results to RESULTS.txt
        calc_cols = [
            'F1_threshold', 'Acc_train', 'Acc_val', 'Acc_test', 'Loss_train',
            'Loss_val', 'Loss_test', 'auROC_train', 'auROC_val', 'auROC_test',
            'F1_train', 'F1_val', 'F1_test'
        ]
        final_results = final_results.drop(['Rep'], axis=1)
        std = final_results[calc_cols].std(axis=0, skipna=True)
        std = std.add_suffix('_std')
        mean = final_results[calc_cols].mean(axis=0, skipna=True)
        mean = mean.add_suffix('_mean')
        str_cols = final_results.drop(calc_cols, axis=1).iloc[0]
        str_cols = str_cols.append(pd.Series([args.n_reps], index=['Reps']))
        summary = pd.concat([str_cols, mean, std])

        #summary.set_index('index', inplace=True)
        print('\n### Summary of results on test set ###')
        print(summary.filter(like='test_mean', axis=0))
        with open("RESULTS.txt", 'a') as f:
            summary.to_frame().transpose().to_csv(f,
                                                  header=f.tell() == 0,
                                                  sep='\t')

    print('Done!')
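
A minimal standalone sketch (assumptions: a fitted binary Keras model plus
x_test, y_test and a baseline test_auroc, as in the replicate loop above) of
the zero-out importance idea used for both motifs and kernels: blank one input
column, re-predict, and record the drop in test auROC.

import numpy as np
from sklearn.metrics import roc_auc_score

def column_importance(model, x_test, y_test, test_auroc):
    # Importance of column mx = auROC lost when that column is zeroed out.
    drops = []
    for mx in range(x_test.shape[2]):
        x_tmp = np.copy(x_test)
        x_tmp[:, :, mx, :] = 0.0
        yhat = model.predict(x_tmp).ravel()
        drops.append(test_auroc - roc_auc_score(y_test, yhat))
    return np.array(drops)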
print(dummy_y)
print(X.shape)
print(X)


# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(4, input_dim=4, kernel_initializer='normal',
                    activation='relu'))
    model.add(Dense(3, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=baseline_model,
                            epochs=200,
                            batch_size=5,
                            verbose=0)

# Note: KerasClassifier has no save() method of its own; the underlying Keras
# model is only available (as estimator.model) after the wrapper has been fit.

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))
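
cross_val_score clones and fits internal copies of the estimator, so the
wrapper defined above is never fitted itself. A hedged sketch (reusing the same
X, dummy_y and baseline_model, with the output filename purely illustrative) of
how a final model could be persisted after the cross-validated evaluation:

estimator.fit(X, dummy_y)               # fit the wrapper once on the full data
estimator.model.save("final_model.h5")  # the fitted Keras model lives in .model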