Example #1
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_elo,
          Y_train_elo,
          epochs=10,
          batch_size=50,
          validation_split=0.2,
          verbose=1)
model.test_on_batch(X_test_elo, Y_test_elo, sample_weight=None)
model.evaluate(X_test_elo, Y_test_elo, verbose=1)
pred = model.predict_classes(X_test_elo, verbose=1)

plot_model(model, to_file='model.png', show_shapes=True)

SVG(model_to_dot(model).create(prog='dot', format='svg'))

print(confusion_matrix(Y_test_elo, pred))
print(classification_report(Y_test_elo, pred))
print(accuracy_score(Y_test_elo, pred))
fpr_elo, tpr_elo, thresholds_elo = roc_curve(Y_test_elo, pred)

auc_elo = auc(fpr_elo, tpr_elo)  # store under a new name so sklearn's auc() is not shadowed

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_elo,
         tpr_elo,
         label='Model (area = {:.3f})'.format(auc_elo))
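
Note that predict_classes was removed from Sequential in TensorFlow 2.6, so on recent versions the prediction step needs the generic predict. A minimal sketch of the equivalent for this binary sigmoid model, assuming the same model and X_test_elo as above:

import numpy as np

# predict() returns sigmoid probabilities; threshold at 0.5 to recover hard labels
probs = model.predict(X_test_elo)
pred = (probs > 0.5).astype('int32').ravel()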
Example #2
# In[ ]:

plot_loss(model_output.history, COURSE_LIST[course_idx])

# In[ ]:

plot_accuracy(model_output.history, COURSE_LIST[course_idx])

# In[ ]:

course_metrics['course_name'].append(COURSE_LIST[course_idx])
course_metrics['val_binary_accuracy'].append(
    model_output.history['val_binary_accuracy'][-1])
course_metrics['test_accuracy'].append(
    accuracy_score(labels_test, model.predict_classes(features_test)))
course_metrics['test_f1_score'].append(
    f1_score(labels_test, model.predict_classes(features_test)))

# ### 2. CS50x - Introduction to Computer Science I

# In[ ]:

course_idx = 1
print(COURSE_LIST[course_idx])

# In[ ]:

course_loc = DATA_DIR + COURSE_LIST[course_idx]
print(course_loc)
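
plot_loss and plot_accuracy are notebook helpers defined elsewhere; a minimal sketch of what such a helper might look like, assuming a Keras history dict and matplotlib:

import matplotlib.pyplot as plt

def plot_loss(history, course_name):
    # history is the dict returned by model.fit(...).history
    plt.plot(history['loss'], label='train')
    plt.plot(history['val_loss'], label='validation')
    plt.title('Loss: {}'.format(course_name))
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()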
Example #3
import numpy as np

model = load_model('saved_model.h5')
test_data = [
    "A lot of good things are happening. We are respected again throughout the world, and that's a great thing"
]
max_features = 200
# NOTE: for meaningful predictions the tokenizer must reproduce the training-time
# vocabulary; fitting it on the inference text alone yields indices the model never saw.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)
max_len = 28
X = pad_sequences(X, maxlen=max_len)
class_names = ['positive', 'negative']
preds = model.predict(X)
print(preds)
classes = model.predict_classes(X)
print(classes)
print(class_names[int(classes[0])])

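A common way to keep inference consistent with training is to persist the fitted tokenizer alongside the model; a minimal sketch using pickle (the filename is illustrative):

import pickle

# at training time
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)

# at inference time, instead of refitting on the input text
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
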
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
Example #4
# summarize history for loss
from matplotlib import pyplot as plt
plt.plot(model_history.history['loss'])
plt.plot(model_history.history['val_loss'])  # assumes the model was fit with validation data

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

"""We will check for the acuraccy on testing dataset"""

model.evaluate(test_images, to_categorical(test_labels))

"""### Prediction"""

ans = model.predict(test_kaggle)

import numpy as np
ans = np.argmax(ans, axis=1)

ans[:5]

import pandas as pd

predicted_classes = model.predict_classes(test_kaggle)
submissions = pd.DataFrame({"ImageId": list(range(1, len(predicted_classes) + 1)),
                            "Label": predicted_classes})
submissions.to_csv("subbmision2.csv", index=False, header=True)

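For a softmax output layer, predict_classes is simply an argmax over predict, so the two routes above should produce the same labels; a quick consistency check, assuming the arrays computed above:

import numpy as np

# both are argmax over the same softmax probabilities
assert np.array_equal(ans, np.ravel(predicted_classes))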

Example #5
print "class weights = ", class_weights

# Create the classifier object using the model-building function defined above
classifier = KerasClassifier(build_fn=classifier_builder,
                             batch_size=16,
                             nb_epoch=1)

if (os.access("lstm_model.h5", os.F_OK)):
    classifier = load_model('lstm_model.h5')
for i in range(0, runLoop):
    hist = classifier.fit(X_train,
                          y_train,
                          batch_size=256,
                          epochs=runEpoch,
                          class_weight=class_weights,
                          validation_data=(X_test, y_test))
    print "loop i=", i, "hist:", hist.history
    y_predict = classifier.predict_classes(X_test, batch_size=256)
    y_predict = [j[0] for j in y_predict]
    precision = precision_score(y_test, y_predict, average='macro')
    recall = recall_score(y_test, y_predict, average='macro')
    print("Precision:", precision)
    print("Recall:", recall)

if (os.access("lstm_model.h5", os.F_OK)):
    print(classifier.summary())
    classifier.save('lstm_model.h5')
else:
    print(classifier.model.summary())
    classifier.model.save('lstm_model.h5')
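
class_weights is computed before this snippet; a typical construction with scikit-learn, assuming the training labels y_train:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))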
Example #6
def main():
    print('Using Keras version: ', keras.__version__)

    parser = argparse.ArgumentParser(usage='usage: %(prog)s [options]')
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=1,
        type=int)
    parser.add_argument('-s',
                        '--suff',
                        dest='suffix',
                        help='Option to choose suffix for training',
                        default='',
                        type=str)
    parser.add_argument('-p',
                        '--para',
                        dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan',
                        default=0,
                        type=int)
    parser.add_argument(
        '-i',
        '--inputs_file_path',
        dest='inputs_file_path',
        help=
        'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
        default='',
        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    # NOTE: this hard-coded path overrides the -i/--inputs_file_path option parsed above.
    #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = '/eos/user/b/bmarzocc/HHWWgg/January_2021_Production/2017/'

    hyp_param_scan = args.hyp_param_scan
    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size = 32
        #epochs = 10
        #batch_size=200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,
                                   'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) +
                                 "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    selection_criteria = '( (Leading_Photon_pt/CMS_hgg_mass) > 1/3 && (Subleading_Photon_pt/CMS_hgg_mass) > 1/4 )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile).items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' %
              (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' %
              (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel value to speed up training.
        data = data.mask(data < -25., -9.)
        #data = data.replace(to_replace=-99.,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" %
          (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independant training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'weight']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'weight']

    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted
    #bckgsum_weighted = DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'unweighted']

    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted
    #bckgsum_unweighted = DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted

    HHsum_weighted = 2 * HHsum_weighted
    HHsum_unweighted = 2 * HHsum_unweighted

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
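
    # NOTE: the per-process assignments above differ only in the process name; an
    # equivalent, more compact form (a sketch using the same sums) would be:
    #   bckg_procs = ['Hgg', 'DiPhoton', 'GJet', 'QCD', 'DY', 'TTGsJets', 'WGsJets', 'WW']
    #   if weights == 'BalanceYields':
    #       hh_classweight = HHsum_unweighted / HHsum_weighted
    #       bckg_classweight = HHsum_unweighted / bckgsum_weighted
    #   else:  # 'BalanceNonWeighted'
    #       hh_classweight = 1.
    #       bckg_classweight = HHsum_unweighted / bckgsum_unweighted
    #   traindataset.loc[traindataset['process_ID'] == 'HH', 'classweight'] = hh_classweight
    #   for proc in bckg_procs:
    #       traindataset.loc[traindataset['process_ID'] == proc, 'classweight'] = bckg_classweight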

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' % (output_directory)
    with open(column_order_txt, "wb") as column_order_file:
        for tc_i in training_columns:
            pickle.dump(tc_i + "\n", column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights == 'BalanceYields':
        trainingweights = traindataset.loc[:, 'classweight'] * traindataset.loc[:, 'weight']
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates,
                              epochs=epochs,
                              batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1)
            grid_result = grid.fit(X_train,
                                   Y_train,
                                   shuffle=True,
                                   sample_weight=trainingweights)
            print("Best score: %f , best params: %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write(
                "Best score: %f , best params: %s\n" %
                (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" %
                      (mean, stdev, param))
                hyp_param_scan_results.write(
                    "Mean (stdev) test score: %f (%f) with parameters: %r\n" %
                    (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100,
                                                   monitor='val_loss',
                                                   min_delta=0.01,
                                                   verbose=1)
            #model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,
                                Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch'
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.png')
            Plotter.save_plots(dir=plots_dir,
                               filename=acc_progress_filename + '.pdf')

            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
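
When this script is rerun with -t 0, load_trained_model reloads the saved network. A minimal sketch of how the artifacts written above can be reloaded with Keras, assuming no custom objects (the directory name is illustrative):

from keras.models import load_model, model_from_json

model = load_model('HHWWyyDNN_binary_suffix_BalanceYields/model.h5')

# or rebuild from the serialised architecture plus the saved weights
with open('HHWWyyDNN_binary_suffix_BalanceYields/model_serialised.json') as json_file:
    model = model_from_json(json_file.read())
model.load_weights('HHWWyyDNN_binary_suffix_BalanceYields/model_weights.h5')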
Example #7
def main():
    print('Using Keras version: ', keras.__version__)

    parser = argparse.ArgumentParser(usage='usage: %(prog)s [options]')
    parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int)
    parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str)
    parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int)
    parser.add_argument('-i', '--inputs_file_path', dest='inputs_file_path', help='Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'

    hyp_param_scan=args.hyp_param_scan
    # Set model hyper-parameters
    weights='BalanceYields'# 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split=0.1
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size=200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 200
        batch_size=400

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix,weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file,'w')
    additional_hyperparams.write("optimizer: "+optimizer+"\n")
    additional_hyperparams.write("learn_rate: "+str(learn_rate)+"\n")
    additional_hyperparams.write("epochs: "+str(epochs)+"\n")
    additional_hyperparams.write("validation_split: "+str(validation_split)+"\n")
    additional_hyperparams.write("weights: "+weights+"\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory,'plots/')
    input_var_jsonFile = open('input_variables.json','r')
    selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'
    # selection_criteria = '(AtLeast4GoodJets0Lep==1)'
    # selection_criteria = '(passPhotonSels==1 && passbVeto==1 && ExOneLep==1 && goodJets==1)'
    #selection_criteria = '( ((Leading_Photon_pt/CMS_hgg_mass) > 0.35) && ((Subleading_Photon_pt/CMS_hgg_mass) > 0.25) && passbVeto==1 && ExOneLep==1 && N_goodJets>=1)'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile).items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key,var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Create instance of the input files directory (this hard-coded path overrides the one set above)
    #inputs_file_path = '/afs/cern.ch/work/a/atishelm/public/ForJosh/2017_DataMC_ntuples_moreVars'
    inputs_file_path = '/eos/user/r/rasharma/post_doc_ihep/double-higgs/ntuples/September29/MVANtuples'
    #inputs_file_path = '/eos/user/a/atishelm/ntuples/HHWWgg_DataSignalMCnTuples/PromptPromptApplied/'
    #inputs_file_path = 'PromptPromptApplied/'

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' %(output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path))
        data = load_data(inputs_file_path,column_headers,selection_criteria)
        # Change sentinel value to speed up training.
        data = data.replace(to_replace=-999.000000,value=-9.0)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independant training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.1)
    valdataset.to_csv((output_directory+'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)


    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'weight']
    weights_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'weight']
    weights_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'weight']
    weights_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'weight']
    weights_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'weight']
    weights_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'weight']

    HHsum_weighted= sum(weights_for_HH)
    GJetsum_weighted= sum(weights_for_GJet)
    DiPhotonsum_weighted= sum(weights_for_DiPhoton)
    TTGGsum_weighted= sum(weights_for_TTGG)
    TTGJetssum_weighted= sum(weights_for_TTGJets)
    TTJetssum_weighted= sum(weights_for_TTJets)
    WJetssum_weighted= sum(weights_for_WJets)
    ttHsum_weighted= sum(weights_for_ttH)
    DYsum_weighted= sum(weights_for_DY)
    #bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted+ttHsum_weighted
    bckgsum_weighted = DiPhotonsum_weighted+WJetssum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID']=='HH', 'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID']=='DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID']=='GJet', 'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID']=='DY', 'unweighted']
    nevents_for_TTGG = traindataset.loc[traindataset['process_ID']=='TTGG', 'unweighted']
    nevents_for_TTGJets = traindataset.loc[traindataset['process_ID']=='TTGJets', 'unweighted']
    nevents_for_TTJets = traindataset.loc[traindataset['process_ID']=='TTJets', 'unweighted']
    nevents_for_WJets = traindataset.loc[traindataset['process_ID']=='WJets', 'unweighted']
    nevents_for_ttH = traindataset.loc[traindataset['process_ID']=='ttH', 'unweighted']

    HHsum_unweighted= sum(nevents_for_HH)
    GJetsum_unweighted= sum(nevents_for_GJet)
    DiPhotonsum_unweighted= sum(nevents_for_DiPhoton)
    TTGGsum_unweighted= sum(nevents_for_TTGG)
    TTGJetssum_unweighted= sum(nevents_for_TTGJets)
    TTJetssum_unweighted= sum(nevents_for_TTJets)
    WJetssum_unweighted= sum(nevents_for_WJets)
    ttHsum_unweighted= sum(nevents_for_ttH)
    DYsum_unweighted= sum(nevents_for_DY)

    #bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted+ttHsum_unweighted
    bckgsum_unweighted = DiPhotonsum_unweighted+WJetssum_unweighted


    if weights=='BalanceYields':
        print('HHsum_weighted= ' , HHsum_weighted)
        print('ttHsum_weighted= ' , ttHsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('WJetssum_weighted= ', WJetssum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_weighted/bckgsum_weighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_weighted/bckgsum_weighted)

    if weights=='BalanceNonWeighted':
        print('HHsum_unweighted= ' , HHsum_unweighted)
        print('ttHsum_unweighted= ' , ttHsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('WJetssum_unweighted= ', WJetssum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='HH', ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID']=='GJet', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DY', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='DiPhoton', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='WJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGG', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTGJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='TTJets', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID']=='ttH', ['classweight']] = (HHsum_unweighted/bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' %(output_directory)
    with open(column_order_txt, "wb") as column_order_file:
        for tc_i in training_columns:
            pickle.dump(tc_i + "\n", column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    ####################################################################################
    # Weights applied during training. You will also need to update the class weights if
    # you are going to change the event weights applied. Introduce class weights and any
    # event weight you want to use here.
    #trainingweights = traindataset.loc[:,'classbalance']#*traindataset.loc[:,'weight']
    #trainingweights = np.array(trainingweights)

    # Temp hack to be able to change class weights without remaking dataframe
    #for inde in xrange(len(trainingweights)):
    #    newweight = 13243.0/6306.0
    #    trainingweights[inde]= newweight
    #print 'training event weight = ', trainingweights[0]

    # Event weights calculation so we can correctly apply event weights to diagnostic plots.
    # use separate list because we don't want to apply class weights in plots.
    # Event weights if wanted
    train_weights = traindataset['weight'].values
    test_weights = valdataset['weight'].values

    # Weights applied during training.
    if weights=='BalanceYields':
        trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight']
    if weights=='BalanceNonWeighted':
        trainingweights = traindataset.loc[:,'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.pdf'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name,'a')
            time_str = str(time.localtime())+'\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates=[0.00001, 0.0001]
            epochs = [150,200]
            batch_size = [400,500]
            param_grid = dict(learn_rate=learn_rates,epochs=epochs,batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model,verbose=0)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_result = grid.fit(X_train,Y_train,shuffle=True,sample_weight=trainingweights)
            print("Best score: %f , best params: %s" % (grid_result.best_score_,grid_result.best_params_))
            hyp_param_scan_results.write("Best score: %f , best params: %s\n" %(grid_result.best_score_,grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean,stdev,param))
                hyp_param_scan_results.write("Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean,stdev,param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=30, monitor='val_loss', verbose=1)
            model = baseline_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,Y_train,validation_split=validation_split,epochs=epochs,batch_size=batch_size,verbose=1,shuffle=True,sample_weight=trainingweights,callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            Plotter.plot_training_progress_acc(histories, labels)
            acc_progress_filename = 'DNN_acc_wrt_epoch.png'
            Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
    else:
        model_name = os.path.join(output_directory,'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory,'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory,'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory,'model_serialised.json')
    with open(model_json_name,'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,'model_schematic.eps')
    print "DEBUG: ",model_schematic_name
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)
    # plot_model(model, to_file='model_schematic.eps', show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    '''
    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')
    '''

    # Make overfitting plots of output nodes
    Plotter.binary_overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)
    print "DEBUG: Y_train shape: ",Y_train.shape

    # # Get true process integers for training dataset
    # original_encoded_train_Y = []
    # for i in xrange(len(result_probs)):
    #     if Y_train[i][0] == 1:
    #         original_encoded_train_Y.append(0)
    #     if Y_train[i][1] == 1:
    #         original_encoded_train_Y.append(1)
    #     if Y_train[i][2] == 1:
    #         original_encoded_train_Y.append(2)
    #     if Y_train[i][3] == 1:
    #         original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    # result_classes_test = newencoder.inverse_transform(result_classes_test)
    # result_classes_train = newencoder.inverse_transform(result_classes)
    e = shap.DeepExplainer(model, X_train[:400, ])
    shap_values = e.shap_values(X_test[:400, ])
    Plotter.plot_dot(title="DeepExplainer_sigmoid_y0", x=X_test[:400, ], shap_values=shap_values, column_headers=column_headers)
    Plotter.plot_dot_bar(title="DeepExplainer_Bar_sigmoid_y0", x=X_test[:400,], shap_values=shap_values, column_headers=column_headers)
    #e = shap.GradientExplainer(model, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="GradientExplainer_sigmoid_y0", x=X_test[:100, ], shap_values=shap_values, column_headers=column_headers)
    #e = shap.KernelExplainer(model.predict, X_train[:100, ])
    #shap_values = e.shap_values(X_test[:100, ])
    #Plotter.plot_dot(title="KernelExplainer_sigmoid_y0", x=X_test[:100, ],shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar(title="KernelExplainer_Bar_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)
    #Plotter.plot_dot_bar_all(title="KernelExplainer_bar_All_Var_sigmoid_y0", x=X_test[:100,], shap_values=shap_values, column_headers=column_headers)
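
    # For reference: Plotter.plot_dot / plot_dot_bar wrap shap's summary plots. With the
    # shap library directly, the equivalent is roughly (a sketch):
    #   shap.summary_plot(shap_values[0], X_test[:400, ], feature_names=column_headers)
    #   shap.summary_plot(shap_values[0], X_test[:400, ], feature_names=column_headers,
    #                     plot_type='bar')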

    # Create confusion matrices for training and testing performance
    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'columns')
    # Plotter.save_plots(dir=plots_dir, filename='yields_norm_columns_confusion_matrix_TEST.png')

    # Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    # Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    # Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(Y_train, result_probs, Y_test, result_probs_test, 1, 'BinaryClassifierROC', train_weights, test_weights)
Example #8

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])

history = model.fit(X_train,
                    y_train,
                    epochs=35,
                    batch_size=150, 
                    validation_data=(X_val, y_val))

model.save('best_model_imag_3clases.h5')

Y_pred_proba = model.predict(X_test)
Y_pred = model.predict_classes(X_test)
test_loss_trained_net, test_acc_trained_net = model.evaluate(X_test, y_test)
print('test_acc:', test_acc_trained_net)
# Result: 75% accuracy on the test set

matrix = confusion_matrix(y_test.argmax(axis=1), Y_pred)

labels_grouped = np.array(["Musica popular", "Musica melodica", "Musica ritmica"])
plot_confusion_matrix(y_test.argmax(axis=1), Y_pred, classes=labels_grouped,
                      title='Confusion matrix, without normalization')


# Plot the loss as a function of the epochs
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
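
The snippet stops after extracting the curves; the usual next step, following the same matplotlib pattern as Example #4, is:

import matplotlib.pyplot as plt

epochs_range = range(1, len(loss_values) + 1)
plt.plot(epochs_range, loss_values, 'bo', label='Training loss')
plt.plot(epochs_range, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()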
Example #9
def modeling(conn, sentences, lib, dz):
#def modeling(conn, df, lib, dz):
  
    #pts = pd.read_sql("SELECT DISTINCT SUBJECT_ID from UFM", conn)
    #pts =list(set(pts.SUBJECT_ID))
    #pool = []
    #for d in dz:
    #    pool += d.pos + d.neg
    np.random.seed(7)
    decay_rate = .0002  # time-decay constant; distinct from the decay() helper called below
    data = []; train = []; test = []
    keys = [k[1] for k in lib]
    
    admits = pd.read_sql("SELECT * from admissions", conn)
    
    for itr in range(0,5):
        print ("Sess: {0}".format(itr))
        for d in dz:
            neg = random.sample(d[1], len(d[0]))
            temp = d[0] + neg
            random.shuffle(temp)
            t1, t2 = train_test_split(temp, test_size=.2)  # sklearn.model_selection.train_test_split
            train +=t1; test +=t2
                    
        #X stands for raw indexes of feature input; V stands for raw feature input
        #W stands for word vectors from feature input trained by Word2Vec
        X_train = []; t_train = []; W_train = []; Y_train = []
        X_test = []; t_test = []; W_test = []; Y_test = []
        V_train = []; V_test = []
    
        count=0
        for t in train:
            print (count)
            count+=1

            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
            #order subject by time of entry for each sentence (admission)
            corpus = sorted(corpus, key = lambda x: x[1])
            #transpose into nx2xd from 2xnxd
            #this way, corpus[0] refers to words and corpus[1] refers to times
            corpus = list(map(list, zip(*corpus)))                  
            x_train = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            x = np.array(list(map(lambda x: keys.index(x), x_train)))
     
            #configure each timestamp to reflect time elapsed from first time entry
            #calculate time decay from initial event
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
                
            #append
            X_train.append(x)
            V_train.append(np.array(x_train))
            t_train.append(np.array(t_stamps))
            Y_train.append(t[3])
                
        print ("X_train made.")

        count = 0
        for t in test:
            print (count)
            count+=1
                
            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
                
            corpus = sorted(corpus, key = lambda x: x[1])
            corpus = list(map(list, zip(*corpus)))                  
            x_test = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
            x = np.array(list(map(lambda x: keys.index(x), x_test)))
            
            X_test.append(x)
            V_test.append(np.array(x_test))
            t_test.append(np.array(t_stamps))
            Y_test.append(t[3])            
                           
        #training normal LSTM and CNN-LSTM          
        top_words = [9444]
        max_review_length = [1000]
        embedding_length = [300]          
        X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0])
        X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0])


        #build model using KerasClassifier and Gridsearch
        cnn = KerasClassifier(build_fn=cnn_train, verbose=1)
        lstm = KerasClassifier(build_fn=lstm_train, verbose=1)
        d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose = 1)
        d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose = 1)
        # define the grid search parameters

        batch_size = [32, 64, 128]
        epochs = [20, 50, 100, 200]
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = (10.0**np.arange(-4,-1)).tolist()
        momentum = np.arange(.5,.9,.1).tolist()
        neurons = [50, 100, 200]
        dropout_W = [.1, .2, .5]
        dropout_U = [.1, .2, .5]
        W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = ['uniform', 'normal', 'zero']
        #activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
        param_grid = dict(top_words=top_words, max_length = max_review_length, embedding_length = embedding_length, batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode)
        d_param_grid = dict(input_shape = [(max_review_length[0], embedding_length[0])], batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode)
        lr_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'penalty':('l1','l2')}
        sv_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
        rf_params = {'criterion': ['gini', 'entropy']}

        #setup GridSearch w/ cross validation
        cnn_grid = GridSearchCV(estimator=cnn, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        lstm_grid = GridSearchCV(estimator=lstm, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_cnn_grid = GridSearchCV(estimator=d_cnn, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_lstm_grid = GridSearchCV(estimator=d_lstm, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        # GridSearchCV takes a single estimator, so each classical model gets its own grid
        lr_grid = GridSearchCV(estimator = LR, param_grid = lr_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)
        sv_grid = GridSearchCV(estimator = SVM, param_grid = sv_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)
        rf_grid = GridSearchCV(estimator = RF, param_grid = rf_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)

        # Fit the model
        cnn_result = cnn_grid.fit(X_train, Y_train)
        lstm_result = lstm_grid.fit(X_train, Y_train) 
        d_cnn_result = d_cnn_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train)
        d_lstm_result = d_lstm_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train) 
        lr_result = lr_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1], Y_train)
        sv_result = sv_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1], Y_train)
        rf_result = rf_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1], Y_train)
        
        #grid_search results:
        print("CNN Best: %f using %s" % (cnn_result.best_score_, cnn_result.best_params_))
        means = cnn_result.cv_results_['mean_test_score']
        stds = cnn_result.cv_results_['std_test_score']
        params = cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("LSTM Best: %f using %s" % (lstm_result.best_score_, lstm_result.best_params_))
        means = lstm_result.cv_results_['mean_test_score']
        stds = lstm_result.cv_results_['std_test_score']
        params = lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("Decay CNN Best: %f using %s" % (d_cnn_result.best_score_, d_cnn_result.best_params_))
        means = d_cnn_result.cv_results_['mean_test_score']
        stds = d_cnn_result.cv_results_['std_test_score']
        params = d_cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Decay LSTM Best: %f using %s" % (d_lstm_result.best_score_, d_lstm_result.best_params_))
        means = d_lstm_result.cv_results_['mean_test_score']
        stds = d_lstm_result.cv_results_['std_test_score']
        params = d_lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Best of Classics: %f using %s, %s" % (classics_result.best_score_, classics_result.best_estimator_, classics_result.best_params_))    
        means = classics_result.cv_results_['mean_test_score']
        stds = classics_result.cv_results_['std_test_score']
        params = classics_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
        
        #KFold = 5
        #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
        #cvscores = []
        #for training, testing in kfold.split(X_train, Y_train):     
            # Fit the model
            #model.fit(X[training], Y[training], nb_epoch=150, batch_size=10, verbose=0)
            # evaluate the model
            #scores = model.evaluate(X[testing], Y[testing], verbose=0)
            #print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
            #cvscores.append(scores[1] * 100)
        #print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

 ######TESTING#######
        cnn = cnn_train(top_words = top_words[0], max_length = max_review_length[0], embedding_length=embedding_length[0])
        lstm = lstm_train(top_words = top_words[0], max_length = max_review_length[0], embedding_length=embedding_length[0])
            
        cnn.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)
        lstm.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)

        #testing
        predictions_lstm = lstm.predict_classes(X_test)
        predictions_cnn = cnn.predict_classes(X_test)

        acc = accuracy_score(Y_test, predictions_lstm)
        f1 = f1_score (Y_test, predictions_lstm)
        auc = roc_auc_score (Y_test, predictions_lstm)
        scores_lstm = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        acc = accuracy_score(Y_test, predictions_cnn)
        f1 = f1_score (Y_test, predictions_cnn)
        auc = roc_auc_score (Y_test, predictions_cnn)
        scores_cnn = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        print ("LSTM DATA: ")
        for s in scores_lstm:
            print("%s: %.2f" %(s[0], s[1]), end = " ")
        print ("")
        print ("CNN DATA: ")
        for s in scores_cnn:
            print("%s: %.2f" %(s[0], s[1]), end = " ")        
        
        
        data.append((scores_lstm, scores_cnn))

    return data
Example #10

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_Perf_based,
          Y_train_Perf_based,
          epochs=100,
          batch_size=20,
          validation_split=0.3,
          verbose=1)

model.test_on_batch(X_test_Perf_based, Y_test_Perf_based, sample_weight=None)

pred_Perf_based = model.predict_classes(X_test_Perf_based, verbose=1)

print(confusion_matrix(Y_test_Perf_based, pred_Perf_based))
print(classification_report(Y_test_Perf_based, pred_Perf_based))
print(accuracy_score(Y_test_Perf_based, pred_Perf_based))
fpr_Perf_based, tpr_Perf_based, thresholds_Perf_based = roc_curve(
    Y_test_Perf_based, pred_Perf_based)

auc_keras = auc(fpr_Perf_based, tpr_Perf_based)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_Perf_based,
         tpr_Perf_based,
         label='Performance Based Model (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
Example #11
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
else:
    model = create_model()
    # model.fit(train_data, train_label, batch_size=20, epochs=100, shuffle=True, verbose=1, validation_split=0.2)
    model.fit(train_data,
              train_label,
              batch_size=10,
              epochs=150,
              shuffle=True,
              verbose=1,
              validation_split=0.2)
    result = model.evaluate(test_data, test_label, batch_size=1000)

    print('loss:%5.6f   acct:%5.6f' % (result[0], result[1]))

    #
    test_data = np.array([binary_encode(i) for i in range(1, 101)])
    # pred=model.predict(test_data)
    pred = model.predict_classes(test_data)

    # init_labels = lb.inverse_transform(pred)
    # print(init_labels)

    # Convert the one-hot-encoded prediction back to a normal letter
    results = []
    for i in range(1, 101):
        results.append('{}'.format(['fizzbuzz', 'fizz', 'buzz',
                                    i][pred[i - 1]]))
    print(', '.join(results))
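
binary_encode and create_model are defined elsewhere in this script; in the classic fizzbuzz network, binary_encode is the bit representation of the integer. A sketch, assuming a fixed input width such as 10 bits:

import numpy as np

NUM_DIGITS = 10  # assumed input width

def binary_encode(i, num_digits=NUM_DIGITS):
    # little-endian binary digits of i
    return np.array([i >> d & 1 for d in range(num_digits)])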