Exemple #1
0
    # ## noisy labels
    NOISE_LEVEL = 0.46  # what part of training labels are permuted
    perm = np.array([7, 9, 0, 4, 2, 1, 3, 5, 6,
                     8])  # noise permutation (from Reed)

    noise = perm[y_train]

    # replace some of the training labels with permuted (noise) labels.
    # make sure each categories receive an equal amount of noise
    from sklearn.model_selection import StratifiedShuffleSplit

    _, noise_idx = next(
        iter(
            StratifiedShuffleSplit(n_splits=1,
                                   test_size=NOISE_LEVEL,
                                   random_state=seed).split(X_train, y_train)))
    y_train_noise = y_train.copy()
    y_train_noise[noise_idx] = noise[noise_idx]

    # actual noise level
    1. - np.mean(y_train_noise == y_train)

    # split training data to training and validation
    # break the training set to 10% validation which we will use for early
    #  stopping.
    train_idx, val_idx = next(
        iter(
            StratifiedShuffleSplit(n_splits=1,
                                   test_size=0.1,
                                   random_state=seed).split(
Exemple #2
0
def rodar_experimento(dir_experimento, documentos_validos, freq_min,
                      op_stopwords, op_ica, op_tesauro, op_tam_vec, lista_k,
                      rnd, exp, w2v_geral, ftt_geral, glv_geral):
    """Run one experiment end to end on a single stratified 80/20 split.

    For the split, the function builds the training corpus and vocabulary,
    trains the domain ("jur") models, builds summed-vector representations,
    trains a Doc2Vec model, infers vectors for the test documents and then,
    for every model column of the test-vector CSV, saves clustering scores
    and computes similarity matrices.

    Args:
        dir_experimento: Sub-directory name used under ``dados/`` and
            ``resultados/`` for this experiment's files.
        documentos_validos: Object exposing ``id`` and ``Assunto`` columns;
            ``Assunto`` is the stratification target.
        freq_min, op_stopwords, op_ica, op_tesauro: Forwarded to
            ``extrair_vocabulario``.
        op_tam_vec: Forwarded to the model-training and representation
            helpers.
        lista_k: Forwarded to ``computar_scores_agrupamento``.
        rnd: ``random_state`` of the stratified split.
        exp: Experiment identifier, used in log messages and forwarded to
            the helpers.
        w2v_geral, ftt_geral, glv_geral: Forwarded to
            ``criar_representacoes_soma_ger``.

    Side effects:
        Creates ``resultados/<dir_experimento>`` and writes the Doc2Vec
        model, the updated ``vetores_teste.csv`` and one ``.npy`` score file
        per model.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rnd)
    X = documentos_validos.id
    y = documentos_validos.Assunto
    stopwords = nltk.corpus.stopwords.words('portuguese')
    diretorio = "dados/corpus_tratado/"
    le = LabelEncoder()

    dir_dados = "dados/" + dir_experimento
    dir_resultados = 'resultados/' + dir_experimento
    arq_vetores_teste = dir_dados + "/vetores_teste.csv"

    # Each split yields (train indices, test indices).
    for indices_treino, indices_teste in sss.split(X, y):
        # NOTE(review): the split returns positional indices while X[...] on
        # a pandas Series with an integer index selects by label; this is
        # only equivalent for a default 0..n-1 index -- confirm upstream.
        X_treino, X_teste = X[indices_treino], X[indices_teste]
        y_treino, y_teste = y[indices_treino], y[indices_teste]

        # Instantiate the corpus of the training set.
        base_treino = criar_base_treino(exp, X_treino, y_treino, diretorio,
                                        stopwords)
        # Build the vocabulary.
        vocab = extrair_vocabulario(base_treino, freq_min, stopwords,
                                    op_stopwords, op_ica, op_tesauro)
        # Train the domain ("jur") models.
        w2v_jur, ftt_jur, glv_jur = treinar_modelos_jur(
            X_treino, X_teste, y_treino, y_teste, vocab, diretorio, exp,
            op_tam_vec)
        # Build representations by summing word vectors.
        bs = criar_representacoes_soma_jur(X_teste, y_teste, vocab, diretorio,
                                           w2v_jur, ftt_jur, glv_jur, exp,
                                           op_tam_vec)
        criar_representacoes_soma_ger(vocab, diretorio, w2v_geral, ftt_geral,
                                      glv_geral, exp, op_tam_vec, bs)

        # ---- Doc2Vec ----
        print('--------- Treinando doc2vec do experimento ' + str(exp) +
              ' ---------')
        # makedirs(exist_ok=True) keeps reruns of the same experiment from
        # crashing and also creates a missing 'resultados/' parent.
        os.makedirs(dir_resultados, exist_ok=True)
        corpus = dir_dados + "/base_treino_glv.txt"
        model = Doc2Vec(corpus_file=corpus,
                        vector_size=100,
                        window=5,
                        min_count=1,
                        workers=8)
        model.save(dir_dados + "/doc2vec_jur.model")
        print(
            '--------- Inferindo vetores para docs de teste do experimento ' +
            str(exp) + ' ---------')
        base_teste = pd.read_csv(arq_vetores_teste)
        # NOTE(review): x[0] is the first element of each `teores` value; if
        # the column holds plain strings this is a single character --
        # confirm the stored format before changing it.
        base_teste['doc2vec_jur'] = [
            normalize(model.infer_vector(x[0].split(' ')).reshape(1, -1))
            for x in base_teste.teores
        ]
        # Write back to the same file that is read again just below, so the
        # new 'doc2vec_jur' column is always picked up (the original wrote to
        # 'dados/experimento_<exp>/', which only matches when dir_experimento
        # follows that naming).
        base_teste.to_csv(arq_vetores_teste, index=False)

        df = pd.read_csv(arq_vetores_teste)
        print('++++++ modelos ++++++ ' + df.iloc[:, 3:].columns)

        # Every column from the fourth onwards is treated as one model's
        # vectors.
        for modelo in df.iloc[:, 3:].columns:
            # ---- Clustering ----
            print('--------- Agrupando dados para o modelo ' + modelo +
                  ' no experimento' + str(exp) + ' ---------')
            df[modelo] = df[modelo].apply(converter_string_array)
            X_kmeans = np.stack(df[modelo])
            # Drop the middle axis of the stacked array (indexes shape[2],
            # so the stack is expected to be 3-D).
            X_kmeans = X_kmeans.reshape(X_kmeans.shape[0], X_kmeans.shape[2])
            y_kmeans = le.fit_transform(df['assunto'])
            lista_scores_k = computar_scores_agrupamento(
                X_kmeans, y_kmeans, dir_experimento, modelo, lista_k)
            np.save(
                dir_resultados + '/' + modelo + '_lista_scores_k.npy',
                lista_scores_k)
            print('******   dados de agrupamento do modelo ' + modelo +
                  'salvos.')

            # ---- Similarity matrices ----
            print('--------- executando analyzer para experimento ' +
                  str(exp) + ' ---------')
            sim_m = calc_matriz_sim(df[modelo], dir_experimento)
            calcular_sim_assuntos(df['assunto'], sim_m, df[modelo].name,
                                  dir_experimento)
            plt.close()
# Purely random 80/20 train/test split.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

#pg 53
#stratified sampling

###divide median income into different categories
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
housing["income_cat"].hist()

###scikit-learn function
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional indices, so select rows with .iloc; .loc
    # would look them up as index labels and is only equivalent when
    # `housing` has a default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

###test that it worked
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

###remove the income_cat attribute to get data back to its original state
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

#pg 56
#explore the data

###make a copy of the data to play with it without changing the original set
def stratify(housing):
    """Split *housing* 80/20, stratified on its ``income_cat`` column.

    Args:
        housing: DataFrame that contains an ``income_cat`` column.

    Returns:
        A ``(strat_train_set, strat_test_set)`` tuple of DataFrames.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # Only one split is configured, so take it directly instead of looping.
    train_index, test_index = next(
        split.split(housing, housing["income_cat"]))
    # The indices are positional: .iloc is correct for any DataFrame index,
    # whereas .loc only works when the index is the default 0..n-1 range.
    return housing.iloc[train_index], housing.iloc[test_index]
Exemple #5
0
def prepareData():
    """Index the tweet corpus, split it 80/10/10 and pickle the result.

    Loads the word-vector model named by the module-level ``wiki_model_name``
    / ``wiki_model_path``, maps every tokenised tweet of the training CSV to
    a padded sequence of vocabulary indices (0 for unknown words), one-hot
    encodes ``Rating_m`` and creates stratified train / validation / test
    subsets. Everything is stored in a dict that is pickled to the
    module-level ``saved_data_filename``.

    Raises:
        FileNotFoundError: If the word-vector model file is not present in
            ``wiki_model_path``.
    """
    # Fail fast with a clear error; the original only printed a message and
    # then crashed on the unbound `model` a few lines later.
    if wiki_model_name not in os.listdir(wiki_model_path):
        raise FileNotFoundError(
            "Word2vec model not found in {}".format(wiki_model_path))
    model = gensim.models.KeyedVectors.load(
        os.path.join(wiki_model_path, wiki_model_name))

    vec_len = len(model['a'])
    print("Word2vec Vector length {}".format(vec_len))

    df = pd.read_csv(
        "/datadrive/Sahil/code/GL/fewTrails/twitter/twitter_training.csv")

    # Sequence length: longest whitespace-split sentence plus a margin of 20,
    # because word_tokenize below may produce more tokens than str.split().
    sentences_len = [len(str(s).split()) for s in df['text']]
    max_len = max(sentences_len) + 20  # 20 margin

    print("Max Sentence length {}".format(max_len))

    V_index_dict = getIndexedDict(model)
    vocab_size = len(V_index_dict)
    # NOTE(review): the returned weights are not used or stored here; the
    # call is kept in case getEmbeddings has side effects -- confirm.
    embedding_weights = getEmbeddings(vocab_size, vec_len)

    # One list of vocabulary indices per sentence; unknown words map to 0.
    data_X = [[
        V_index_dict.get(word, 0)
        for word in word_tokenize(str(sen))[:max_len]
    ] for sen in df.text]

    # pad_sequences accepts the ragged list of lists directly; wrapping it in
    # np.array() first raises on recent NumPy versions for ragged input.
    data_X = sequence.pad_sequences(data_X, maxlen=max_len)

    y = df.Rating_m
    y = to_categorical(y, num_classes=None)
    print(y)
    print("Shape of Y{}".format(y.shape))

    # First split: 80% train, 20% held out.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_index, test_index1 = next(sss.split(data_X, y))
    print("TRAIN:", train_index, "TEST:", test_index1)
    print("TRAIN:", len(train_index), "TEST:", len(test_index1))
    X_train, X_test = data_X[train_index], data_X[test_index1]
    y_train, y_test = y[train_index], y[test_index1]

    # Second split: halve the held-out part into validation and test.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
    val_index, test_index2 = next(sss.split(X_test, y_test))
    print("TRAIN:", val_index, "TEST:", test_index2)
    print("TRAIN:", len(val_index), "TEST:", len(test_index2))
    X_val, X_test = X_test[val_index], X_test[test_index2]
    y_val, y_test = y_test[val_index], y_test[test_index2]

    data = {
        "X_train": X_train,
        "X_test": X_test,
        "X_val": X_val,
        "y_train": y_train,
        "y_test": y_test,
        "y_val": y_val,
        # val/test indices are mapped back to positions in the full data set.
        "train_index": train_index,
        "test_index": test_index1[test_index2],
        "val_index": test_index1[val_index],
        "max_len": max_len,
        "vec_len": vec_len,
        "vocab_size": vocab_size,
    }

    # Context manager guarantees the file is flushed and closed.
    with open(saved_data_filename, 'wb') as out_file:
        pickle.dump(data, out_file)
Exemple #6
0
def main():
    """Build the VGG-16 extractor and run leave-one-camera-out evaluation.

    Steps:
      1. Build the VGG-16 convolutional stack (input 224x224x20) ending in
         the ``fc6`` dense layer and copy the weights stored in the HDF5
         file ``vgg_16_weights`` into it.
      2. Optionally extract and save features (``save_features``).
      3. For every camera, hold that camera out as the test set, carve a
         validation subset out of the rest, balance the classes, train a
         small binary classifier on the extracted features and evaluate it.
      4. Print per-fold metrics and the mean/std over all folds.

    All configuration (paths, hyper-parameters, flags such as
    ``use_validation`` or ``batch_norm``) is read from module-level names.
    Class 0 is treated as the positive class in the reported metrics.
    """
    # ========================================================================
    # VGG-16 ARCHITECTURE
    # ========================================================================
    model = Sequential()

    model.add(ZeroPadding2D((1, 1), input_shape=(224, 224, 20)))
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv1_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(64, (3, 3), activation='relu', name='conv1_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(128, (3, 3), activation='relu', name='conv2_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(128, (3, 3), activation='relu', name='conv2_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(256, (3, 3), activation='relu', name='conv3_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(256, (3, 3), activation='relu', name='conv3_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(256, (3, 3), activation='relu', name='conv3_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv4_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv4_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv4_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv5_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv5_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Conv2D(512, (3, 3), activation='relu', name='conv5_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(Flatten())
    model.add(
        Dense(num_features, name='fc6', kernel_initializer='glorot_uniform'))

    # ========================================================================
    # WEIGHT INITIALIZATION
    # ========================================================================
    layerscaffe = [
        'conv1_1', 'conv1_2', 'conv2_1', 'conv2_2', 'conv3_1', 'conv3_2',
        'conv3_3', 'conv4_1', 'conv4_2', 'conv4_3', 'conv5_1', 'conv5_2',
        'conv5_3', 'fc6', 'fc7', 'fc8'
    ]
    i = 0
    h5 = h5py.File(vgg_16_weights)

    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # Copy the weights stored in the 'vgg_16_weights' file to the
    # feature extractor part of the VGG16
    for layer in layerscaffe[:-3]:
        w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
        # Reorder the kernel axes and flip both spatial axes to match the
        # layout expected by the Keras Conv2D layers.
        w2 = np.transpose(np.asarray(w2), (2, 3, 1, 0))
        w2 = w2[::-1, ::-1, :, :]
        b2 = np.asarray(b2)
        layer_dict[layer].set_weights((w2, b2))

    # Copy the weights of the first fully-connected layer (fc6)
    layer = layerscaffe[-3]
    w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
    w2 = np.transpose(np.asarray(w2), (1, 0))
    b2 = np.asarray(b2)
    layer_dict[layer].set_weights((w2, b2))

    # ========================================================================
    # FEATURE EXTRACTION
    # ========================================================================
    if save_features:
        saveFeatures(model, features_file, labels_file, features_key,
                     labels_key)

    # ========================================================================
    # TRAINING
    # =======================================================================

    adam = Adam(lr=learning_rate,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-08,
                decay=0.0005)
    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    cams_x, cams_y = load_dataset()

    sensitivities = []
    specificities = []
    aucs = []
    accuracies = []

    # LEAVE-ONE-CAMERA-OUT CROSS-VALIDATION
    for cam in range(num_cameras):
        print('=' * 30)
        # Report the real number of folds instead of a hard-coded 8.
        print('LEAVE-ONE-OUT STEP {}/{}'.format(cam + 1, num_cameras))
        print('=' * 30)
        # cams_x[cam] contains all the optical flow stacks of the 'cam'
        # camera (where 'cam' is an integer from 0 to num_cameras - 1)
        test_x = cams_x[cam]
        test_y = cams_y[cam]
        train_x = cams_x[0:cam] + cams_x[cam + 1:]
        train_y = cams_y[0:cam] + cams_y[cam + 1:]
        # Flatten the per-camera lists into single arrays of samples
        train_x = np.asarray([
            train_x[i][j] for i in range(len(train_x))
            for j in range(len(train_x[i]))
        ])
        train_y = np.asarray([
            train_y[i][j] for i in range(len(train_y))
            for j in range(len(train_y[i]))
        ])

        # Create a validation subset from the training set, splitting each
        # class separately with the same proportion (val_size / 2).
        zeroes = np.asarray(np.where(train_y == 0)[0])
        ones = np.asarray(np.where(train_y == 1)[0])
        trainval_split_0 = StratifiedShuffleSplit(n_splits=1,
                                                  test_size=val_size / 2,
                                                  random_state=7)
        indices_0 = trainval_split_0.split(train_x[zeroes, ...],
                                           np.argmax(train_y[zeroes, ...], 1))
        trainval_split_1 = StratifiedShuffleSplit(n_splits=1,
                                                  test_size=val_size / 2,
                                                  random_state=7)
        indices_1 = trainval_split_1.split(train_x[ones, ...],
                                           np.argmax(train_y[ones, ...], 1))
        # next(gen) works on both Python 2 and 3; gen.next() is Python 2 only
        # and raises AttributeError on Python 3.
        train_indices_0, val_indices_0 = next(indices_0)
        train_indices_1, val_indices_1 = next(indices_1)

        _X_train = np.concatenate([
            train_x[zeroes, ...][train_indices_0, ...],
            train_x[ones, ...][train_indices_1, ...]
        ],
                                  axis=0)
        _y_train = np.concatenate([
            train_y[zeroes, ...][train_indices_0, ...],
            train_y[ones, ...][train_indices_1, ...]
        ],
                                  axis=0)
        X_val = np.concatenate([
            train_x[zeroes, ...][val_indices_0, ...],
            train_x[ones, ...][val_indices_1, ...]
        ],
                               axis=0)
        y_val = np.concatenate([
            train_y[zeroes, ...][val_indices_0, ...],
            train_y[ones, ...][val_indices_1, ...]
        ],
                               axis=0)
        y_val = np.squeeze(y_val)
        _y_train = np.squeeze(np.asarray(_y_train))

        # Balance the positive and negative samples: keep every class-0
        # sample and draw the same number of class-1 samples at random.
        all0 = np.where(_y_train == 0)[0]
        all1 = np.where(_y_train == 1)[0]

        all1 = np.random.choice(all1, len(all0), replace=False)
        allin = np.concatenate((all0.flatten(), all1.flatten()))
        X_train = np.asarray(_X_train[allin, ...])
        y_train = np.asarray(_y_train[allin])
        X_test = np.asarray(test_x)
        y_test = np.asarray(test_y)

        # ==================== CLASSIFIER ========================
        extracted_features = Input(shape=(num_features, ),
                                   dtype='float32',
                                   name='input')
        if batch_norm:
            x = BatchNormalization(axis=-1, momentum=0.99,
                                   epsilon=0.001)(extracted_features)
            x = Activation('relu')(x)
        else:
            x = ELU(alpha=1.0)(extracted_features)

        x = Dropout(0.9)(x)
        x = Dense(4096, name='fc2', init='glorot_uniform')(x)
        if batch_norm:
            x = BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001)(x)
            x = Activation('relu')(x)
        else:
            x = ELU(alpha=1.0)(x)
        x = Dropout(0.8)(x)
        x = Dense(1, name='predictions', init='glorot_uniform')(x)
        x = Activation('sigmoid')(x)

        classifier = Model(input=extracted_features,
                           output=x,
                           name='classifier')
        fold_best_model_path = best_model_path + 'multicam_fold_{}'.format(cam)
        classifier.compile(optimizer=adam,
                           loss='binary_crossentropy',
                           metrics=['accuracy'])

        if not use_checkpoint:
            # ==================== TRAINING ========================
            # weighting of each class: only the fall class gets
            # a different weight
            class_weight = {0: weight_0, 1: 1}

            callbacks = None
            if use_validation:
                # callback definition
                metric = 'val_loss'
                e = EarlyStopping(monitor=metric,
                                  min_delta=0,
                                  patience=100,
                                  mode='auto')
                c = ModelCheckpoint(fold_best_model_path,
                                    monitor=metric,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    mode='auto')
                callbacks = [e, c]
            validation_data = None
            if use_validation:
                validation_data = (X_val, y_val)
            # mini_batch_size == 0 means "use the whole training set".
            _mini_batch_size = mini_batch_size
            if mini_batch_size == 0:
                _mini_batch_size = X_train.shape[0]

            history = classifier.fit(X_train,
                                     y_train,
                                     validation_data=validation_data,
                                     batch_size=_mini_batch_size,
                                     nb_epoch=epochs,
                                     shuffle=True,
                                     class_weight=class_weight,
                                     callbacks=callbacks)

            if not use_validation:
                classifier.save(fold_best_model_path)

            plot_training_info(plots_folder + exp, ['accuracy', 'loss'],
                               save_plots, history.history)

            if use_validation and use_val_for_training:
                classifier = load_model(fold_best_model_path)

                # Use full training set (training+validation)
                X_train = np.concatenate((X_train, X_val), axis=0)
                y_train = np.concatenate((y_train, y_val), axis=0)

                history = classifier.fit(X_train,
                                         y_train,
                                         validation_data=validation_data,
                                         batch_size=_mini_batch_size,
                                         nb_epoch=epochs,
                                         shuffle='batch',
                                         class_weight=class_weight,
                                         callbacks=callbacks)

                classifier.save(fold_best_model_path)

        # ==================== EVALUATION ========================

        # Load best model
        print('Model loaded from checkpoint')
        classifier = load_model(fold_best_model_path)

        # Binarise the sigmoid outputs: below the threshold -> 0, else 1.
        predicted = classifier.predict(X_test)
        predicted = np.where(predicted < threshold, 0, 1).astype(int)

        # Compute metrics and print them. With labels=[0, 1] row/column 0 is
        # class 0, which is treated as the positive class here.
        cm = confusion_matrix(y_test, predicted, labels=[0, 1])
        tp = cm[0][0]
        fn = cm[0][1]
        fp = cm[1][0]
        tn = cm[1][1]
        tpr = tp / float(tp + fn)
        fpr = fp / float(fp + tn)
        fnr = fn / float(fn + tp)
        tnr = tn / float(tn + fp)
        precision = tp / float(tp + fp)
        recall = tp / float(tp + fn)
        specificity = tn / float(tn + fp)
        f1 = 2 * float(precision * recall) / float(precision + recall)
        accuracy = accuracy_score(y_test, predicted)
        # Separate names for the ROC arrays so the scalar tpr/fpr computed
        # above are what gets printed below (the original overwrote them).
        roc_fpr, roc_tpr, _ = roc_curve(y_test, predicted)
        roc_auc = auc(roc_fpr, roc_tpr)

        print('FOLD/CAMERA {} results:'.format(cam))
        print('TP: {}, TN: {}, FP: {}, FN: {}'.format(tp, tn, fp, fn))
        print('TPR: {}, TNR: {}, FPR: {}, FNR: {}'.format(tpr, tnr, fpr, fnr))
        print('Sensitivity/Recall: {}'.format(recall))
        print('Specificity: {}'.format(specificity))
        print('Precision: {}'.format(precision))
        print('F1-measure: {}'.format(f1))
        print('Accuracy: {}'.format(accuracy))
        print('AUC: {}'.format(roc_auc))

        # Store the metrics for this fold
        sensitivities.append(recall)
        specificities.append(specificity)
        aucs.append(roc_auc)
        accuracies.append(accuracy)

    # The stored metrics are fractions in [0, 1]; scale by 100 so the values
    # agree with the '%' sign in the report.
    print('LEAVE-ONE-OUT RESULTS ===================')
    print("Sensitivity: %.2f%% (+/- %.2f%%)" %
          (100 * np.mean(sensitivities), 100 * np.std(sensitivities)))
    print("Specificity: %.2f%% (+/- %.2f%%)" %
          (100 * np.mean(specificities), 100 * np.std(specificities)))
    print("Accuracy: %.2f%% (+/- %.2f%%)" %
          (100 * np.mean(accuracies), 100 * np.std(accuracies)))
    print("AUC: %.2f%% (+/- %.2f%%)" %
          (100 * np.mean(aucs), 100 * np.std(aucs)))
print('------------------------------------------------------------')
# Note that this is a numpy structured array as the data set contains both int and float
# http://docs.scipy.org/doc/numpy/user/basics.rec.html
data = np.loadtxt(filePath, delimiter=',', skiprows=1, dtype=dataType)
df = pd.DataFrame(data)

# DataFrame.ix was removed in pandas 1.0; these are positional selections,
# so .iloc is the direct replacement.
subj = df.iloc[:, -2]
activity = df.iloc[:, -1]
# Combine both columns into one key (100 * subj + activity) so a single
# stratified split balances both at once.
subj_activity = (100 * subj) + activity
df = pd.concat([df, subj_activity], axis=1)
# The combined Series is unnamed, so concat labels its column 0.
df.rename(columns={0: 'activity_subj'}, inplace=True)

# split data set into test and train using stratification (for both subj and activity)
# ---------------------
strat_split = StratifiedShuffleSplit(n_splits=1,
                                     train_size=0.75,
                                     test_size=0.25,
                                     random_state=2016)

# stratify based on subj_activity
for train_index, test_index in strat_split.split(df, subj_activity):
    # split() yields positional indices, hence .iloc.
    df_train, df_test = df.iloc[train_index], df.iloc[test_index]
    print('Size of data set: ', len(df))
    print('Size of training data set: ', len(train_index))
    print('Size of test data set: ', len(test_index))

print('Verifying distribution ...')
train_table = df_train.rename(index=str, columns={'subject': 'training_count'})
test_table = df_test.rename(index=str, columns={'subject': 'test_count'})
verify = pd.concat([
    train_table.ix[:, -2:].groupby('activity_subj').count(),
    test_table.ix[:, -2:].groupby('activity_subj').count()
Exemple #8
0
                                test_predictions,
                                average='weighted')
    recall = recall_score(test_labels, test_predictions, average='weighted')
    f1 = 2.0 * (precision * recall) / (precision + recall)

    print("Test Precision: %.4f" % (precision))
    print("Test Recall: %.4f" % (recall))
    print("Test f1_score: %.4f" % (f1))

    return accuracy, precision, recall, f1


# The CSV to evaluate is given as the first command-line argument.
filename = sys.argv[1]
X_data, Y_data = load_csv(filename)

# Five random stratified splits, each holding out 12.5% of the samples.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.125)
metrics = []
fold = 1
for train_indices, test_indices in sss.split(X_data, Y_data):
    train_data = X_data[train_indices]
    train_labels = Y_data[train_indices]
    test_data = X_data[test_indices]
    test_labels = Y_data[test_indices]
    # One metrics entry per fold, exactly as returned by SVM().
    fold_metrics = SVM(train_data, train_labels, test_data, test_labels)
    metrics.append(fold_metrics)
    fold = fold + 1

accuracy = 0.00
precision = 0.00
recall = 0.00
fi = 0.00
for i in metrics:
    accuracy += i[0]
    precision += i[1]
Exemple #9
0
df1['label'] = 'BENIGN'
df2 = pd.read_csv('../results/dataset_dos.csv')
df2['label'] = 'dos'
df3 = pd.read_csv('../results/dataset_hb.csv')
df3['label'] = 'heartbleed'
frames = [df1, df2, df3]

print('join datasets')
df = pd.concat(frames)

print('separate y')
X = df.drop(columns=['label'])
y = df['label'].values

print('StratifiedShuffleSplit')
sss = StratifiedShuffleSplit(n_splits=1, test_size=110000, random_state=1)
print('split')
print(sss.get_n_splits(X, y))

# Only one split is configured; its test part is the 110000-row stratified
# sample. Select all rows in one positional lookup instead of copying them
# one by one into a variable that shadowed the builtin `list`.
train_index, test_index = next(sss.split(X, y))
dts = df.iloc[test_index].reset_index(drop=True)
# Drop the address columns from the SAMPLE; the original dropped them from
# `df`, which discarded the sample and saved the full dataset.
dts = dts.drop(columns=['ipsrc', 'ipdst'])

print('saving')
dts.to_csv("../results/dataset_110000.csv",
           sep=',',
           encoding='utf-8',
Exemple #10
0
def train_age(kfold, batchsize, lr_age, lr_gender, num_epochs, p_augment,
              device, num_age_classes, num_gender_classes, test_fold,
              train_fold, random_seed):
    """Train and evaluate one age classifier per fold.

    For every fold a stratified train/validation split (stratified on the
    age label) is taken from that fold's training data, an
    ``InceptionResnetV1`` classifier is trained for ``num_epochs`` epochs,
    the weights with the best validation accuracy are written to
    ``models/age_model<fold>.pth`` and reloaded to score the fold's test
    data. Per-epoch and per-fold statistics are printed.

    Args:
        kfold: Number of folds to iterate over.
        batchsize: Batch size of all data loaders.
        lr_age: Learning rate of the AdamW optimizer.
        lr_gender: Not used by this function.
        num_epochs: Training epochs per fold.
        p_augment: Forwarded to the training ``AgeDataset``.
        device: Device the model and batches are moved to.
        num_age_classes: Number of output classes of the model.
        num_gender_classes: Not used by this function.
        test_fold: Per-fold containers with ``image_path``, ``age`` and
            ``gender`` columns (test data).
        train_fold: Same layout as ``test_fold`` (training data).
        random_seed: ``random_state`` of the stratified split.

    Returns:
        None. Results are printed and checkpoints are written to disk.
    """
    all_accuracy_age = []
    all_val_loss_age = []
    all_stat_fold = []

    for fold in range(kfold):
        all_stat = defaultdict(list)
        checkpoint_path = f'models/age_model{fold}.pth'

        # image paths
        train_data = train_fold[fold]['image_path'].to_list()
        test_data = test_fold[fold]['image_path'].to_list()

        # labels
        train_age_label = train_fold[fold]['age'].to_list()
        train_gender_label = train_fold[fold]['gender'].to_list()
        test_age_label = test_fold[fold]['age'].to_list()
        test_gender_label = test_fold[fold]['gender'].to_list()

        # Train/validation stratified split. Only one split is used, so
        # request a single one instead of materialising ten.
        sss = StratifiedShuffleSplit(n_splits=1, random_state=random_seed)

        # split based on age, more balanced for both age and gender
        train_idx, val_idx = next(sss.split(train_data, train_age_label))

        train_data_arr = np.array(train_data)
        train_age_arr = np.array(train_age_label)
        train_gender_arr = np.array(train_gender_label)

        # create datasets and dataloaders for age
        train_dataset = AgeDataset('',
                                   list(train_data_arr[train_idx]),
                                   list(train_age_arr[train_idx]),
                                   list(train_gender_arr[train_idx]),
                                   p_augment=p_augment)
        val_dataset = AgeDataset('',
                                 list(train_data_arr[val_idx]),
                                 list(train_age_arr[val_idx]),
                                 list(train_gender_arr[val_idx]),
                                 validation=True)
        test_dataset = AgeDataset('',
                                  test_data,
                                  test_age_label,
                                  test_gender_label,
                                  validation=True)

        train_loader = DataLoader(train_dataset,
                                  batch_size=batchsize,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=batchsize,
                                shuffle=False)
        test_loader = DataLoader(test_dataset,
                                 batch_size=batchsize,
                                 shuffle=False)

        val_age_label = list(train_age_arr[val_idx])

        model = InceptionResnetV1(classify=True,
                                  pretrained='vggface2',
                                  num_classes=num_age_classes)
        model = model.to(device)

        # optimizer; the learning rate is decayed at epochs 5 and 10
        optimizer = optim.AdamW(model.parameters(), lr=lr_age)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [5, 10])

        # loss
        criterion = nn.CrossEntropyLoss()

        best_acc_age = 0
        best_val_loss_age = 999

        print(f'Fold {fold+1}\n')
        for epoch in range(num_epochs):
            print(f'epoch: {epoch}\n')
            train_loss_age = 0
            val_loss_age = 0

            # Training
            model.train()
            n_batches = len(train_loader)
            for batch_idx, batch in enumerate(train_loader):
                print(f'batch_num: {100*(batch_idx/n_batches)}%\n')

                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)

                # Clear gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(True):
                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)

                    train_loss_age += loss_age.detach().item()
                    loss_age.backward()
                    optimizer.step()

            # Validation
            model.eval()
            all_pred_age = torch.empty(0).to(device)
            with torch.no_grad():
                for batch in val_loader:
                    # Load image batch
                    batch_data, batch_age_label = batch
                    batch_data = batch_data.to(device)
                    batch_age_label = batch_age_label.to(device)

                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)
                    val_loss_age += loss_age.item()
                    all_pred_age = torch.cat(
                        (all_pred_age,
                         nn.functional.softmax(pred_age, dim=1)), 0)

            train_loss_age /= len(train_loader)
            val_loss_age /= len(val_loader)

            all_pred_age = all_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(all_pred_age, axis=1))

            acc_age = accuracy_score(val_age_label, pred_label_age)
            # Always checkpoint the first epoch: otherwise a run whose
            # validation accuracy never exceeds 0 writes no file and the
            # inference step below fails or loads a stale checkpoint.
            if epoch == 0 or acc_age > best_acc_age:
                best_acc_age = acc_age
                best_val_loss_age = val_loss_age
                torch.save(model.state_dict(), checkpoint_path)

            all_stat['train_loss'].append(train_loss_age)
            all_stat['val_loss'].append(val_loss_age)
            all_stat['val_acc'].append(acc_age)

            print(
                f'Epoch {epoch} | train loss: {train_loss_age} | val loss: {val_loss_age} | accuracy: {round(acc_age*100, 2)}%'
            )
            scheduler.step()

        # INFERENCE on the fold's test data with the best checkpoint
        with torch.no_grad():
            # map_location lets a checkpoint saved on one device be restored
            # on whatever `device` this run uses.
            model.load_state_dict(
                torch.load(checkpoint_path, map_location=device))
            model.eval()
            test_pred_age = torch.empty(0).to(device)
            for batch in test_loader:
                # Only the images are needed; labels come from
                # test_age_label below.
                batch_data, _ = batch
                batch_data = batch_data.to(device)

                pred_age = model(batch_data)
                test_pred_age = torch.cat(
                    (test_pred_age,
                     nn.functional.softmax(pred_age, dim=1)), 0)

            test_pred_age = test_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(test_pred_age, axis=1))

            acc_age = accuracy_score(test_age_label, pred_label_age)
            class_labels = list(range(num_age_classes))
            all_stat['test_acc'].append(acc_age)
            all_stat['conf'].append(
                confusion_matrix(test_age_label,
                                 pred_label_age,
                                 labels=class_labels))
            all_stat['conf_norm'].append(
                confusion_matrix(test_age_label,
                                 pred_label_age,
                                 normalize='true',
                                 labels=class_labels))
            all_stat['test_pred'].append(pred_label_age)
            all_stat['test_target'].append(test_age_label)

        all_accuracy_age.append(acc_age)
        all_val_loss_age.append(best_val_loss_age)
        print(
            f'TEST ACCURACY: {round(acc_age*100,2)}% | Val. Accuracy: {round(best_acc_age*100,2)}% | Val. Loss.: {best_val_loss_age}\n'
        )

        all_stat_fold.append(all_stat)

    all_accuracy_age = np.array(all_accuracy_age)
    all_val_loss_age = np.array(all_val_loss_age)

    mean_accuracy_age = round(all_accuracy_age.mean() * 100, 2)

    print(f'\nOverall Accuracy: {mean_accuracy_age} p/m')
Exemple #11
0
def processtarget(inp):
	"""Assemble the data set for one target and benchmark three forest models.

	Actives/inactives are derived from the target's bioactivity table, padded
	with confirmed inactives (read from per-gene zip archives) and, if still
	short, with background compounds from ``getfp``. A balanced random forest
	classifier, a ``prf`` model and a random forest regressor are then fitted
	on stratified shuffle splits, once per emulated experimental-error level.

	Args:
		inp: pair ``(uniprot, infile)``. ``infile`` must support
			``groupby('smiles')`` and provide ``smiles`` and ``pchembl_value``
			columns (presumably a pandas DataFrame -- confirm against caller).

	Returns:
		``None`` when the target is skipped (``processfile`` raises
		``TypeError``, fewer than 100 compounds, fewer than 100 actives, or a
		``ValueError`` while splitting/fitting). Otherwise a 3-tuple of
		``[uniprot, nact, ninact, nse, nscaf]``,
		``[y_binary, y_lab_raw, y_lab, base_predicted1, base_predicted2,
		base_predicted3]`` and ``per_fold``.
	"""
	global thresh
	activity_threshold = thresh
	# index -> emulated standard deviation of the experimental error
	sdict = {idx:i for idx, i in enumerate([round(float(i),2) for i in np.arange(0,.9,0.1)])}
	uniprot, infile = inp
	try:
		# replicate measurements of the same compound are averaged first
		matrix, active_scaf, pactivity = processfile(
			infile.groupby('smiles').mean().reset_index()[['smiles','pchembl_value']].values,
			file=True)
	except TypeError:
		return
	if len(matrix) < 100: return
	vector = [1 if x >= activity_threshold else 0 for x in pactivity]
	sfvector = []
	#set up cdf for bioactivity scale
	for standard_deviation_threshold in sorted(sdict.values()):
		if standard_deviation_threshold == 0.0:
			# no emulated error: use the hard binary labels
			sfvector.append(vector)
		else:
			reweighted = convertPvalue(pactivity,activity_threshold,standard_deviation_threshold)
			sfvector.append(reweighted)
	#process the inactive set
	if sum(vector) < 100: return
	print(uniprot)
	nact = sum(vector)
	ninact = len(vector)-sum(vector)
	conf_smiles = []
	egids = uniprot_egid.get(uniprot)
	if egids is not None:
		for egid in egids:
			try:
				with zipfile.ZipFile(path_to_pidgin_inactives + egid + '.smi.zip') as z:
					with z.open(egid + '.smi') as fh:
						conf_smiles += [i.split(' ')[0] for i in fh.read().decode('UTF-8').splitlines()]
			except Exception:
				# best effort: a gene without a readable archive simply
				# contributes no confirmed inactives
				pass
	# number of additional inactives wanted: twice the actives, clamped to
	# [1000, 2000], minus the inactives already present
	req = nact * 2
	if req < 1000: req = 1000
	if req > 2000: req = 2000
	req -= ninact
	if req < 0: req = 0
	conf_inactives, inactive_scaf = [], []
	#sample inactives if necessary
	if len(conf_smiles) > 0:
		random.seed(2)
		random.shuffle(conf_smiles)
		try:
			random.seed(2)
			conf_inactives,inactive_scaf = calcFingerprints_array(random.sample(conf_smiles,req))
		except ValueError:
			# fewer confirmed inactives than requested: take all of them
			conf_inactives,inactive_scaf = calcFingerprints_array(conf_smiles)
	conf_smiles = []
	vector2 = []
	for i in conf_inactives:
		if req > 0:
			matrix.append(i)
			vector2.append(0)
			req-=1
	conf_inactives = None
	ninact += len(vector2)
	nse = 0
	if req > 0:
		# still short of inactives: pad with background compounds
		vector2 += [0] * req
		random_bg, random_scaf = getfp(req)
		nse = len(random_bg)
		matrix += random_bg
		inactive_scaf += random_scaf
		# released here, inside the branch: these names do not exist when
		# req == 0, so an unconditional ``del`` would raise UnboundLocalError
		del random_bg, random_scaf
	all_scafs = active_scaf+inactive_scaf
	del active_scaf, inactive_scaf
	# map every distinct scaffold to an integer group id
	scaf_dict = {s: idx for idx, s in enumerate(set(all_scafs))}
	all_scafs = [scaf_dict[sca] for sca in all_scafs]
	nscaf = len(scaf_dict)
	vector += vector2
	pactivity = np.array(pactivity + [0] * len(vector2), dtype=np.float32)
	sfvector = [s+vector2 for s in sfvector]
	vector2 = None
	matrix = np.array(matrix, dtype=np.uint8)
	vector = np.array(vector, dtype=np.uint8)
	sfvector = [np.array(s) for s in sfvector]
	skf = StratifiedShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
	lso = GroupShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
	base_predicted1, base_predicted2, base_predicted3 = [], [], []
	y_lab, y_lab_raw, y_binary = [], [], []
	per_fold=[]
	try:
		#remove '[:1]' to enable scaffold splitting
		for split_method, split_name in [(skf,0),(lso,1)][:1]:
			#for each splitting method, perform the evaluation
			for train, test in split_method.split(matrix,vector,groups=all_scafs):
				x, y, X_test,Y_binary, Y_raw = matrix[train], vector[train], matrix[test], vector[test], pactivity[test]
				# keyword arguments: positional classes/y are rejected by current scikit-learn
				class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
				sw = np.array([class_weights[1] if i == 1 else class_weights[0] for i in y])
				rfc = RandomForestClassifier(n_jobs = 1, n_estimators=200, class_weight='balanced', random_state=2)
				# the binary-label classifier is fitted once per split
				brfc=sklearn.base.clone(rfc)
				brfc.fit(x,y,sample_weight=sw)
				#for each emulated experimental error, generate predictions
				for sidx,ystrain in enumerate(sfvector):
					sw2 = ystrain[train]
					# two-column label-probability matrix: [P(inactive), P(active)]
					py=np.zeros([len(sw2),2])
					py[:,1] = sw2
					py[:,0] = 1-py[:,1]
					prfc = prf(n_estimators=200, bootstrap=True, keep_proba=0.05)
					prfc.fit(X=x.astype(float), py=py.astype(float))
					rfr = RandomForestRegressor(n_jobs = 1, n_estimators=200, random_state=2)
					rfr.fit(x,sw2)
					p_prfc = [round(pr,3) for pr in list(np.array(prfc.predict_proba(X=X_test.astype(float)))[:,1])]
					p_brfc = [round(pr,3) for pr in list(brfc.predict_proba(X_test)[:,1])]
					p_rfr = [round(pr,3) for pr in list(np.array(rfr.predict(X_test)))]
					# every training error level is scored against every test error level
					for sidx2, ystest in enumerate(sfvector):
						y_test=list(ystest[test])
						#balanced random forest classifier output
						base_predicted1 += p_brfc
						#prf model output
						base_predicted2 += p_prfc
						#random forest regressor output
						base_predicted3 += p_rfr
						y_lab_raw += list(Y_raw)
						y_lab += list(y_test)
						y_binary += list(Y_binary)
						per_fold.append([len(y_test),[split_name,sdict[sidx],sdict[sidx2]]])
	except ValueError: return
	return [uniprot,nact,ninact,nse,nscaf], [y_binary,y_lab_raw,y_lab,base_predicted1,base_predicted2,base_predicted3], per_fold
model_results = []

# Number of random train/test resplits to evaluate.
iterations = 100

model = RandomForestClassifier(n_jobs=-1,
                               random_state=55,
                               min_samples_split=20,
                               n_estimators=500,
                               # 'sqrt' is what the removed 'auto' alias
                               # meant for classifiers
                               max_features='sqrt',
                               min_samples_leaf=20,
                               # a real boolean: the string 'TRUE' only
                               # worked by being truthy
                               oob_score=True)
modelname = 'RF'

#  Make 'iterations' index vectors for the train-test split
#  (random_state=None: the splits differ from run to run)
sss = StratifiedShuffleSplit(n_splits=iterations,
                             test_size=0.33,
                             random_state=None)

# Per-split scores, in-sample (is) and out-of-sample (oos).
accuracy_scores_is = []
accuracy_scores_oos = []
precision_scores_is = []
precision_scores_oos = []
recall_scores_is = []
recall_scores_oos = []
f1_scores_is = []
f1_scores_oos = []

#  Initialize the 2x2 confusion-matrix accumulators
cm_sum_is = np.zeros((2, 2))
cm_sum_oos = np.zeros((2, 2))
    return img_data, np.array(_2d_images)


# Encode the raw frames and keep the training features as a plain array.
train, labels, test, classes = encode(train, test)
train = train.values

img_data, _2d_images = load_image_data()

# Stratified 80/20 train/validation split. n_splits is left at the library
# default, so the splitter yields several splits; only the final one is kept.
sss = StratifiedShuffleSplit(test_size=0.2, random_state=23)
*_, (train_index, valid_index) = sss.split(train, labels)

X_train, y_train = train[train_index], labels[train_index]
X_valid, y_valid = train[valid_index], labels[valid_index]
X_train_img = img_data[train_index]
X_valid_img = img_data[valid_index]
X_train_2dimg = _2d_images[train_index]
X_valid_2dimg = _2d_images[valid_index]

X_test = test.values

print("Done")

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
Exemple #14
0
array([ 0.938...,  0.963...,  0.944...])
"""
# Debug switch: any truthy value prints the loaded feature matrix.
testing = 1

# The CSV is resolved relative to the current working directory.
fileDir = os.path.join(os.getcwd(), 'MicroMaster', 'AI', 'week7ML',
                       'input3.csv')
input_data = np.genfromtxt(fileDir, delimiter=',', skip_header=1)
# Columns 0-1 are the features, column 2 is the label.
X, y = input_data[:, :2], input_data[:, 2]
if testing:
    print(X)

# Cross-validation scheme: 5 stratified shuffles, 40% held out in each.
test_size = 0.4
random_state = 0
n_splits = 5
cv = StratifiedShuffleSplit(random_state=random_state,
                            test_size=test_size,
                            n_splits=n_splits)

# SVM with Linear Kernel
# https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel
# https://stats.stackexchange.com/questions/73032/linear-kernel-and-non-linear-kernel-for-support-vector-machine
"""kernel : string, optional (default=’rbf’)

Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples)."""

kernel = 'linear'
# Candidate regularisation strengths for the grid search.
C = [0.1, 0.5, 1, 5, 10, 50, 100]
param_grid = {'C': C}
grid = GridSearchCV(SVC(kernel=kernel), cv=cv, param_grid=param_grid)

grid.fit(X, y)
Exemple #15
0
# Standardise both feature sets; the scaler is re-fitted for the 2-D data.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
# model_selection splitters take ``n_splits``; ``n_iter`` belonged to the
# removed sklearn.cross_validation API and raises TypeError here.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

# Now we need to fit a classifier for all parameters in the 2d version
# (we use a smaller set of parameters here because it takes a while to train)

C_2d_range = [1e-2, 1, 1e2]
gamma_2d_range = [1e-1, 1, 1e1]
classifiers = []
for C in C_2d_range:
    for gamma in gamma_2d_range:
        clf = SVC(C=C, gamma=gamma)
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` on stratified shuffle splits and print pooled metrics.

    The confusion counts are accumulated over all splits, then accuracy,
    precision, recall, F1 and F2 are computed from the pooled counts and
    printed. Nothing is returned.

    Args:
        clf: estimator exposing ``fit(features, labels)`` and
            ``predict(features)``; it is refitted on every split.
        dataset: passed to ``featureFormat`` together with ``feature_list``.
        feature_list: passed to ``featureFormat``.
        folds: number of shuffle splits to evaluate.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # stops scoring this split only; later splits still run
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only the arithmetic above is guarded, so unrelated failures are no
        # longer misreported as a divide by zero.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
    else:
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
Exemple #17
0
print(y1)
# Hold out 20% of the samples for testing (4:1 train/test ratio).
X_train, X_test, y_train, y_test = train_test_split(
    select_X, y1, random_state=0, test_size=0.2)

# Report the shape of each partition.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Grid search over the AdaBoost algorithm variant, scored on 5 stratified
# shuffle splits of the training data.
param_dist = {'algorithm': ["SAMME", "SAMME.R"]}

cv = StratifiedShuffleSplit(random_state=0, test_size=0.2, n_splits=5)
clf = GridSearchCV(adaBoost(), cv=cv, param_grid=param_dist)
clf = clf.fit(X_train, y_train.values.ravel())

print("Best estimator found by grid search:")
print(clf.best_estimator_)
# Score the refitted best estimator on the held-out test data.
print('the acuracy with featureboost is:')
print(clf.score(X_test, y_test.values.ravel()))

prediction = clf.predict(X_test)
# Summarise the test-set predictions with sklearn.metrics.
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, prediction))
print("Classification report:\n %s\n" %
      metrics.classification_report(y_test, prediction))
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
            avgP, avgR, storage, avgTPrep, avgTPred)


# Script entry point: prepares the output directory, the cross-validation
# splitter and the CSV header for the distance-parameter experiment.
if __name__ == "__main__":
    projectBasePath = "dataset"
    projectName = "pinto-ds"
    outDir = "results/"
    os.makedirs(outDir, exist_ok=True)

    # 30 stratified shuffle splits with 20% held out; no random_state is
    # passed, so the splits are not reproducible between runs.
    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    # DISTANCE
    # Opening in "w" mode truncates any previous results file; only the
    # header row is written here.
    outFile = "params-distance.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write(
            "distance,k,sigma,eps,precision,recall,storage,preparationTime,predictionTime\n"
        )

    # Default parameter values; ``k`` is rebound by the inner loop below.
    k = 7
    sigma = 0.5
    dim = 0  # number of dimensions (0: JL with error eps)
    eps = 0.3  # JL eps
    params = {"algorithm": "brute", "metric": "cosine", "weights": "uniform"}
    # Iterate over every (metric, k) combination.
    for metric in ["cosine", "euclidean"]:
        for k in [3, 7]:
def get_cv(X, y):
    """Return an iterator of (train, test) index pairs for ``X`` and ``y``.

    Uses three stratified shuffle splits with 20% of the samples held out
    and a fixed seed (57).
    """
    splitter = StratifiedShuffleSplit(random_state=57,
                                      test_size=0.2,
                                      n_splits=3)
    return splitter.split(X, y)
Exemple #20
0
    df = test_data.fillna(np.mean(train_data['Age']))
    scaled_data = scaler.transform(df[['Age', 'Fare']])
    df[['Age', 'Fare']] = scaled_data
    for var in categorical:
        df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], axis=1)
        del df[var]
    testdf = df
    test_data = df.to_numpy()
    train_labels = train_data_dropped[:, 0]
    train_data_dropped = train_data_dropped[:, 1:]

    ### Running classification
    acc, val_acc, loss, val_loss = [], [], [], []
    ## Running k-folds classification to improve generalization and reduce overfitting
    K = StratifiedShuffleSplit(10, train_size=0.6)
    for train_index, test_index in K.split(train_data_dropped, train_labels):
        x_train, y_train = train_data_dropped[train_index], train_labels[
            train_index]
        x_valid, y_valid = train_data_dropped[test_index], train_labels[
            test_index]
        # ##Only need to balance the training data, not the validation data.
        # x_train, x_valid, y_train, y_valid = train_test_split(train_data_dropped,
        #                                                       train_labels, test_size=0.2,
        #                                                       shuffle= True)

        y_train = pd.get_dummies(y_train).to_numpy()
        y_valid = pd.get_dummies(y_valid).to_numpy()
        history = model.fit(
            x_train,
            y_train,
Exemple #21
0
# Feature matrix: every selected column except column 1, which is the target.
x = train.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8]].values
y = train.iloc[:, 1].values

# Candidate models to compare.
classifiers = [
    KNeighborsClassifier(4),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LogisticRegression()
]

# 10 stratified shuffle splits with 10% of the rows held out in each.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

chart = pd.DataFrame(columns=["Classifier", "Accuracy"])

# Classifier name -> accuracy bookkeeping.
acc_dict = {}

# NOTE(review): this loop only reassigns the split variables; the classifier
# loop below is not nested inside it, so every model is evaluated on the last
# of the 10 splits only -- confirm whether nesting was intended.
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Fit each classifier and score it on the held-out rows.
for clf in classifiers:
    name = clf.__class__.__name__
    clf.fit(x_train, y_train)
    train_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, train_predictions)
    if name in acc_dict:
Exemple #22
0
    def train(self, datasets):
        ''' Initialize, train and predict a classifier.
        This includes: Feature engineering (i.e. PCA) and
        selection, training clf, (hyper)parameter optimization,
        and a prediction on the test set. Make sure to save
        all variables you want to keep track of in the instance.

        Input:
            datasets:: dict
                Contains train and test x, y under the keys
                'train_x', 'test_x', 'train_y' and 'test_y'

        Output (5-tuple):
            clf:: fitted pipeline
                The best estimator of the randomized search when
                model_args['grid_search'] is set, otherwise the
                plainly fitted pipeline
            datasets:: dict
                Dictionary containing the UPDATED train and test
                sets: x is replaced by the output of every pipeline
                step except the last, with the original row index
            test_y_hat:: array
                Result of clf.predict_proba on the test set
            shap_values::
                SHAP values of the 'XGB' step for the transformed
                test set
            test_x:: DataFrame
                The transformed test set (same object as
                datasets['test_x'])
        '''

        train_x = datasets['train_x']
        test_x = datasets['test_x']
        train_y = datasets['train_y']
        test_y = datasets['test_y']

        # Keep track of the data set sizes seen by this call
        self.learn_size += [{
            'tr_x': train_x.shape,
            'tr_y': train_y.shape,
            'te_x': test_x.shape,
            'te_y': test_y.shape
        }]

        train_x = self.impute_missing_values(train_x)
        test_x = self.impute_missing_values(test_x)

        # Define pipeline
        self.pipeline = self.get_pipeline()

        # Model and feature selection
        # TODO ideally also the feature selection would take place within a CV pipeline

        if self.model_args['grid_search']:
            cv = StratifiedShuffleSplit(n_splits=5,
                                        test_size=0.2,
                                        random_state=self.random_state)
            grid = RandomizedSearchCV(self.pipeline,
                                      param_distributions=self.grid,
                                      cv=cv,
                                      scoring='roc_auc',
                                      n_jobs=-2,
                                      n_iter=50)

            grid.fit(train_x, train_y)
            clf = grid.best_estimator_
            self.trained_classifiers += [clf]
        else:
            # Train classifier without optimization.
            clf = self.pipeline
            clf.fit(train_x, train_y)

        self.coefs.append(clf['XGB'].feature_importances_)

        test_y_hat = clf.predict_proba(test_x)  # Predict

        if 'feature_selection' in clf.named_steps:
            # Keep the n_features columns with the smallest p-values
            idx_sorted = np.argsort(clf['feature_selection'].pvalues_)
            f_values = clf['feature_selection'].scores_
            p_values = clf['feature_selection'].pvalues_
            columns = train_x.columns[
                idx_sorted[0:self.model_args['n_features']]].to_list()
            self.n_best_features += [[columns, f_values, p_values]]
            print(columns)

        else:
            columns = train_x.columns

        # Remember the row labels; the transform below discards them
        idx_train = train_x.index
        idx_test = test_x.index

        if self.model_args['add_missing_indicator']:
            # ``columns`` is a plain list after feature selection and a
            # pandas Index otherwise; list() handles both, whereas
            # .to_list() fails on the list.
            missing_cols = list(columns)\
                              + ['{}_nan'.format(c)
                                 for c in train_x.loc[:, train_x.isna().any()]]

        # Apply every pipeline step except the final estimator
        train_x = pd.DataFrame(clf[:-1].transform(train_x))
        test_x = pd.DataFrame(clf[:-1].transform(test_x))

        if self.model_args['add_missing_indicator']:
            train_x.columns = missing_cols
            test_x.columns = missing_cols
        else:
            train_x.columns = columns
            test_x.columns = columns

        train_x.index = idx_train
        test_x.index = idx_test

        datasets = {
            "train_x": train_x,
            "test_x": test_x,
            "train_y": train_y,
            "test_y": test_y
        }

        explainer = shap.TreeExplainer(clf['XGB'])
        shap_values = explainer.shap_values(test_x)

        return clf, datasets, test_y_hat, shap_values, test_x