Esempio n. 1
0
 def test(self, test):
     """Evaluate the fitted model on a held-out ``(features, labels)`` pair.

     With more than two classes the labels are binarized against
     ``self.classes`` and the raw model output is used as the probability
     matrix. With exactly two classes each scalar prediction ``p`` is
     expanded to the row ``[1 - p, p]``.

     Args:
         test: Two-item sequence ``(x, y)``.

     Returns:
         Tuple ``(preds, stats)``: the arg-max over the last axis of the
         probability matrix, and the result of ``get_stat_dict``.
     """
     features, labels = test
     if self.num_class > 2:
         labels = label_binarize(labels, classes=self.classes)

     raw_output = self.model.predict(features)
     if self.num_class == 2:
         # Expand the single score into complementary two-column rows.
         probs = np.array([[1 - score, score] for score in raw_output])
     else:
         probs = raw_output

     preds = np.argmax(probs, axis=-1)
     stats = get_stat_dict(labels, probs)
     return preds, stats
 def test(self, test):
     """Evaluate the fitted model on a held-out ``(test_x, test_y)`` pair.

     ``test_y`` is reduced with ``argmax`` over axis 1 before scoring, so it
     must be two-dimensional (presumably one column per class -- confirm
     against the caller).

     Args:
         test: Two-item sequence ``(test_x, test_y)``.

     Returns:
         Tuple ``(stats, preds)``. Note the order: the statistics from
         ``get_stat_dict`` come first, the arg-max predictions second.
     """
     test_x, test_y = test
     probs = self.model.predict(test_x)
     preds = np.argmax(probs, axis=1)
     stats = get_stat_dict(np.argmax(test_y, axis=1), probs, preds)
     return stats, preds
Esempio n. 3
0
 def test(self, test):
     """Evaluate the fitted model on the columns in ``self.feature_list``.

     Args:
         test: Two-item sequence ``(x, y)``; ``x`` is indexed as
             ``x[:, self.feature_list]`` before prediction.

     Returns:
         Tuple ``(preds, stat)``: the arg-max over the last axis of the
         ``predict_proba`` output, and the result of ``get_stat_dict``.
     """
     features, labels = test
     selected = features[:, self.feature_list]
     probs = np.array(self.model.predict_proba(selected))
     preds = np.argmax(probs, axis=-1)
     stat = get_stat_dict(labels, probs)
     return preds, stat
Esempio n. 4
0
    def test(self, test):
        """Evaluate the fitted model on a held-out ``(test_x, test_y)`` pair.

        The features receive a new trailing axis and then a new axis at
        position 1 before being passed to ``self.model.predict``.

        Args:
            test: Two-item sequence ``(test_x, test_y)``.

        Returns:
            Tuple ``(preds, stats)``: the raw ``predict`` output (no arg-max
            is applied) and the result of ``get_stat_dict``.
        """
        features, labels = test
        # Inner call appends the last axis; outer call inserts axis 1.
        features = np.expand_dims(np.expand_dims(features, -1), 1)

        outputs = self.model.predict(features)
        return outputs, get_stat_dict(labels, outputs)
Esempio n. 5
0
    def test(self, test):
        """Evaluate the fitted model on a held-out ``(x, y)`` pair.

        With more than two classes the labels are binarized against
        ``self.classes`` before scoring.

        Args:
            test: Two-item sequence ``(x, y)``.

        Returns:
            Tuple ``(preds, stat)``: the arg-max over the last axis of the
            ``predict_proba`` output, and the result of ``get_stat_dict``.
        """
        features, labels = test
        if self.num_class > 2:
            labels = label_binarize(labels, classes=self.classes)

        probs = np.array(self.model.predict_proba(features))
        preds = np.argmax(probs, axis=-1)
        stat = get_stat_dict(labels, probs)
        return preds, stat
def train(train, test, config, metric, seed=42):
    """Fit a cross-validated LASSO model and score it on the test split.

    With more than two classes the labels are binarized and a
    ``OneVsRestClassifier`` wrapping ``LassoCV`` is fitted; otherwise a single
    ``LassoCV`` is fitted on the raw labels.

    Args:
        train: ``(train_x, train_y)``.
        test: ``(test_x, test_y)``.
        config: Object whose ``get(section, option)`` provides the ``LASSO``
            options ``NumberIterations`` and ``GridCV``.
        metric: Unused; kept for call compatibility.
        seed: Unused; kept for call compatibility.

    Returns:
        Tuple ``(clf, test_stat_dict, tpr, fpr, thresh, weights, test_probs)``.
        In the two-class case ``test_probs`` has rows ``[1 - p, p]`` built from
        the scalar predictions, the ROC values come from ``roc_curve`` and
        ``weights`` is ``clf.coef_``. Otherwise ``test_probs`` is the raw
        prediction and ``tpr``, ``fpr``, ``thresh`` and ``weights`` are
        ``None``.
    """
    n_iter = int(config.get('LASSO', 'NumberIterations'))
    num_cv = int(config.get('LASSO', 'GridCV'))

    train_x, train_y = train
    test_x, test_y = test
    cl = np.unique(train_y)
    num_class = len(cl)

    lasso = LassoCV(alphas=np.logspace(-4, -0.5, 50),
                    cv=num_cv,
                    n_jobs=-1,
                    max_iter=n_iter)

    if num_class > 2:
        train_y = label_binarize(train_y, classes=cl)
        test_y = label_binarize(test_y, classes=cl)
        clf = OneVsRestClassifier(lasso)
    else:
        clf = lasso
    clf.fit(train_x, train_y)

    # Predict once; both branches below derive everything from this output.
    raw_output = clf.predict(test_x)

    if num_class == 2:
        test_probs = np.array([[1 - row, row] for row in raw_output])
        test_stat_dict = get_stat_dict(test_y, test_probs)
        fpr, tpr, thresh = roc_curve(test_y, test_probs[:, 1])
        weights = clf.coef_
    else:
        test_probs = raw_output
        test_stat_dict = get_stat_dict(test_y, raw_output)
        fpr, tpr, thresh = None, None, None
        weights = None

    return clf, test_stat_dict, tpr, fpr, thresh, weights, test_probs
def train(train, test, config, metric, seed=42, regularization=True):
    """Fit a logistic-regression classifier and score it on the test split.

    Args:
        train: ``(train_x, train_y)``.
        test: ``(test_x, test_y)``.
        config: Object whose ``get(section, option)`` provides the
            ``Logistic Regression`` options ``NumberIterations`` and
            ``GridCV``.
        metric: Unused; kept for call compatibility.
        seed: Random state for the stochastic ``saga`` solver.
        regularization: When true, ``C`` is grid-searched with an L1 penalty
            and the best estimator is kept. When false, a single
            ``LogisticRegression`` is fitted with the library's default
            penalty settings.

    Returns:
        Tuple ``(clf, test_stat_dict, tpr, fpr, thresh, weights, test_probs)``.
        ``weights`` is ``clf.coef_`` as an array. The ROC values come from
        ``roc_curve`` when the training labels have exactly two distinct
        values and are ``0, 0, 0`` otherwise.
    """
    n_iter = int(config.get('Logistic Regression', 'NumberIterations'))
    num_cv = int(config.get('Logistic Regression', 'GridCV'))

    train_x, train_y = train
    test_x, test_y = test
    num_class = len(np.unique(train_y))

    # The configured iteration budget and the seed were previously read or
    # accepted but never handed to the estimator.
    base = LogisticRegression(solver="saga",
                              max_iter=n_iter,
                              random_state=seed)

    if regularization:
        grid = {"C": np.logspace(-3, 3, 50), "penalty": ["l1"]}
        search = GridSearchCV(base,
                              grid,
                              cv=StratifiedKFold(num_cv).split(
                                  train_x, train_y))
        search.fit(train_x, train_y)
        clf = search.best_estimator_
    else:
        # NOTE(review): this path still uses LogisticRegression's default
        # penalty, so it is presumably not truly unregularized -- confirm.
        clf = base
        clf.fit(train_x, train_y)

    test_probs = clf.predict_proba(test_x)
    # Column indices of the highest probability, not mapped through classes_.
    test_preds = np.argmax(test_probs, axis=1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)
    weights = np.array(clf.coef_)

    if num_class == 2:
        fpr, tpr, thresh = roc_curve(test_y, test_probs[:, 1])
    else:
        fpr, tpr, thresh = 0, 0, 0

    return clf, test_stat_dict, tpr, fpr, thresh, weights, test_probs
Esempio n. 8
0
def tune_mlpnn(train, test, config, train_weights=None):
    """Pick MLP hyper-parameters with a two-stage grid search.

    Stage one varies the dropout rate and L2 strength while depth and width
    stay at their starting values (2 layers of 128 nodes). Stage two varies
    depth and width using the dropout/L2 pair chosen in stage one. A
    candidate replaces the incumbent only when its ``"AUC"`` entry from
    ``get_stat_dict`` on ``test`` is strictly higher than the best seen so
    far; the running best carries over from stage one into stage two.

    Args:
        train: ``(train_x, train_y)``; ``train_y.shape[1]`` is used as the
            number of output units.
        test: ``(test_x, test_y)`` used to score every candidate; ``test_y``
            is reduced with ``argmax`` over axis 1 before scoring.
        config: Object whose ``get(section, option)`` provides the ``MLPNN``
            options ``Patience``, ``BatchSize`` and ``LearningRate``.
        train_weights: Unused; kept for call compatibility.

    Returns:
        Tuple ``(best_layer, best_nodes, best_l2, best_drop)``.
    """
    train_x, train_y = train
    test_x, test_y = test
    num_class = train_y.shape[1]

    # Training settings do not change between candidates, so read them once.
    patience = int(config.get('MLPNN', 'Patience'))
    batch_size = int(config.get('MLPNN', 'BatchSize'))
    learning_rate = float(config.get('MLPNN', 'LearningRate'))
    true_labels = np.argmax(test_y, axis=1)

    def score_candidate(n_layers, n_nodes, l2_strength, drop_rate, verbose):
        """Build, train and score one configuration; return its AUC."""
        reg = tf.keras.regularizers.l2(l2_strength)
        model = tf.keras.Sequential()

        for i in range(n_layers):
            model.add(
                tf.keras.layers.Dense(n_nodes,
                                      activation='relu',
                                      kernel_regularizer=reg,
                                      bias_regularizer=reg,
                                      name="fc_" + str(i)))
            model.add(tf.keras.layers.Dropout(drop_rate))

        model.add(
            tf.keras.layers.Dense(num_class,
                                  activation='softmax',
                                  kernel_regularizer=reg,
                                  bias_regularizer=reg,
                                  name="output"))

        es_cb = tf.keras.callbacks.EarlyStopping('val_loss',
                                                 patience=patience,
                                                 restore_best_weights=True)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                      loss='categorical_crossentropy')

        # Early-stopped run with validation_split=0.1, followed by ten more
        # epochs on the full training set without a validation split.
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=1000,
                  callbacks=[es_cb],
                  validation_split=0.1)
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=10)

        probs = model.predict(test_x)
        preds = np.argmax(probs, axis=1)
        stat = get_stat_dict(true_labels, probs, preds)

        # Drop the graph/session state before the next candidate is built.
        tf.reset_default_graph()
        tf.keras.backend.clear_session()
        return stat["AUC"]

    dropout = [0.1, 0.3, 0.5]
    l2_grid = [0.01, 0.001, 0.0001]
    num_layer = [1, 2]
    num_nodes = [32, 64, 128]

    best_l2 = 0.0001
    best_drop = 0.5
    best_layer = 2
    best_nodes = 128

    best_stat = 0

    # Stage one: regularisation (dropout x L2) at the starting architecture.
    for drop_rate in dropout:
        for l2_strength in l2_grid:
            auc = score_candidate(best_layer, best_nodes, l2_strength,
                                  drop_rate, verbose=1)
            if auc > best_stat:
                best_stat = auc
                best_drop = drop_rate
                best_l2 = l2_strength

    # Stage two: architecture (depth x width) at the chosen regularisation.
    for n_layers in num_layer:
        for n_nodes in num_nodes:
            auc = score_candidate(n_layers, n_nodes, best_l2, best_drop,
                                  verbose=0)
            if auc > best_stat:
                best_stat = auc
                best_layer = n_layers
                best_nodes = n_nodes

    return best_layer, best_nodes, best_l2, best_drop
Esempio n. 9
0
def train(train, test, config, metric, seed=42, feature_select=True):
    """Fit a random forest and score it on the test split.

    A first forest fitted on all features supplies the importance ranking.
    When ``feature_select`` is true, the top 100/75/50/25 % of ranked
    features are each cross-validated and the best-scoring count (according
    to ``get_stat`` with ``metric``) is recorded.

    Args:
        train: ``(x, y)``; both are indexed with integer index arrays.
        test: ``(test_x, test_y)``.
        config: Object whose ``get(section, option)`` provides the ``RF``
            options ``NumberTrees`` and ``ValidationModels``.
        metric: Passed through to ``get_stat`` when scoring each candidate.
        seed: Random state for the forests and the cross-validation splitter.
        feature_select: Whether to run the cross-validated search.

    Returns:
        Tuple ``(clf, test_stat_dict, tpr, fpr, thresh, feature_importance,
        test_probs)``. The ROC values come from ``roc_curve`` when ``y`` has
        exactly two distinct values and are ``0, 0, 0`` otherwise.
    """
    number_trees = int(config.get('RF', 'NumberTrees'))
    num_models = int(config.get('RF', 'ValidationModels'))

    x, y = train
    test_x, test_y = test

    def new_forest():
        """Return an unfitted forest with the configured size and seed."""
        return RandomForestClassifier(n_estimators=number_trees,
                                      n_jobs=-1,
                                      random_state=seed)

    feature_importance = new_forest().fit(x, y).feature_importances_

    # Feature indices ordered from most to least important.
    feature_ranking = np.flip(np.argsort(feature_importance))
    num_features = x.shape[1]
    best_num_features = num_features

    if feature_select:
        percent_features = [1.0, 0.75, 0.5, 0.25]

        skf = StratifiedKFold(n_splits=num_models,
                              shuffle=True,
                              random_state=seed)

        best_score = -1

        for percent in percent_features:
            # Never drop to zero columns on very small feature sets.
            features_using = max(1, int(round(num_features * percent)))
            feature_list = feature_ranking[0:features_using]

            run_probs = []
            run_labels = []
            for train_index, valid_index in skf.split(x, y):
                fold_clf = new_forest().fit(x[train_index][:, feature_list],
                                            y[train_index])
                run_probs.extend(
                    fold_clf.predict_proba(x[valid_index][:, feature_list]))
                # Folds are shuffled, so keep the labels in the same order
                # as the collected probabilities instead of scoring against
                # the original ordering of y.
                run_labels.extend(y[valid_index])
            run_score = get_stat(np.array(run_labels), run_probs, metric)

            if run_score > best_score:
                best_score = run_score
                best_num_features = features_using

    # NOTE(review): the original built x[:, feature_list] for train and test
    # (feature_ranking[0:best_num_features]) but then fitted and predicted on
    # the full matrices. That behaviour is kept here: the selected columns
    # are not returned, so callers could not prepare inputs for a reduced
    # model. Confirm the intent before applying best_num_features to the
    # final fit.
    clf = new_forest().fit(x, y)

    test_probs = np.array(clf.predict_proba(test_x))
    test_pred = np.argmax(test_probs, axis=-1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_pred)

    if len(np.unique(y)) == 2:
        fpr, tpr, thresh = roc_curve(test_y, test_probs[:, 1])
    else:
        fpr, tpr, thresh = 0, 0, 0
    return clf, test_stat_dict, tpr, fpr, thresh, feature_importance, test_probs
Esempio n. 10
0
    def test(self, test):
        """Evaluate the fitted model on a held-out ``(test_x, test_y)`` pair.

        Args:
            test: Two-item sequence ``(test_x, test_y)``.

        Returns:
            Tuple ``(preds, stats)``: the raw ``predict`` output (no arg-max
            is applied) and the result of ``get_stat_dict``.
        """
        features, labels = test

        outputs = self.model.predict(features)
        return outputs, get_stat_dict(labels, outputs)
Esempio n. 11
0
def train(train,
          test,
          config,
          metric,
          seed=42,
          max_iter=100000,
          gaussian=False):
    """Grid-search an SVM and score the best estimator on the test split.

    With more than two classes the training labels are binarized and the
    ``SVC`` is wrapped in a ``OneVsRestClassifier``; the grid keys are then
    prefixed with ``estimator__``. Candidates are always ranked with the
    ``roc_auc`` scorer.

    Args:
        train: ``(train_x, train_y)``.
        test: ``(test_x, test_y)``.
        config: Object whose ``get(section, option)`` provides the ``SVM``
            option ``GridCV``.
        metric: Unused; kept for call compatibility.
        seed: Random state for ``SVC``.
        max_iter: Iteration cap passed to ``SVC``.
        gaussian: When true, an RBF grid is searched in addition to the
            linear one, and no weights or ROC values are returned.

    Returns:
        Tuple ``(best_estimator, test_stat_dict, tpr, fpr, thresh, weights,
        test_probs)``. ``weights`` is the best estimator's ``coef_``
        (flattened in the two-class case) or ``None`` when ``gaussian`` is
        true. The ROC values are only computed for the two-class,
        non-gaussian case and are ``None`` otherwise.
    """
    num_cv = int(config.get('SVM', 'GridCV'))

    train_x, train_y = train
    test_x, test_y = test
    cl = np.unique(train_y)
    num_class = len(cl)

    c_values = [1, 10, 100, 1000]
    grid = [{'kernel': ['linear'], 'C': c_values}]
    if gaussian:
        grid.insert(0, {
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': c_values
        })

    svc = SVC(probability=True, max_iter=max_iter, random_state=seed)

    if num_class > 2:
        estimator = OneVsRestClassifier(svc)
        # Route every parameter to the wrapped SVC.
        grid = [{'estimator__' + key: values
                 for key, values in params.items()} for params in grid]
        fit_y = label_binarize(train_y, classes=cl)
    else:
        estimator = svc
        fit_y = train_y

    # Folds are generated from the raw labels even when the binarized labels
    # are used for fitting.
    clf = GridSearchCV(estimator,
                       grid,
                       cv=StratifiedKFold(num_cv).split(train_x, train_y),
                       scoring="roc_auc",
                       n_jobs=-1)
    clf.fit(train_x, fit_y)

    test_probs = clf.predict_proba(test_x)
    test_preds = np.argmax(test_probs, axis=1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)

    weights = None
    fpr, tpr, thresh = None, None, None
    if not gaussian:
        weights = np.array(clf.best_estimator_.coef_)
        if num_class == 2:
            weights = weights.reshape(-1)
            fpr, tpr, thresh = roc_curve(test_y, test_probs[:, 1])

    return clf.best_estimator_, test_stat_dict, tpr, fpr, thresh, weights, test_probs