Code example #1
def predictMTX(path):
    mtxCompressed = np.load(path)
    X = mtxCompressed['savedX']
    Y = mtxCompressed['savedY']
    print(X.shape, Y.shape)
    mod_x = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))
    print("Loading model")
    bbc = joblib.load('bag_model.pkl')
    bag = joblib.load('bag.pkl')
    std = joblib.load('std.pkl')
    # svc = joblib.load('svc.pkl')
    scaler = joblib.load('bag_scaler.pkl')
    # model = joblib.load('nn_pssm.pkl')
    scaledTestX = scaler.transform(mod_x)

    predY1 = bbc.predict(scaledTestX)
    predY2 = bag.predict(scaledTestX)
    predY3 = std.predict(scaledTestX)

    # Classification Metric display
    print "Balanced Bagging MLP"
    print(confusion_matrix(Y, predY1))
    print(classification_report_imbalanced(Y, predY1))
    print(matthews_corrcoef(Y, predY1))
    print "Balanced Bagging"
    print(confusion_matrix(Y, predY2))
    print(classification_report_imbalanced(Y, predY2))
    print(matthews_corrcoef(Y, predY2))
    print "Standard MLP"
    print(confusion_matrix(Y, predY3))
    print(classification_report_imbalanced(Y, predY3))
    print(matthews_corrcoef(Y, predY3))
Code example #2
def predictOrganelle(path, seq_path):

    seqs = []
    lengths = []
    for seq_record in SeqIO.parse(seq_path, "fasta"):
        seq = str(seq_record.seq)
        seqs += [seq]
        lengths += [len(seq)]
    temp_org = np.array(buildPredict(seqs))
    probs_org = []
    for i in range(0, len(lengths)):
        probs_org += [temp_org[i]] * lengths[i]
    probs_org = np.array(probs_org)
    print "Len Probs Org: {} ".format(len(probs_org))
    # print probs_org.shape

    mtxCompressed = np.load(path)
    X = mtxCompressed['savedX']
    Y = mtxCompressed['savedY']
    # print X.shape, Y.shape
    mod_x = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))

    base = joblib.load('org_base.pkl')
    scaler = joblib.load('bag_scaler.pkl')
    scaledTestX = scaler.transform(mod_x)

    base_probs = base.predict_proba(scaledTestX)
    # print base_probs.shape
    org_X = np.hstack((base_probs, probs_org))
    print(org_X.shape)
    bbc = joblib.load('org_bbc.pkl')
    bag = joblib.load('org_bag.pkl')
    std = joblib.load('org_std.pkl')
    svc = joblib.load('org_svc.pkl')
    org_scaler = joblib.load('org_scaler.pkl')

    scaled_org_test = org_scaler.transform(org_X)

    predY1 = bbc.predict(scaled_org_test)
    predY2 = bag.predict(scaled_org_test)
    predY3 = std.predict(scaled_org_test)
    predY4 = svc.predict(scaled_org_test)

    # Classification Metric display
    print "Balanced Bagging MLP"
    print(confusion_matrix(Y, predY1))
    print(classification_report_imbalanced(Y, predY1))
    print(matthews_corrcoef(Y, predY1))
    print "Balanced Bagging"
    print(confusion_matrix(Y, predY2))
    print(classification_report_imbalanced(Y, predY2))
    print(matthews_corrcoef(Y, predY2))
    print "Standard MLP"
    print(confusion_matrix(Y, predY3))
    print(classification_report_imbalanced(Y, predY3))
    print(matthews_corrcoef(Y, predY3))
    print "SVC"
    print(confusion_matrix(Y, predY4))
    print(classification_report_imbalanced(Y, predY4))
    print(matthews_corrcoef(Y, predY4))
Code example #3
def test_classification_report_imbalanced_multiclass_with_digits():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
                       '0.92157 0.80851 0.86409 0.74085 24 versicolor '
                       '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 '
                       'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 '
                       '0.37208 20 avg / total 0.51375 0.53333 0.79733 '
                       '0.47310 0.62464 0.41370 75')
    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
        digits=5)
    assert _format_report(report) == expected_report
    # print classification report with label detection
    expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')
    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #4
def test_classification_report_imbalanced_multiclass():
    """Test classification report for multiclass problem"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 '
                       '0.81 0.86 0.74 24 versicolor 0.33 0.10 0.86 0.15 '
                       '0.44 0.19 31 virginica 0.42 0.90 0.55 0.57 0.63 '
                       '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')

    report = classification_report_imbalanced(y_true,
                                              y_pred,
                                              labels=np.arange(
                                                  len(iris.target_names)),
                                              target_names=iris.target_names)
    assert_equal(_format_report(report), expected_report)
    # print classification report with label detection
    expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')

    report = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(report), expected_report)
Code example #5
def test_classification_report_imbalanced_multiclass_with_digits():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
                       '0.92157 0.80851 0.85415 0.72010 24 versicolor '
                       '0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 '
                       '31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 '
                       '0.50831 20 avg / total 0.51375 0.53333 0.79733 '
                       '0.47310 0.57966 0.39788 75')
    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
        digits=5)
    assert _format_report(report) == expected_report
    # print classification report with label detection
    expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.58 0.40 75')
    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #6
def test_classification_report_imbalanced_multiclass():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ("pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 "
                       "0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 "
                       "0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 "
                       "0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75")

    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
    )
    assert _format_report(report) == expected_report
    # print classification report with label detection
    expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 "
                       "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 "
                       "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total "
                       "0.51 0.53 0.80 0.47 0.58 0.40 75")

    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #7
def test_classification_report_imbalanced_multiclass_with_digits():
    """Test performance report with added digits in floating point values"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 '
                       '0.92157 0.80851 0.86409 0.74085 24 versicolor '
                       '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 '
                       'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 '
                       '0.37208 20 avg / total 0.51375 0.53333 0.79733 '
                       '0.47310 0.62464 0.41370 75')
    report = classification_report_imbalanced(y_true,
                                              y_pred,
                                              labels=np.arange(
                                                  len(iris.target_names)),
                                              target_names=iris.target_names,
                                              digits=5)
    assert_equal(_format_report(report), expected_report)
    # print classification report with label detection
    expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 '
                       '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 '
                       '0.53 0.80 0.47 0.62 0.41 75')
    report = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(report), expected_report)
Code example #8
def test_classification_report_imbalanced_multiclass_with_digits():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 "
                       "0.92157 0.80851 0.85415 0.72010 24 versicolor "
                       "0.33333 0.09677 0.86364 0.15000 0.28910 0.07717 "
                       "31 virginica 0.41860 0.90000 0.54545 0.57143 0.70065 "
                       "0.50831 20 avg / total 0.51375 0.53333 0.79733 "
                       "0.47310 0.57966 0.39788 75")
    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
        digits=5,
    )
    assert _format_report(report) == expected_report
    # print classification report with label detection
    expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 "
                       "0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 "
                       "2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 "
                       "0.53 0.80 0.47 0.58 0.40 75")
    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #9
File: model.py Project: postyear/Snorkel-Labeling
    def train(self, train_data, qtz, auto):
        y = []
        x = []
        for line in train_data:
            y.append(line.split(" ")[0])
            each_text = ' '.join(line.split(" ")[1:])
            each_text = re.sub('\n', '', each_text)
            x.append(each_text)

        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=self.train_test_split_rate,
                                                            random_state=0)

        train_interm_path = f"{WORK_PATH}/interm_data/train_interm.txt"
        test_interm_path = f"{WORK_PATH}/interm_data/test_interm.txt"

        train_ret = []
        for x_, y_ in zip(x_train, y_train):
            train_ret.append(y_ + " " + x_ + "\n")

        test_ret = []
        for x_, y_ in zip(x_test, y_test):
            test_ret.append(y_ + " " + x_ + "\n")

        with open(train_interm_path, "w", encoding="utf-8") as tr:
            tr.writelines(train_ret)

        with open(test_interm_path, "w", encoding="utf-8") as te:
            te.writelines(test_ret)

        if not auto:
            start_time = time.time()
            self.model = fasttext.train_supervised(input=train_interm_path, **self.params)
            print("Train Time: ", round(time.time() - start_time, 3), " s")
        else:
            start_time = time.time()
            self.model = fasttext.train_supervised(input=train_interm_path,
                                                    thread=CPUs,
                                                    verbose=2,
                                                    autotuneValidationFile=test_interm_path)
            print("Train Time: ", round(time.time() - start_time, 3), " s")

        if qtz:
            start_time = time.time()
            self.model.quantize(train_interm_path, thread=CPUs, verbose=2, retrain=True)
            print("Retrain Time: ", round(time.time() - start_time, 3), " s")

        y_train_pred = [e[0] for e in self.model.predict(x_train)[0]]
        print("train acc:")
        self._print_results(*self.model.test(train_interm_path))
        print("train label report:")
        print(metrics.classification_report_imbalanced(y_train, y_train_pred))

        y_test_pred = [e[0] for e in self.model.predict(x_test)[0]]
        print("test acc:")
        self._print_results(*self.model.test(test_interm_path))
        print("test label report:")
        print(metrics.classification_report_imbalanced(y_test, y_test_pred, labels=self.model.labels))

        return self
Code example #10
File: metrics.py Project: ykmanoj/RivalGan
def plot_metrics(parameters):
    """
    Report baseline scores vs scores on real + fake data
    :param parameters: y_test_baseline, y_pred_baseline, scores_baseline,
     y_pred_gan, scores_gan
    """
    [
        y_test_baseline, y_pred_baseline, scores_baseline, y_pred_gan,
        scores_gan
    ] = parameters
    print(
        '\n',
        '############################################# BASELINE REPORT #############################################'
    )

    print('Classification Report:', '\n',
          classification_report_imbalanced(y_test_baseline, y_pred_baseline))
    print('Accuracy score: {}'.format(
        accuracy_score(y_test_baseline, y_pred_baseline)))
    # scikit-learn metric functions expect (y_true, y_pred) in that order
    precision = precision_score(y_test_baseline, y_pred_baseline)
    print('Precision score: {}'.format(precision))
    recall = recall_score(y_test_baseline, y_pred_baseline)
    print('Recall score: {}'.format(recall))
    print('F1 score: {}'.format(compute_F1(precision, recall)))

    print(
        '\n',
        '############################################# GAN (DATA AUGMENTATION) REPORT ##############################'
    )
    print('Classification Report:', '\n',
          classification_report_imbalanced(y_test_baseline, y_pred_gan))
    print('Accuracy score: {}'.format(
        accuracy_score(y_test_baseline, y_pred_gan)))
    precision = precision_score(y_test_baseline, y_pred_gan)
    print('Precision score: {}'.format(precision))
    recall = recall_score(y_test_baseline, y_pred_gan)
    print('Recall score: {}'.format(recall))
    print('F1 score: {}'.format(compute_F1(precision, recall)))

    fig = plt.figure(figsize=(8, 8))

    fig.subplots_adjust(hspace=.5)

    plt.subplot(2, 2, 1)
    plot_cm(y_test_baseline, y_pred_baseline)
    plt.subplot(2, 2, 2)
    plot_cm(y_test_baseline, y_pred_gan)

    plt.subplot(2, 2, 3)
    plot_aucprc(y_test_baseline, scores_baseline)
    plt.subplot(2, 2, 4)
    plot_aucprc(y_test_baseline, scores_gan)

    plt.show()
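`compute_F1` is a helper from the RivalGan project that is not shown in this listing. Given that it is called with a precision and a recall value, it presumably returns their harmonic mean; a minimal sketch under that assumption:

def compute_F1(precision, recall):
    # Hypothetical reconstruction of the project's helper: F1 is the
    # harmonic mean of precision and recall, guarded against division by zero.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)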
Code example #11
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))

    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))
Code example #12
File: metrics.py Project: ykmanoj/RivalGan
def report_scores(parameters):
    """
    Report accuracy, precision, recall and F1 scores. Plot confusion matrix and AUC curves
    :param parameters: y_test, y_pred, scores and show_graph
    :return:
    """
    [y_test, y_pred, scores, show_graph] = parameters
    print('Classification Report:', '\n',
          classification_report_imbalanced(y_test, y_pred))
    print('Accuracy score: {}'.format(accuracy_score(y_test, y_pred)))
    # scikit-learn metric functions expect (y_true, y_pred) in that order
    precision = precision_score(y_test, y_pred)
    print('Precision score: {}'.format(precision))
    recall = recall_score(y_test, y_pred)
    print('Recall score: {}'.format(recall))
    print('F1 score: {}'.format(compute_F1(precision, recall)))
    if show_graph:
        fig = plt.figure(figsize=(6, 5))
        fig.subplots_adjust(hspace=.5)
        plt.subplot(2, 1, 1)
        plot_cm(y_test, y_pred)
        plt.subplot(2, 1, 2)
        plot_aucprc(y_test, scores)
        plt.show()
    else:
        print('Confusion Matrix: ', '\n', confusion_matrix(y_test, y_pred),
              '\n')
Code example #13
def evaluate_model(model_str):
    X, y = load_train()
    test_x, test_y = load_test()

    X = np.concatenate([X, test_x], axis=0)
    y = np.concatenate([y, test_y], axis=0)

    s = StandardScaler()
    X = s.fit_transform(X)

    X, test_x, y, test_y = train_test_split(X, y, test_size=0.3, shuffle=False)

    model = fit_model(X, y, model=model_str)

    y_hat = model.predict(test_x)

    # Calculate ROC-AUC score

    y_pred_prob = model.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, y_pred_prob)

    # Mean cross-validated score (accuracy by default; pass scoring='roc_auc' for AUC)
    cv_scores = cross_val_score(model, X, y, cv=10)
    mean_cv = np.mean(cv_scores)

    # AccuracyScore
    accu = accuracy_score(test_y, y_hat)
    # Balanced Accuracy Score
    balanced_accuracy = balanced_accuracy_score(test_y, y_hat)

    simple = make_simple_report(model_str, accu, balanced_accuracy, auc_score,
                                mean_cv)
    imb_report = classification_report_imbalanced(test_y, y_hat)

    return simple, imb_report
Code example #14
def main():
    fig, axes = plt.subplots(1, 3, sharey=True, sharex=True)
    #fig.suptitle('Resample approaches')

    for ax, title, model in zip(axes.flat,
                                ['No resample', 'Oversample', 'Undersample'],
                                [no_resample, oversample, undersample]):
        y, y_pred, c = model()

        print(title)
        print(imetrics.classification_report_imbalanced(y, y_pred))

        acc = metrics.accuracy_score(y, y_pred)
        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)

        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm, vmin=0, vmax=1, annot=True, fmt='.2f', cmap='Greys', ax=ax, cbar=False, square=True)
        ax.set_title(f'{title}\naccuracy={acc:.3f}')


    count = class_counter(y)
    fig.suptitle('Population: ' + ', '.join([f'{key}: {count[key]*100:.1f}%' for key in labels]))

    fig.tight_layout()
    fig.savefig('./different_resampling.pdf', dpi=92, bbox_inches='tight')

    plt.show()
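`no_resample`, `oversample`, `undersample`, `norm_cm`, and `class_counter` are module-level helpers that are not included in this snippet. Since the heatmap is drawn with vmin=0 and vmax=1, `norm_cm` presumably row-normalizes the confusion matrix; a sketch under that assumption:

import numpy as np

def norm_cm(cm):
    # Hypothetical helper: normalize each row so that every true class
    # sums to 1, matching the heatmap's vmin=0/vmax=1 colour scale.
    return cm / cm.sum(axis=1, keepdims=True)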
Code example #15
def svm(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-2, 1e-1, 1],
        'C': [1]
    }]
    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    # the enclosing function shadows the `svm` module name, so call SVC directly
    # (assumes `from sklearn.svm import SVC`)
    svc = SVC()
    clf = GridSearchCV(svc, parameters, cv=5)

    clf.fit(X_tr, Y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("SVM")

    print(cmat)

    cnf_matrix = confusion_matrix(Y_te, y_pred)
    print(cnf_matrix)
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))

    return acc
Code example #16
def apply_ml_model(X_train_input, y_train_input, X_test_input, y_test_input):
    models = ['LREG', 'RFC', 'Tree', 'Balanced RFC']
    scores = []
    # Specify the target classes
    classes = ["No re-admission","Re-admission in < 30 days"]
    for model in models:
        if model == 'LREG':
            model_select = LogisticRegression(solver='lbfgs', max_iter=500, random_state=78)
        elif model == 'RFC':
            model_select = RandomForestClassifier(n_estimators= 128, random_state=78)
        elif model == 'Tree':
            model_select = tree.DecisionTreeClassifier(random_state=78)
        elif model == 'Balanced RFC':
            model_select = BalancedRandomForestClassifier(n_estimators=128, random_state=78)
        model_select.fit(X_train_input, y_train_input)
        y_pred = model_select.predict(X_test_input)
        # Create a DataFrame from the confusion matrix.
        cm = confusion_matrix(y_test_input, y_pred)
        # Calculating the accuracy score.
        acc_score = balanced_accuracy_score(y_test_input, y_pred)
        scores.append(acc_score)
        print(f"Model: {model}")
        # Displaying results
        print("Confusion Matrix")
        cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"],
            columns=["Predicted 0", "Predicted 1"])
        print(cm_df)
        print(f"Accuracy Score : {acc_score}\n")
        print("Classification Report")
        print(classification_report_imbalanced(y_test_input, y_pred))
Code example #17
def svm(X_tr, Y_tr, X_te, Y_te):
    # bw = (len(X_tr)/2.0)**0.5        #default value in One-class SVM
    # gamma = 1/(2*bw*bw)
    X_tr, X_te = normalize_data(X_tr, X_te, "minmax")
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    # parameters =  [{'kernel': ['rbf'], 'gamma': [1e-3],
    #                 'C': [1]}]
    # {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    # svc = svm.SVC()
    # clf = GridSearchCV(svc, parameters,cv= 5)
    # clf = SVC (gamma = gamma)
    clf = LinearSVC(random_state=0)
    clf.fit(X_tr, Y_tr)
    start = time.time()
    y_pred = clf.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("SVM")
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='macro')
    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
Code example #18
def test_classification_report_imbalanced_dict():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names,
        output_dict=True,
    )
    outer_keys = set(report.keys())
    inner_keys = set(report[0].keys())

    expected_outer_keys = {
        0,
        1,
        2,
        "avg_pre",
        "avg_rec",
        "avg_spe",
        "avg_f1",
        "avg_geo",
        "avg_iba",
        "total_support",
    }
    expected_inner_keys = {"spe", "f1", "sup", "rec", "geo", "iba", "pre"}

    assert outer_keys == expected_outer_keys
    assert inner_keys == expected_inner_keys
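As the test above documents, passing output_dict=True makes classification_report_imbalanced return a nested dict instead of a formatted string (per-class entries keyed by label, plus avg_* aggregates and total_support). A minimal usage sketch, assuming scikit-learn and a recent imbalanced-learn release are installed:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_pred = LogisticRegression(max_iter=1000).fit(X_train, y_train).predict(X_test)
report = classification_report_imbalanced(y_test, y_pred, output_dict=True)
# Per-class metrics are keyed by label; averages use the avg_* keys.
print(report[0]["pre"], report["avg_geo"], report["total_support"])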
Code example #19
def randomforest(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt',
                                 n_estimators=40, oob_score=True)

    param_grid = {'n_estimators': [40, 100]}

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid)
    CV_rfc.fit(X_tr, Y_tr)
    # print(CV_rfc.best_params_)
    # clf = RandomForestClassifier(n_estimators=150, random_state=42)
    # clf.fit(X_tr, Y_tr)
    y_pred = CV_rfc.predict(X_te)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print(cmat)
    print('The geometric mean is {}'.format(geometric_mean_score(Y_te, y_pred)))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1_score(Y_te, y_pred, average='weighted')))
    return CV_rfc, fpr_vot, tpr_vot, roc_auc_vot
Code example #20
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    param_grid = {'max_depth': [5, 6, 7, 8, 9, 10, 50, 100]}
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid)

    tree.fit(X_tr, Y_tr)
    y_pred = tree.predict(X_te)
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("Decision tree")
    print(cmat)
    cnf_matrix = confusion_matrix(Y_te, y_pred)
    print(cnf_matrix)
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))

    return acc
Code example #21
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    # X_tr, X_te = normalize_data(X_tr, X_te, "minmax")
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    param_grid = {'max_depth': np.arange(3, 6)}

    tree = GridSearchCV(DecisionTreeClassifier(), param_grid)

    tree.fit(X_tr, Y_tr)
    start = time.time()
    y_pred = tree.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("Decision tree")
    # print (cmat)

    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')

    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
Code example #22
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"])
    y_true = labels[y_true]
    y_pred = labels[y_pred]

    expected_report = (u'pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 '
                       u'0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       u'red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                       u'0.51 0.53 0.80 0.47 0.58 0.40 75')
    if np_version[:3] < (1, 7, 0):
        with pytest.raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
    else:
        report = classification_report_imbalanced(y_true, y_pred)
        assert _format_report(report) == expected_report
Code example #23
File: main.py Project: Guilheeeeeeerme/fake-news
def makePipelineMultinomialNB(X_train, Y_train, X_test, Y_test):
    pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())

    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)

    print(accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
Code example #24
def print_classification_report(clf, X_train, X_test, y_train, y_test):
    """Fit classifier and print classification report."""
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    clf_name = clf.__class__.__name__
    div = '=' * len(clf_name)
    title = f'\n{div}\n{clf_name}\n{div}\n'
    print(title, classification_report_imbalanced(y_test, y_pred))
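A hypothetical usage sketch for the helper above; the dataset and classifier are illustrative, not taken from the original project:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Build a small imbalanced problem and report on a single classifier.
X, y = make_classification(weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print_classification_report(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)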
Code example #25
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"])
    y_true = labels[y_true]
    y_pred = labels[y_pred]

    expected_report = (u'pre rec spe f1 geo iba sup blue\xa2 0.83 0.79 '
                       u'0.92 0.81 0.86 0.74 24 green\xa2 0.33 0.10 0.86 '
                       u'0.15 0.44 0.19 31 red\xa2 0.42 0.90 0.55 0.57 0.63 '
                       u'0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')
    if np_version[:3] < (1, 7, 0):
        with raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
    else:
        report = classification_report_imbalanced(y_true, y_pred)
        assert _format_report(report) == expected_report
Code example #26
def test_classification_report_imbalanced_multiclass_with_unicode_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array(["blue\xa2", "green\xa2", "red\xa2"])
    y_true = labels[y_true]
    y_pred = labels[y_pred]

    expected_report = ('pre rec spe f1 geo iba sup blue¢ 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 green¢ 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       'red¢ 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                       '0.51 0.53 0.80 0.47 0.58 0.40 75')
    if np_version[:3] < (1, 7, 0):
        with pytest.raises(RuntimeError, match="NumPy < 1.7.0"):
            classification_report_imbalanced(y_true, y_pred)
    else:
        report = classification_report_imbalanced(y_true, y_pred)
        assert _format_report(report) == expected_report
Code example #27
File: main.py Project: Guilheeeeeeerme/fake-news
def makePipelineBernoulliNB(X_train, Y_train, X_test, Y_test, binarize):
    pipe = make_pipeline(TfidfVectorizer(), BernoulliNB(binarize=binarize))

    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)

    print('binarize', binarize, accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
Code example #28
def Print_Result_Metrics(labels_test, predicted, targetnames, model_name):
    '''Print result metrics after training.'''
    print('\n- - - - - RESULT METRICS', model_name, '- - - - -')
    print('Exact Accuracy: ', metrics.accuracy_score(labels_test, predicted))
    print(
        classification_report_imbalanced(labels_test,
                                         predicted,
                                         target_names=targetnames))
    print(metrics.confusion_matrix(labels_test, predicted))
Code example #29
File: main.py Project: Guilheeeeeeerme/fake-news
def makePipelineImpGaussianNB(X_train, Y_train, X_test, Y_test):
    pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(),
                             GaussianNB())

    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)

    print(accuracy_score(Y_test, y_pred))
    print(classification_report_imbalanced(Y_test, y_pred))
Code example #30
def print_evaluation_results(y_test, predictions):
    print()
    print("!!!!!!!!!!!!!!!!!!!!! EVALUATION RESULTS !!!!!!!!!!!!!!!!!!!!!!")
    print("Accuracy Score ", accuracy_score(y_test, predictions))
    print("Hamming Loss ", hamming_loss(y_test, predictions))
    print("Jaccard Similarity Score ",
          jaccard_similarity_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    # print(classification_report(y_test, predictions))
    print(classification_report_imbalanced(y_test, predictions))
    print()
Code example #31
def test_classification_report_imbalanced_multiclass_with_string_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    y_true = np.array(["blue", "green", "red"])[y_true]
    y_pred = np.array(["blue", "green", "red"])[y_pred]

    expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 green 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       'red 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                       '0.51 0.53 0.80 0.47 0.58 0.40 75')
    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report

    expected_report = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 0.85 '
                       '0.72 24 b 0.33 0.10 0.86 0.15 0.29 0.08 31 c 0.42 '
                       '0.90 0.55 0.57 0.70 0.51 20 avg / total 0.51 0.53 '
                       '0.80 0.47 0.58 0.40 75')
    report = classification_report_imbalanced(
        y_true, y_pred, target_names=["a", "b", "c"])
    assert _format_report(report) == expected_report
Code example #32
File: build_model.py Project: petercuret/woonfraude
def evaluate_performance(y_pred, y_label):
    """Compute and return the prediction performance."""
    precision = precision_score(y_label, y_pred)
    recall = recall_score(y_label, y_pred)
    f1 = f1_score(y_label, y_pred)
    f05 = fbeta_score(y_label, y_pred, beta=0.5)
    conf = confusion_matrix(y_label, y_pred) / len(y_pred)
    report = classification_report_imbalanced(y_true=y_label, y_pred=y_pred)
    print(report)
    print(f'f1: {f1} // f0.5: {f05}')
    return precision, recall, f1, f05, conf, report
Code example #33
def confusion_plot(pred, y_true):
    sns.set(rc={'figure.figsize': (5, 4)})
    fault_labels = np.unique(y_true)
    print(fault_labels)
    cm_array = confusion_matrix(y_true, pred, labels=fault_labels)
    df_cm = pd.DataFrame(cm_array, index=fault_labels, columns=fault_labels)
    sns.heatmap(df_cm, annot=True)
    plt.show()

    print(classification_report_imbalanced(np.array(y_true), np.array(pred)))
    return plt
Code example #34
def test_classification_report_imbalanced_multiclass_with_string_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    y_true = np.array(["blue", "green", "red"])[y_true]
    y_pred = np.array(["blue", "green", "red"])[y_pred]

    expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 '
                       '0.81 0.86 0.74 24 green 0.33 0.10 0.86 0.15 0.44 '
                       '0.19 31 red 0.42 0.90 0.55 0.57 0.63 0.37 20 '
                       'avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75')
    report = classification_report_imbalanced(y_true, y_pred)
    assert_equal(_format_report(report), expected_report)

    expected_report = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 '
                       '0.86 0.74 24 b 0.33 0.10 0.86 0.15 0.44 0.19 31 '
                       'c 0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total '
                       '0.51 0.53 0.80 0.47 0.62 0.41 75')
    report = classification_report_imbalanced(
        y_true, y_pred, target_names=["a", "b", "c"])
    assert_equal(_format_report(report), expected_report)
Code example #35
def test_classification_report_imbalanced_multiclass_with_long_string_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array(["blue", "green" * 5, "red"])
    y_true = labels[y_true]
    y_pred = labels[y_pred]

    expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 greengreengreengreengreen 0.33 0.10 '
                       '0.86 0.15 0.29 0.08 31 red 0.42 0.90 0.55 0.57 0.70 '
                       '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75')

    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #36
def test_classification_report_imbalanced_multiclass():
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 '
                       '0.81 0.85 0.72 24 versicolor 0.33 0.10 0.86 0.15 '
                       '0.29 0.08 31 virginica 0.42 0.90 0.55 0.57 0.70 '
                       '0.51 20 avg / total 0.51 0.53 0.80 0.47 0.58 0.40 75')

    report = classification_report_imbalanced(
        y_true,
        y_pred,
        labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names)
    assert _format_report(report) == expected_report
    # print classification report with label detection
    expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 '
                       '0.85 0.72 24 1 0.33 0.10 0.86 0.15 0.29 0.08 31 '
                       '2 0.42 0.90 0.55 0.57 0.70 0.51 20 avg / total '
                       '0.51 0.53 0.80 0.47 0.58 0.40 75')

    report = classification_report_imbalanced(y_true, y_pred)
    assert _format_report(report) == expected_report
Code example #37
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BalancedBaggingClassifier')

###############################################################################
# Turning the balanced bagging classifier into a balanced random forest
Code example #38
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset and create an imbalanced version of it
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 50, 2: 50},
                      random_state=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
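This snippet targets an older imbalanced-learn API. A sketch of the same pipeline against recent releases (assuming imbalanced-learn >= 0.6, where `ratio` was renamed `sampling_strategy` and NearMiss no longer accepts `random_state`):

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 25, 1: 50, 2: 50}, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print('Training target statistics: {}'.format(Counter(y_train)))

pipeline = make_pipeline(NearMiss(version=2), LinearSVC(random_state=42))
pipeline.fit(X_train, y_train)
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))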
Code example #39
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))