Exemple #1
0
                                 skip_first_row=21,
                                 y_column_index=0,
                                 assignedColumnNames=APS_FULL_COLUMNS,
                                 missingSymbol='na',
                                 needImpute=True,
                                 dropOrNot=True)

    # Load the test split through the project loader with the options below.
    X_test, y_test = load_data(ROOT_PATH + APS_TEST,
                               skip_first_row=21,
                               y_column_index=0,
                               assignedColumnNames=APS_FULL_COLUMNS,
                               missingSymbol='na',
                               needImpute=True,
                               dropOrNot=True)

    # Convert both label sets to numeric form, with "neg" as the negative class.
    y_train = to_binary_numeric(y_train, classNeg="neg")
    y_test = to_binary_numeric(y_test, classNeg="neg")

    # Fixed random_state keeps the fitted forest reproducible between runs.
    randForestClf = RandomForestClassifier(n_estimators=50,
                                           random_state=2333,
                                           oob_score=True)
    randForestClf.fit(X_train, y_train)
    # NOTE(review): assuming scikit-learn's RandomForestClassifier, `predict`
    # yields hard class labels, so the ROC below has very few thresholds;
    # consider feeding `predict_proba` scores instead - confirm intent.
    y_predict = randForestClf.predict(X_test)
    # `thresholds` is not read anywhere in the visible lines.
    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_test, y_predict)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
Exemple #2
0
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 5:30 PM'

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data, to_binary_numeric

if __name__ == "__main__":
    # Let pandas print up to 100 columns before truncating its output.
    pd.set_option('display.max_columns', 100)

    # Loader options collected in one place, then unpacked into load_data.
    load_options = dict(skip_first_row=21,
                        y_column_index=0,
                        assignedColumnNames=APS_FULL_COLUMNS,
                        missingSymbol='na',
                        needImpute=True,
                        dropOrNot=True)
    features, labels = load_data(ROOT_PATH + APS_TRAIN, **load_options)
    labels = to_binary_numeric(labels, classNeg="neg")

    # Put the numeric labels in front of the features so the label column
    # takes part in the pairwise correlation matrix.
    combined = pd.concat([labels, features], axis=1)
    corr_matrix = combined.corr()

    # Draw the correlation matrix as a heatmap on a large canvas.
    plt.figure(figsize=(20, 15))
    sns.heatmap(corr_matrix,
                vmin=-1,
                vmax=1,
                cmap=sns.color_palette("Blues"))
    plt.show()

# Turning dropOrNot on raises an error, because 10 columns in the data contain NaN
Exemple #3
0
    # Evaluate a Weka LMT classifier on the SMOTE test set: 5-fold
    # cross-validation with a fixed seed, then print the error rate,
    # the confusion matrix and the evaluation summary.
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_test_obj = Evaluation(smote_test_data)
    eval_smote_test_obj.crossvalidate_model(classifier=log_tree,
                                            data=smote_test_data,
                                            num_folds=5,
                                            rnd=Random(1))
    print("SMOTE Test CV (5-folds) Error = %.2f%%" %
          (eval_smote_test_obj.percent_incorrect))
    print(eval_smote_test_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_smote_test_obj.summary())

    # Build the classifier on the same dataset and collect its predictions
    # on that dataset.
    # NOTE(review): the evaluation object already used for cross-validation
    # is reused here - confirm that is intended.
    log_tree.build_classifier(smote_test_data)
    y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data)

    # Only the first 500 labels are kept; presumably smote_test_data was
    # built from the same 500 rows - TODO confirm the lengths match.
    y_test = to_binary_numeric(y_test.head(500), classNeg="neg")

    # No pos_label is given, so roc_curve falls back to its default.
    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_test, y_predict)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    # ROC curve in red with the computed area in the legend, plus a dotted
    # diagonal from (0, 0) to (1, 1) for reference.
    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
             label='ROC = ' + str(area))
    plt.plot([0, 1], [0, 1], linestyle='dotted')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC & AUC (SMOTE Test)')
    plt.legend()
    # Evaluate a Weka LMT classifier on the SMOTE training set: 5-fold
    # cross-validation with a fixed seed, then print the error rate,
    # the confusion matrix and the evaluation summary.
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_train_obj = Evaluation(smote_train_data)
    eval_smote_train_obj.crossvalidate_model(classifier=log_tree,
                                             data=smote_train_data,
                                             num_folds=5,
                                             rnd=Random(1))
    print("SMOTE Train CV (5-folds) Error = %.2f%%" %
          (eval_smote_train_obj.percent_incorrect))
    print(eval_smote_train_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_smote_train_obj.summary())

    # Build the classifier on the same dataset and collect its predictions
    # on that dataset.
    # NOTE(review): the evaluation object already used for cross-validation
    # is reused here - confirm that is intended.
    log_tree.build_classifier(smote_train_data)
    y_predict = eval_smote_train_obj.test_model(log_tree, smote_train_data)

    # Convert the labels to numeric form, with "neg" as the negative class.
    y_train_smote = to_binary_numeric(y_train_smote, classNeg="neg")

    # pos_label=0 makes roc_curve treat label 0 as the positive class.
    # NOTE(review): presumably chosen to match how the predictions are
    # encoded - confirm against the output of test_model.
    falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train_smote,
                                                                y_predict,
                                                                pos_label=0)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    # ROC curve in red with the computed area in the legend, plus a dotted
    # diagonal from (0, 0) to (1, 1) for reference.
    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
             label='ROC = ' + str(area))
    plt.plot([0, 1], [0, 1], linestyle='dotted')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC & AUC (SMOTE Train)')
Exemple #5
0
	# Write the training frame to CSV (comma-separated, no index column),
	# reload it through convert.load_any_file and mark the first attribute
	# as the class.
	export_train_data.to_csv(GENERATED_TRAIN_DATA_FILE_PATH, sep=',', index=False)
	train_data = convert.load_any_file(filename=GENERATED_TRAIN_DATA_FILE_PATH)
	train_data.class_is_first()

	# load logistic model tree algorithm
	log_tree = Classifier(classname="weka.classifiers.trees.LMT")
	eval_train_obj = Evaluation(train_data)
	eval_train_obj.crossvalidate_model(classifier=log_tree, data=train_data, num_folds=5, rnd=Random(1))
	# The label states the fold count actually passed to crossvalidate_model
	# (num_folds=5); it previously claimed 10 folds.
	print("Train CV (5-folds) Error = %.2f%%" % (eval_train_obj.percent_incorrect))
	print(eval_train_obj.matrix())
	print("=================\"Summary\"====================")
	print(eval_train_obj.summary())

	# Build the classifier on the training data and collect its predictions
	# on that same data.
	log_tree.build_classifier(train_data)
	y_predict = eval_train_obj.test_model(log_tree, train_data)

	# Only the first 500 labels are kept; presumably train_data was built
	# from the same 500 rows - TODO confirm the lengths match.
	y_train = to_binary_numeric(y_train.head(500), classNeg="neg")

	# pos_label=0 makes roc_curve treat label 0 as the positive class.
	falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train, y_predict, pos_label=0)
	# compute Area Under the Curve (AUC) using the trapezoidal rule
	area = auc(falsePositiveRate, truePositiveRate)

	# ROC curve in red with the computed area in the legend, plus a dotted
	# diagonal from (0, 0) to (1, 1) for reference.
	plt.plot(falsePositiveRate, truePositiveRate, color='red', label='ROC = ' + str(area))
	plt.plot([0, 1], [0, 1], linestyle='dotted')
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title('ROC & AUC (Train)')
	plt.legend()
	plt.show()
def count_neg_and_pos(y_data):
    """Count the zero and non-zero entries of the numeric form of *y_data*.

    The labels are first passed through ``to_binary_numeric`` with its
    default arguments.

    Returns a ``(zero_count, nonzero_count)`` tuple. Going by the function
    name, zero presumably stands for the negative class and non-zero for
    the positive class - confirm against ``to_binary_numeric``.
    """
    encoded = to_binary_numeric(y_data)
    nonzero_total = np.count_nonzero(encoded)
    zero_total = len(encoded) - nonzero_total
    return zero_total, nonzero_total