skip_first_row=21, y_column_index=0, assignedColumnNames=APS_FULL_COLUMNS, missingSymbol='na', needImpute=True, dropOrNot=True) X_test, y_test = load_data(ROOT_PATH + APS_TEST, skip_first_row=21, y_column_index=0, assignedColumnNames=APS_FULL_COLUMNS, missingSymbol='na', needImpute=True, dropOrNot=True) y_train = to_binary_numeric(y_train, classNeg="neg") y_test = to_binary_numeric(y_test, classNeg="neg") randForestClf = RandomForestClassifier(n_estimators=50, random_state=2333, oob_score=True) randForestClf.fit(X_train, y_train) y_predict = randForestClf.predict(X_test) falsePositiveRate, truePositiveRate, thresholds = roc_curve( y_test, y_predict) # compute Area Under the Curve (AUC) using the trapezoidal rule area = auc(falsePositiveRate, truePositiveRate) plt.plot(falsePositiveRate, truePositiveRate, color='red',
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 5:30 PM'

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data, to_binary_numeric

# Script: load the APS training set, binarise the label column and draw a
# heat map of the pairwise correlations between the label and the features.
if __name__ == "__main__":
    # Let pandas print up to 100 columns of a frame.
    pd.set_option('display.max_columns', 100)

    # Options forwarded unchanged to the project loader.
    load_options = dict(
        skip_first_row=21,
        y_column_index=0,
        assignedColumnNames=APS_FULL_COLUMNS,
        missingSymbol='na',
        needImpute=True,
        dropOrNot=True,
    )
    X_data, y_data = load_data(ROOT_PATH + APS_TRAIN, **load_options)
    y_data = to_binary_numeric(y_data, classNeg="neg")

    # Label column first, then the features, side by side.
    data = pd.concat([y_data, X_data], axis=1)
    correlation = data.corr()

    fig = plt.figure(figsize=(20, 15))
    blues = sns.color_palette("Blues")
    sns.heatmap(correlation, vmin=-1, vmax=1, cmap=blues)
    plt.show()
    # Turning dropOrNot on raises an error, because 10 columns of the data
    # contain NaN.
# Logistic model tree (Weka LMT) evaluated on the SMOTE test data:
# 5-fold cross-validation report first, then a ROC curve.
log_tree = Classifier(classname="weka.classifiers.trees.LMT")
eval_smote_test_obj = Evaluation(smote_test_data)
eval_smote_test_obj.crossvalidate_model(
    classifier=log_tree,
    data=smote_test_data,
    num_folds=5,
    rnd=Random(1),
)

smote_test_cv_error = eval_smote_test_obj.percent_incorrect
print("SMOTE Test CV (5-folds) Error = %.2f%%" % (smote_test_cv_error))
print(eval_smote_test_obj.matrix())
print("=================\"Summary\"====================")
print(eval_smote_test_obj.summary())

# Fit on the full SMOTE test data and predict that same data.
log_tree.build_classifier(smote_test_data)
y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data)

# Only the first 500 labels are compared against the predictions.
y_test = to_binary_numeric(y_test.head(500), classNeg="neg")
smote_test_roc = roc_curve(y_test, y_predict)
falsePositiveRate, truePositiveRate, thresholds = smote_test_roc

# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)

smote_test_roc_label = 'ROC = ' + str(area)
plt.plot(
    falsePositiveRate,
    truePositiveRate,
    color='red',
    label=smote_test_roc_label,
)
# Dotted diagonal as the visual reference line.
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC & AUC (SMOTE Test)')
plt.legend()
# Logistic model tree (Weka LMT) evaluated on the SMOTE training data:
# 5-fold cross-validation report first, then a ROC curve.
log_tree = Classifier(classname="weka.classifiers.trees.LMT")
eval_smote_train_obj = Evaluation(smote_train_data)
eval_smote_train_obj.crossvalidate_model(
    classifier=log_tree,
    data=smote_train_data,
    num_folds=5,
    rnd=Random(1),
)

smote_train_cv_error = eval_smote_train_obj.percent_incorrect
print("SMOTE Train CV (5-folds) Error = %.2f%%" % (smote_train_cv_error))
print(eval_smote_train_obj.matrix())
print("=================\"Summary\"====================")
print(eval_smote_train_obj.summary())

# Fit on the full SMOTE training data and predict that same data.
log_tree.build_classifier(smote_train_data)
y_predict = eval_smote_train_obj.test_model(log_tree, smote_train_data)

y_train_smote = to_binary_numeric(y_train_smote, classNeg="neg")
# The ROC is computed with label 0 treated as the positive class.
smote_train_roc = roc_curve(y_train_smote, y_predict, pos_label=0)
falsePositiveRate, truePositiveRate, thresholds = smote_train_roc

# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)

smote_train_roc_label = 'ROC = ' + str(area)
plt.plot(
    falsePositiveRate,
    truePositiveRate,
    color='red',
    label=smote_train_roc_label,
)
# Dotted diagonal as the visual reference line.
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC & AUC (SMOTE Train)')
# Write the prepared training frame to CSV and read it back through
# convert.load_any_file, then flag the first column as the class attribute.
export_train_data.to_csv(GENERATED_TRAIN_DATA_FILE_PATH, sep=',', index=False)
train_data = convert.load_any_file(filename=GENERATED_TRAIN_DATA_FILE_PATH)
train_data.class_is_first()

# load logistic model tree algorithm
log_tree = Classifier(classname="weka.classifiers.trees.LMT")
eval_train_obj = Evaluation(train_data)

# One fold count drives both the cross-validation call and the printed
# message, so the report can no longer disagree with what was actually run
# (the message used to claim 10 folds while 5 were used).
cv_folds = 5
eval_train_obj.crossvalidate_model(
    classifier=log_tree,
    data=train_data,
    num_folds=cv_folds,
    rnd=Random(1),
)
print("Train CV (%d-folds) Error = %.2f%%"
      % (cv_folds, eval_train_obj.percent_incorrect))
print(eval_train_obj.matrix())
print("=================\"Summary\"====================")
print(eval_train_obj.summary())

# Fit on the full training data and predict that same data.
log_tree.build_classifier(train_data)
y_predict = eval_train_obj.test_model(log_tree, train_data)

# NOTE(review): only the first 500 labels are used - presumably
# export_train_data holds the first 500 rows; confirm against the code that
# builds it, otherwise roc_curve receives inputs of different lengths.
y_train = to_binary_numeric(y_train.head(500), classNeg="neg")
# The ROC is computed with label 0 treated as the positive class.
falsePositiveRate, truePositiveRate, thresholds = roc_curve(
    y_train, y_predict, pos_label=0)

# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)

plt.plot(falsePositiveRate, truePositiveRate, color='red',
         label='ROC = ' + str(area))
# Dotted diagonal as the visual reference line.
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC & AUC (Train)')
plt.legend()
plt.show()
def count_neg_and_pos(y_data):
    """Count the zero and non-zero entries of the binarised labels.

    ``y_data`` is passed through ``to_binary_numeric`` with its default
    arguments, and the result is split into entries equal to zero and
    entries different from zero.

    Returns a ``(zero_count, nonzero_count)`` tuple. Going by the function
    name, zero is presumably the negative class and non-zero the positive
    one - verify against ``to_binary_numeric``.
    """
    binary_labels = to_binary_numeric(y_data)
    nonzero_total = np.count_nonzero(binary_labels)
    zero_total = len(binary_labels) - nonzero_total
    return zero_total, nonzero_total