def sgd_run(X, y, splits, loss):
    """Evaluate SGDClassifier hyper-parameter candidates over CV splits.

    For every parameter combination returned by
    sgd_randomized_search(X, y, loss), trains and tests the classifier on
    each (train_index, test_index) pair in `splits`, collects per-split
    metrics, and writes both a text report and an .xls summary sheet.

    Args:
        X: feature matrix, indexable by the integer index arrays in `splits`.
        y: label vector; the positive class for precision/recall/F1 is -1.
        splits: re-iterable sequence of (train_index, test_index) pairs
            (it is traversed once per candidate).
        loss: SGDClassifier loss name, also embedded in the output filenames.

    Side effects:
        Writes outputs/results/test/sdg_<loss>_results.txt (the "sdg" typo
        in the filename is kept so existing consumers of the file keep
        working) and outputs/results/test/SGD_<loss>_test_results.xls.
    """
    book = xlwt.Workbook(encoding="utf-8")
    sheet1 = book.add_sheet("SGD")
    # Header row ("Apha"/"Class weigtht" typos from the old version fixed).
    headers = ["Loss", "Penalty", "Alpha", "L1 ratio", "Learning rate",
               "Warm start", "Class weight", "Precision", "Recall",
               "F1-score", "Geometric mean", "True Positive Rate",
               "False Positive Rate", "False Negative Rate"]
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    sgd_params = sgd_randomized_search(X, y, loss)
    res = []
    report_path = os.path.join("outputs", "results", "test",
                               "sdg_" + str(loss) + "_results.txt")
    # Truncate any report left over from a previous run.
    with open(report_path, "w") as f:
        pass

    def _mean_spread(values):
        # Mean and 2 * standard deviation, both rounded. The text report
        # labels the second value "VARIANCE" for historical reasons, but it
        # is actually twice the standard deviation.
        return round(np.mean(values), 2), round(np.var(values) ** 0.5 * 2, 2)

    for result in sgd_params:
        # result[0] is assumed to be the loss; tuned parameters start at [1].
        penalty, alpha, l1_ratio, learning_rate, warm_start, class_weight = \
            result[1:7]
        precision_list, recall_list, f1_list, gmean_list = [], [], [], []
        TPR, FPR, FNR = [], [], []
        for train_index, test_index in splits:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Scale using training-fold statistics only (no test leakage).
            scaler = StandardScaler().fit(X_train)
            X_train_scaled = scaler.transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            clf = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha,
                                l1_ratio=l1_ratio,
                                learning_rate=learning_rate,
                                warm_start=warm_start,
                                class_weight=class_weight, random_state=42)
            clf.fit(X_train_scaled, y_train)
            y_pred = clf.predict(X_test_scaled)
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            # TPR = tp/(tp+fn) is the sensitivity (recall); the previous
            # comment incorrectly called it specificity.
            TPR.append(round(float(tp) / (tp + fn), 2))
            FPR.append(round(float(fp) / (fp + tn), 2))
            FNR.append(round(float(fn) / (tp + fn), 2))
            precision_list.append(round(precision_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            recall_list.append(round(recall_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            f1_list.append(round(f1_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            gmean_list.append(round(geometric_mean(y_test, y_pred), 2))
        TPR_mean, TPR_var = _mean_spread(TPR)
        FPR_mean, FPR_var = _mean_spread(FPR)
        FNR_mean, FNR_var = _mean_spread(FNR)
        precision_mean, precision_var = _mean_spread(precision_list)
        recall_mean, recall_var = _mean_spread(recall_list)
        f1_mean, f1_var = _mean_spread(f1_list)
        gmean_mean, gmean_var = _mean_spread(gmean_list)
        res.append([loss, penalty, alpha, l1_ratio, learning_rate, warm_start,
                    class_weight,
                    (precision_mean, precision_var),
                    (recall_mean, recall_var), (f1_mean, f1_var),
                    (gmean_mean, gmean_var), (TPR_mean, TPR_var),
                    (FPR_mean, FPR_var), (FNR_mean, FNR_var)])
        with open(report_path, "a") as f:
            f.write("loss={0}, penalty={1}, alpha={2}, l1 ratio={3}, "
                    "learning rate={4}, warm start={5}, class weight={6}\n"
                    .format(loss, penalty, alpha, l1_ratio, learning_rate,
                            warm_start, str(class_weight)))
            # One uniformly-formatted line per metric; the output text is
            # byte-identical to the previous hand-written f.write calls.
            for label, scores, mean, spread in [
                    ("PRECISION", precision_list, precision_mean,
                     precision_var),
                    ("RECALL", recall_list, recall_mean, recall_var),
                    ("F1", f1_list, f1_mean, f1_var),
                    ("GEOMETRIC MEAN", gmean_list, gmean_mean, gmean_var),
                    ("TRUE POSITIVE RATE", TPR, TPR_mean, TPR_var),
                    ("FALSE POSITIVE RATE", FPR, FPR_mean, FPR_var),
                    ("FALSE NEGATIVE RATE", FNR, FNR_mean, FNR_var)]:
                f.write("{0} SCORES: {1}, {0} MEAN: {2}, {0} VARIANCE: {3}\n"
                        .format(label, scores, mean, spread))
            f.write("----------------------------------------------------------\n")
    for row, entry in enumerate(res):
        for col, value in enumerate(entry):
            # Columns 0-5 hold primitives xlwt can store directly; the
            # class-weight dict and the (mean, spread) tuples are stringified.
            sheet1.write(row + 1, col, value if col < 6 else str(value))
    book.save(os.path.join("outputs", "results", "test",
                           "SGD_" + str(loss) + "_test_results.xls"))
def svm_run(X, y, splits):
    """Evaluate SVC hyper-parameter candidates over the given CV splits.

    Each candidate produced by svm_randomized_search(X, y) is trained and
    tested on every (train_index, test_index) pair in `splits`; per-split
    metrics and their mean/spread are appended to a text report and
    collected into an .xls summary workbook.

    Side effects: writes outputs/results/test/svm_results.txt and
    outputs/results/test/SVM_test_results.xls.
    """
    book = xlwt.Workbook(encoding="utf-8")
    sheet1 = book.add_sheet("SVM")
    column_titles = ("Kernel", "C", "Gamma", "Class weight",
                     "Decision_function_shape", "Precision", "Recall",
                     "F1-score", "Geometric mean", "True Positive Rate",
                     "False Positive Rate", "False Negative Rate")
    for col, title in enumerate(column_titles):
        sheet1.write(0, col, title)

    svm_params = svm_randomized_search(X, y)
    res = []
    report_path = os.path.join("outputs", "results", "test",
                               "svm_results.txt")
    # Start from an empty report file for this run.
    with open(report_path, "w") as f:
        pass

    def _summarize(values):
        # (rounded mean, rounded 2*std); the report calls the second value
        # "VARIANCE" but it is twice the standard deviation.
        return round(np.mean(values), 2), round(np.var(values) ** 0.5 * 2, 2)

    for candidate in svm_params:
        kernel, C, gamma, class_weight, decision_function_shape = \
            candidate[:5]
        precision_list = []
        recall_list = []
        f1_list = []
        gmean_list = []
        TPR = []
        FPR = []
        FNR = []
        for train_index, test_index in splits:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Scaling parameters come from the training fold only.
            scaler = StandardScaler().fit(X_train)
            clf = SVC(kernel=kernel, C=C, gamma=gamma,
                      class_weight=class_weight,
                      decision_function_shape=decision_function_shape,
                      random_state=42)
            clf.fit(scaler.transform(X_train), y_train)
            y_pred = clf.predict(scaler.transform(X_test))
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            TPR.append(round(float(tp) / (tp + fn), 2))  # sensitivity
            FPR.append(round(float(fp) / (fp + tn), 2))
            FNR.append(round(float(fn) / (tp + fn), 2))
            precision_list.append(round(precision_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            recall_list.append(round(recall_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            f1_list.append(round(f1_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            gmean_list.append(round(geometric_mean(y_test, y_pred), 2))
        TPR_mean, TPR_var = _summarize(TPR)
        FPR_mean, FPR_var = _summarize(FPR)
        FNR_mean, FNR_var = _summarize(FNR)
        precision_mean, precision_var = _summarize(precision_list)
        recall_mean, recall_var = _summarize(recall_list)
        f1_mean, f1_var = _summarize(f1_list)
        gmean_mean, gmean_var = _summarize(gmean_list)
        res.append([kernel, C, gamma, class_weight, decision_function_shape,
                    (precision_mean, precision_var),
                    (recall_mean, recall_var), (f1_mean, f1_var),
                    (gmean_mean, gmean_var), (TPR_mean, TPR_var),
                    (FPR_mean, FPR_var), (FNR_mean, FNR_var)])
        with open(report_path, "a") as f:
            f.write("kernel={0}, C={1}, gamma={2}, class_weight={3}, "
                    "decision_function_shape={4}\n".format(
                        kernel, C, gamma, str(class_weight),
                        decision_function_shape))
            # Uniform per-metric lines; output bytes match the previous
            # hand-written sequence of f.write calls.
            for label, scores, mean, spread in (
                    ("PRECISION", precision_list, precision_mean,
                     precision_var),
                    ("RECALL", recall_list, recall_mean, recall_var),
                    ("F1", f1_list, f1_mean, f1_var),
                    ("GEOMETRIC MEAN", gmean_list, gmean_mean, gmean_var),
                    ("TRUE POSITIVE RATE", TPR, TPR_mean, TPR_var),
                    ("FALSE POSITIVE RATE", FPR, FPR_mean, FPR_var),
                    ("FALSE NEGATIVE RATE", FNR, FNR_mean, FNR_var)):
                f.write("{0} SCORES: {1}, {0} MEAN: {2}, {0} VARIANCE: {3}\n"
                        .format(label, scores, mean, spread))
            f.write("----------------------------------------------------------\n")
    for row, entry in enumerate(res):
        for col, cell in enumerate(entry):
            # class_weight (col 3) and the metric tuples (cols 5+) must be
            # stringified for xlwt; kernel/C/gamma/shape are stored directly.
            sheet1.write(row + 1, col,
                         cell if col in (0, 1, 2, 4) else str(cell))
    book.save(os.path.join("outputs", "results", "test",
                           "SVM_test_results.xls"))
# NOTE(review): loose script fragment — it relies on names defined outside
# this view (X_train, X_test, y_train, y_test and the running accumulators
# gmean, precision_mal, recall_mal, f1_mal, f_scores), so it presumably sits
# inside a cross-validation loop elsewhere in the file — TODO confirm.
# Standardize features with statistics taken from the training portion only.
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)
# Single entropy-criterion decision tree with fixed depth/leaf constraints.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                     max_depth=10, min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)
y_pred = clf_entropy.predict(X_test)
# Chained assignment: conf_matrix also holds the flattened counts array.
tn, fp, fn, tp = conf_matrix = confusion_matrix(y_test, y_pred).ravel()
print "(TN, FP, FN, TP) = {}".format((tn, fp, fn, tp))  # Python 2 print
# Accumulate percentage metrics for the class labeled -1 (pos_label=-1).
gmean += geometric_mean(y_test, y_pred) * 100
precision_mal += precision_score(
    y_test, y_pred, average="binary", pos_label=-1) * 100
recall_mal += recall_score(y_test, y_pred, average="binary", pos_label=-1) * 100
f1_mal += f1_score(y_test, y_pred, average='binary', pos_label=-1) * 100
f_scores.append(
    f1_score(y_test, y_pred, average='binary', pos_label=-1) * 100)
# accuracy = accuracy + accuracy_score(y_test,y_pred)*100
# precision_ben = precision_ben + precision_score(y_test, y_pred, average="binary", pos_label=1)*100
# precision_avg = precision_avg + precision_score(y_test, y_pred, average="weighted", labels=[1, -1])*100
# recall_ben = recall_ben + recall_score(y_test, y_pred, average="binary", pos_label=1)*100
# recall_avg = recall_avg + recall_score(y_test, y_pred, average="weighted", labels=[1, -1])*100
# f1_ben = f1_ben + f1_score(y_test, y_pred, average='binary', pos_label=1)*100
# f1_avg = f1_avg + f1_score(y_test, y_pred, average="weighted", labels=[1, -1])*100
def decision_tree_run(X, y, splits):
    """Evaluate DecisionTreeClassifier candidates over the given CV splits.

    Each candidate from decision_tree_randomized_search(X, y) is trained and
    tested on every (train_index, test_index) pair in `splits`; per-split
    metrics and their mean/spread are appended to a text report and
    collected into an .xls summary workbook.

    Side effects: writes outputs/results/test/decision_tree_results.txt and
    outputs/results/test/Decision_tree_test_results.xls.
    """
    book = xlwt.Workbook(encoding="utf-8")
    sheet1 = book.add_sheet("Decision Tree")
    column_titles = ("Criterion", "Min samples split", "Max depth",
                     "Min samples leaf", "Max leaf nodes", "Splitter",
                     "Max features", "Class weight", "Precision", "Recall",
                     "F1-score", "Geometric mean", "True Positive Rate",
                     "False Positive Rate", "False Negative Rate")
    for col, title in enumerate(column_titles):
        sheet1.write(0, col, title)

    decision_tree_params = decision_tree_randomized_search(X, y)
    res = []
    report_path = os.path.join("outputs", "results", "test",
                               "decision_tree_results.txt")
    # Start from an empty report file for this run.
    with open(report_path, "w") as f:
        pass

    def _summarize(values):
        # (rounded mean, rounded 2*std); the report calls the second value
        # "VARIANCE" but it is twice the standard deviation.
        return round(np.mean(values), 2), round(np.var(values) ** 0.5 * 2, 2)

    for candidate in decision_tree_params:
        (criterion, min_samples_split, max_depth, min_samples_leaf,
         max_leaf_nodes, splitter, max_features, class_weight) = candidate[:8]
        precision_list = []
        recall_list = []
        f1_list = []
        gmean_list = []
        TPR = []
        FPR = []
        FNR = []
        for train_index, test_index in splits:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # No feature scaling here: decision trees are scale-invariant.
            clf = DecisionTreeClassifier(criterion=criterion,
                                         max_depth=max_depth,
                                         max_features=max_features,
                                         max_leaf_nodes=max_leaf_nodes,
                                         min_samples_leaf=min_samples_leaf,
                                         min_samples_split=min_samples_split,
                                         class_weight=class_weight,
                                         splitter=splitter,
                                         random_state=42)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            TPR.append(round(float(tp) / (tp + fn), 2))  # sensitivity
            FPR.append(round(float(fp) / (fp + tn), 2))
            FNR.append(round(float(fn) / (tp + fn), 2))
            precision_list.append(round(precision_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            recall_list.append(round(recall_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            f1_list.append(round(f1_score(
                y_test, y_pred, average="binary", pos_label=-1), 2))
            gmean_list.append(round(geometric_mean(y_test, y_pred), 2))
        TPR_mean, TPR_var = _summarize(TPR)
        FPR_mean, FPR_var = _summarize(FPR)
        FNR_mean, FNR_var = _summarize(FNR)
        precision_mean, precision_var = _summarize(precision_list)
        recall_mean, recall_var = _summarize(recall_list)
        f1_mean, f1_var = _summarize(f1_list)
        gmean_mean, gmean_var = _summarize(gmean_list)
        res.append([criterion, min_samples_split, max_depth,
                    min_samples_leaf, max_leaf_nodes, splitter, max_features,
                    class_weight,
                    (precision_mean, precision_var),
                    (recall_mean, recall_var), (f1_mean, f1_var),
                    (gmean_mean, gmean_var), (TPR_mean, TPR_var),
                    (FPR_mean, FPR_var), (FNR_mean, FNR_var)])
        with open(report_path, "a") as f:
            # Spacing inside this format string ("splitter = ",
            # "max_features= ", trailing space) is kept exactly as before.
            f.write("criterion={0}, min_samples_split={1}, max_depth={2}, "
                    "min_samples_leaf={3}, max_leaf_nodes={4}, "
                    "splitter = {5}, max_features= {6}, class_weight={7} \n"
                    .format(criterion, min_samples_split, max_depth,
                            min_samples_leaf, max_leaf_nodes, splitter,
                            max_features, str(class_weight)))
            # Uniform per-metric lines; output bytes match the previous
            # hand-written sequence of f.write calls.
            for label, scores, mean, spread in (
                    ("PRECISION", precision_list, precision_mean,
                     precision_var),
                    ("RECALL", recall_list, recall_mean, recall_var),
                    ("F1", f1_list, f1_mean, f1_var),
                    ("GEOMETRIC MEAN", gmean_list, gmean_mean, gmean_var),
                    ("TRUE POSITIVE RATE", TPR, TPR_mean, TPR_var),
                    ("FALSE POSITIVE RATE", FPR, FPR_mean, FPR_var),
                    ("FALSE NEGATIVE RATE", FNR, FNR_mean, FNR_var)):
                f.write("{0} SCORES: {1}, {0} MEAN: {2}, {0} VARIANCE: {3}\n"
                        .format(label, scores, mean, spread))
            f.write("----------------------------------------------------------\n")
    for row, entry in enumerate(res):
        for col, cell in enumerate(entry):
            # Columns 0-6 hold primitives xlwt can store directly; the
            # class-weight dict and the metric tuples are stringified.
            sheet1.write(row + 1, col, cell if col < 7 else str(cell))
    book.save(os.path.join("outputs", "results", "test",
                           "Decision_tree_test_results.xls"))