def main():
    """Experiment 2: hyper-parameter grid search.

    Evaluates BinaryTreeClassifier over a grid of (max_depth,
    min_samples_leaf) values with 2-fold CV, averaging accuracy across all
    data sets, folds, and active classifier types, then prints the grid as
    a table (columns = min_samples_leaf, rows = max_depth).
    """
    data_sets = ["binary/balance-scale.csv", "binary/tic-tac-toe.csv",
                 "binary/ionosphere.csv", "multi/splice.csv",
                 "multi/glass.csv", "multi/segment.csv"]
    folds = 2
    classifier_types = {}
    max_depth = list(range(5, 50 + 1, 5))
    min_samples_leaf = list(range(5, 50 + 1, 5))
    # Accumulated mean accuracy for each (max_depth, min_samples_leaf) cell.
    test_result = numpy.zeros((len(max_depth), len(min_samples_leaf)))
    for data in data_sets:
        set_data, set_target = functions.get_data_from_csv(data)
        kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
        for i in range(len(max_depth)):
            for j in range(len(min_samples_leaf)):
                classifier_types["dt"] = BinaryTreeClassifier(
                    max_depth=max_depth[i],
                    min_samples_leaf=min_samples_leaf[j])
                # Alternative classifiers kept for experiment toggling:
                # classifier_types["dtc"] = DecisionTreeClassifier(max_depth=max_depth[i], min_samples_leaf=min_samples_leaf[j])
                # classifier_types["rf"] = RandomForest(max_depth=max_depth[i], min_samples_leaf=min_samples_leaf[j])
                # classifier_types["rfc"] = RandomForestClassifier(max_depth=max_depth[i], min_samples_leaf=min_samples_leaf[j])
                # classifier_types["knn"] = KNeighborsClassifier(n_neighbors=max_depth[i], leaf_size=min_samples_leaf[j])
                for c_type, classifier in classifier_types.items():
                    result = []
                    for train, test in kf:
                        classifier.fit(set_data[train],
                                       set_target[train].ravel())
                        pred = classifier.predict(set_data[test])
                        result.append(functions.predictions_to_values(
                            set_target[test], pred))
                    average_accuracy = sum(
                        metrics.accuracy_score(true_vals, pred_vals)
                        for true_vals, pred_vals in result
                    ) / float(len(result))
                    # Each cell averages over data sets and classifier types.
                    test_result[i][j] += (average_accuracy
                                          / len(data_sets)
                                          / len(classifier_types))
    # Header row: min_samples_leaf values.
    header = " ".join(["{0:5}".format("")]
                      + ["{0:<5d}".format(v) for v in min_samples_leaf])
    print(header)
    # One row per max_depth value.
    for i, depth in enumerate(max_depth):
        row = " ".join(["{0:>5d}".format(depth)]
                       + ["{0:<5.3f}".format(v) for v in test_result[i]])
        print(row)
def main():
    """Experiment: 10-fold CV accuracy of every classifier on every data set.

    For each data set, fits each classifier (ours: dt/rf; scikit-learn's:
    dtc/rfc/kNN) on each fold and prints the mean fold accuracy.
    """
    data_sets = ["binary/breast-w.csv", "binary/diabetes.csv",
                 "binary/labor.csv", "multi/iris.csv",
                 "multi/letter.csv", "multi/vehicle.csv"]
    # Data set for experiment 3
    # data_sets = ["raop.csv"]
    folds = 10
    dt = BinaryTreeClassifier(max_depth=30, min_samples_leaf=10)
    dtc = DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
    rf = RandomForest(max_depth=40, min_samples_leaf=15)
    rfc = RandomForestClassifier(max_depth=50, min_samples_leaf=5)
    kNN = KNeighborsClassifier(n_neighbors=10, leaf_size=10)
    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc,
                        "kNN": kNN}
    for i in data_sets:
        set_data, set_target = functions.get_data_from_csv(i)
        kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
        print("\n{0}{1}".format("Data set: ", i))
        print("{0:5}".format("Accuracy:"))
        for c_type, classifier in classifier_types.items():
            # One (true, predicted) pair per fold.
            result = []
            for train, test in kf:
                classifier.fit(set_data[train], set_target[train].ravel())
                pred = classifier.predict(set_data[test])
                result.append(functions.predictions_to_values(
                    set_target[test], pred))
            # Mean accuracy across folds.
            average_acc = sum(
                metrics.accuracy_score(true_vals, pred_vals)
                for true_vals, pred_vals in result
            ) / float(len(result))
            print("{0:5}{1:5.3f}".format(c_type, average_acc))
def main():
    """Experiment 3: pairwise McNemar mid-p tests on the "raop" data set.

    Counts each classifier's total correct predictions over 10-fold CV,
    then compares every unordered classifier pair with mcnemar_midp.
    """
    import itertools

    folds = 10
    dt = BinaryTreeClassifier(max_depth=30, min_samples_leaf=10)
    dtc = DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
    rf = RandomForest(max_depth=40, min_samples_leaf=15)
    rfc = RandomForestClassifier(max_depth=50, min_samples_leaf=5)
    kNN = KNeighborsClassifier(n_neighbors=10, leaf_size=10)
    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc,
                        "kNN": kNN}
    results = {}
    set_data, set_target = functions.get_data_from_csv("raop.csv")
    kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
    for c_type, classifier in classifier_types.items():
        correct = 0
        for train, test in kf:
            classifier.fit(set_data[train], set_target[train].ravel())
            prob = classifier.predict(set_data[test])
            true_set, pred_set = functions.predictions_to_values(
                set_target[test], prob)
            # Count samples predicted correctly in this fold.
            correct += true_set[true_set == pred_set].size
        results[c_type] = correct
    # combinations() yields each unordered pair exactly once, replacing the
    # original hand-rolled "processed" bookkeeping.
    # NOTE(review): mcnemar_midp is fed total correct counts rather than the
    # discordant-pair counts McNemar's test is usually defined on — confirm
    # against the mcnemar_midp implementation.
    for (name_a, count_a), (name_b, count_b) in itertools.combinations(
            results.items(), 2):
        print("{0:>7} vs {1:7} {2:<5.6f}".format(
            name_a, name_b, mcnemar_midp(count_a, count_b)))
def main():
    """Experiment 1: our tree/forest vs scikit-learn's on one data set.

    Runs 10-fold CV for each classifier (dt/rf are our implementations,
    dtc/rfc scikit-learn's), reporting mean accuracy, precision, recall,
    AUC, and train/test times, then Wilcoxon signed-rank tests between the
    matching classifier pairs.
    """
    # Active data set; swap in any of the alternatives below.
    set_data, set_target = functions.get_data_from_csv("multi/glass.csv")
    # Multi-class alternatives: multi/iris.csv, multi/letter.csv,
    #   multi/segment.csv, multi/splice.csv, multi/waveform-5000.csv,
    #   multi/vehicle.csv
    # Binary alternatives: binary/balance-scale.csv,
    #   binary/breast-cancer.csv, binary/breast-w.csv, binary/credit-a.csv,
    #   binary/credit-g.csv, binary/diabetes.csv, binary/haberman.csv,
    #   binary/heart-c.csv, binary/heart-h.csv, binary/heart-s.csv,
    #   binary/hepatitis.csv, binary/ionosphere.csv, binary/kr-vs-kp.csv,
    #   binary/labor.csv, binary/liver-disorders.csv, binary/mushroom.csv,
    #   binary/sick.csv, binary/sonar.csv, binary/spambase.csv,
    #   binary/tic-tac-toe.csv

    # dt, rf are our implementations
    dt = BinaryTreeClassifier()
    dtc = DecisionTreeClassifier()
    rf = RandomForest()
    rfc = RandomForestClassifier()
    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc}
    printed = False  # print our tree structure only once
    folds = 10
    kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
    accuracy_result = {}  # per-classifier list of per-fold accuracies
    for c_type, classifier in classifier_types.items():
        result = []  # (true, predicted) pair per fold
        prob = []    # per-fold class-probability arrays
        average_accuracy = 0.0
        average_precision = 0.0
        average_recall = 0.0
        average_auc = 0.0
        average_train_time = 0.0
        average_test_time = 0.0
        print(c_type)
        for train, test in kf:
            train_set = set_data[train]
            train_class = set_target[train]
            test_set = set_data[test]
            test_class = set_target[test]
            timer = time()
            classifier.fit(train_set, train_class.ravel())
            if c_type == 'dt' and not printed:
                classifier.print_tree_wrapper()
                printed = True
            average_train_time += time() - timer
            timer = time()
            pred = classifier.predict(test_set)
            prob.append(classifier.predict_proba(test_set))
            average_test_time += time() - timer
            result.append(functions.predictions_to_values(test_class, pred))
        accuracy_result[c_type] = []
        for res in range(len(result)):
            true_vals, pred_vals = result[res]
            fold_accuracy = metrics.accuracy_score(true_vals, pred_vals)
            average_accuracy += fold_accuracy
            accuracy_result[c_type].append(fold_accuracy)
            # NOTE(review): multiclass precision/recall without an `average`
            # argument relies on old scikit-learn defaults — confirm with
            # the pinned version.
            average_precision += metrics.precision_score(true_vals, pred_vals)
            average_recall += metrics.recall_score(true_vals, pred_vals)
            unique = numpy.unique(true_vals)
            proba = numpy.array(prob[res])
            max_proba = numpy.array([max(d) for d in proba])
            if len(unique) > 2:
                # One-vs-rest AUC summed over classes, normalized once per
                # fold. BUG FIX: the original divided the *running total*
                # inside this loop (average_auc /= unique.size / 2.0),
                # repeatedly shrinking earlier folds' contributions.
                fold_auc = 0.0
                for cls in unique:
                    fpr, tpr, thresholds = metrics.roc_curve(
                        true_vals, max_proba, pos_label=int(cls))
                    fold_auc += metrics.auc(fpr, tpr)
                # NOTE(review): normalizer kept as unique.size / 2.0 from
                # the original; confirm the intended scaling (a plain mean
                # would divide by unique.size).
                average_auc += fold_auc / (unique.size / 2.0)
            else:
                fpr, tpr, thresholds = metrics.roc_curve(
                    true_vals, max_proba)
                average_auc += metrics.auc(fpr, tpr)
        n = float(len(result))
        print("Accuracy:  {0:.3f}".format(average_accuracy / n))
        print("Precision:  {0:.3f}".format(average_precision / n))
        print("Recall:  {0:.3f}".format(average_recall / n))
        print("Auc:  {0:.3f}".format(average_auc / n))
        print("")
        print("Training time:  {0}".format(average_train_time / n))
        print("Test time:  {0}".format(average_test_time / n))
    # Paired significance tests across the per-fold accuracies.
    print("Wilcoxon Decision Tree Classifiers:  {0}".format(
        wilcoxon(accuracy_result['dt'], accuracy_result['dtc'])))
    print("Wilcoxon Random Forest Classifiers:  {0}".format(
        wilcoxon(accuracy_result['rf'], accuracy_result['rfc'])))