def exercise2():
    breast_cancer = pd.read_csv('breast_cancer.csv')
    X, y = split_dataset_transformed(breast_cancer, 'Class', ['?'])
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # 2.a: baseline random forest with default parameters
    clf = RandomForestClassifier()
    clf_stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
    print_dict(clf_stats, ['predicted'])

    # 2.b: vary the number of trees in the ensemble
    numb_trees = np.arange(10, 201, step=10)
    for trees in numb_trees:
        print('Experimenting with {} trees'.format(trees))
        clf = RandomForestClassifier(n_estimators=trees)
        clf_stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
        print_dict(clf_stats, ['predicted'])

    # 2.c: vary the maximum depth of each tree
    depths = np.arange(5, 20)
    for dep in depths:
        print('Experimenting with depth {}'.format(dep))
        clf = RandomForestClassifier(max_depth=dep)
        clf_stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
        print_dict(clf_stats, ['predicted'])
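# The helpers used throughout (split_train_test, classifier_statistics, ...)
# are defined elsewhere in the repo. For reference, a minimal sketch of what
# the two most-used ones are assumed to do, built on scikit-learn; the real
# implementations may differ (e.g. split ratio, stratification, or the exact
# keys of the returned dict). The *_sketch names are hypothetical.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


def split_train_test_sketch(X, y, test_size=0.3):
    # Plain random hold-out split; the project helper may also stratify.
    return train_test_split(X, y, test_size=test_size)


def classifier_statistics_sketch(clf, X_train, X_test, y_train, y_test):
    # Fit on the training split, predict on the test split, and collect the
    # measures the exercises print: accuracy plus the raw confusion matrix.
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return {
        'predicted': predicted,
        'accuracy': accuracy_score(y_test, predicted),
        'confusion_matrix': confusion_matrix(y_test, predicted),
    }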
def exercise3():
    credit = pd.read_csv('credit.csv')
    X, y = split_dataset_transformed(credit, 'class')
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Vary the minimum number of samples required to split a node, scoring on
    # both the training and the test set to expose overfitting.
    min_samples = np.arange(2, 11)
    for samples in min_samples:
        print('Experimenting with min_samples_split={}'.format(samples))

        print('Train data')
        clf = DecisionTreeClassifier(min_samples_split=samples)
        clf_stats = classifier_statistics(clf, X_train, X_train, y_train, y_train)
        print_dict(clf_stats, ['predicted'])

        print('Test data')
        clf = DecisionTreeClassifier(min_samples_split=samples)
        clf_stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
        print_dict(clf_stats, ['predicted'])

        print()
        print()

    # Same train/test comparison for random forests of increasing size.
    numb_trees = np.arange(10, 201, step=10)
    for trees in numb_trees:
        print('Experimenting with {} trees'.format(trees))

        print('Train data')
        clf = RandomForestClassifier(n_estimators=trees)
        clf_stats = classifier_statistics(clf, X_train, X_train, y_train, y_train)
        print_dict(clf_stats, ['predicted'])

        print('Test data')
        clf = RandomForestClassifier(n_estimators=trees)
        clf_stats = classifier_statistics(clf, X_train, X_test, y_train, y_test)
        print_dict(clf_stats, ['predicted'])

        print()
        print()
def naive_bayes():
    gnb = GaussianNB()
    for d in data:
        X, y = split_dataset_transformed(d[0], 'consensus')
        X_train, X_test, y_train, y_test = split_train_test(X, y)
        res = classifier_statistics(gnb, X_train, X_test, y_train, y_test)
        export_file(res, d[1], 'Naive bayes', "")
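# export_file is another repo helper whose definition is not shown here. A
# guessed minimal version that appends one row of results per run to a CSV
# named after the dataset; the actual helper's output format may differ, and
# the 'results/' directory and the 'accuracy' key are assumptions.
import csv


def export_file_sketch(res, dataset_name, clf_name, params):
    with open('results/{}.csv'.format(dataset_name), 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([clf_name, params, res['accuracy']])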
def compare_baseline_clf():
    neighbors = [3, 5, 10]
    estimators = [25, 50, 100]

    naive_bayes = [(GaussianNB(), 'Gaussian'),
                   (MultinomialNB(), 'Multinomial'),
                   (BernoulliNB(), 'Bernoulli')]
    knns = [(KNeighborsClassifier(n_neighbors=x),
             'K Nearest Neighbors {}'.format(x)) for x in neighbors]
    random_forests = [(RandomForestClassifier(n_estimators=x),
                       'Random Forest {}'.format(x)) for x in estimators]

    classifiers = {
        'Naive Bayes': naive_bayes,
        'K Nearest Neighbors': knns,
        'Random Forest': random_forests
    }

    CLASSIFIER = 'Classifier'
    measures_dict = {}
    i = 0
    for model_type in classifiers:
        for clf, parameter in classifiers[model_type]:
            res = classifier_statistics(clf, X_train, X_test, y_train, y_test)

            conf_matrix = res['confusion_matrix']
            score(conf_matrix)
            accuracy = res['accuracy']
            sensibility = res['sensibility']
            specificity = res['specificity']

            # One row per (classifier, measure) pair for the grouped bar plot.
            measures_dict[i] = {CLASSIFIER: parameter, 'Measure': 'Accuracy', 'Value': accuracy}
            i += 1
            measures_dict[i] = {CLASSIFIER: parameter, 'Measure': 'Sensibility', 'Value': sensibility}
            i += 1
            measures_dict[i] = {CLASSIFIER: parameter, 'Measure': 'Specificity', 'Value': specificity}
            i += 1

    measures = pd.DataFrame.from_dict(measures_dict, orient='index')
    measures.to_csv('plot_data/initial_results.csv')

    plt.figure(figsize=(22, 6))
    ax = sns.barplot(x=CLASSIFIER, y='Value', hue='Measure', data=measures)
    # plt.savefig('images/initial_results.pdf')
    plt.clf()
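# For a 2x2 confusion matrix in scikit-learn's layout [[TN, FP], [FN, TP]],
# the three measures plotted above reduce to the usual ratios. A sketch of the
# arithmetic the classifier_statistics helper is assumed to perform (the
# project calls sensitivity "sensibility"); this helper is illustrative, not
# part of the repo.
def measures_from_confusion_matrix(conf_matrix):
    tn, fp = conf_matrix[0]
    fn, tp = conf_matrix[1]
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sensibility = tp / (tp + fn)   # true positive rate (sensitivity / recall)
    specificity = tn / (tn + fp)   # true negative rate
    return accuracy, sensibility, specificity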
def knn():
    n_neighbors_values = [2, 3, 10]
    for d in data:
        for n in n_neighbors_values:
            X, y = split_dataset_transformed(d[0], 'consensus')
            X_train, X_test, y_train, y_test = split_train_test(X, y)
            neigh = KNeighborsClassifier(n_neighbors=n)
            res = classifier_statistics(neigh, X_train, X_test, y_train, y_test)
            export_file(res, d[1], 'KNN', 'k=' + str(n))
cost1 = 10    # cost of a false positive
cost2 = 500   # cost of a false negative

conf_matrix = results['confusion_matrix']
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
total_cost = cost1 * fp + cost2 * fn
print('Total cost achieved: {}'.format(total_cost))

# NOTE: probably something is wrong here, the results seem to be too good.
clf = KNeighborsClassifier(n_neighbors=10)
results = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
print_dict(results, excluded_keys=['predicted'])

cost1 = 10
cost2 = 500
conf_matrix = results['confusion_matrix']
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
total_cost = cost1 * fp + cost2 * fn
print('Total cost achieved: {}'.format(total_cost))
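# The same cost computation appears twice above; a small helper (hypothetical,
# not part of the repo) would keep the APS challenge costs in one place:
# 10 per false positive (unnecessary check) and 500 per false negative
# (missed faulty component).
def total_cost_from_confusion_matrix(conf_matrix, cost_fp=10, cost_fn=500):
    fp = conf_matrix[0][1]
    fn = conf_matrix[1][0]
    return cost_fp * fp + cost_fn * fn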
def naive_bayes_balanced():
    gnb = GaussianNB()
    for d in data:
        X_train, X_test, y_train, y_test, X_train_res, y_train_res = balance_dataset(d[0], 'consensus')
        res = classifier_statistics(gnb, X_train_res, X_test, y_train_res.ravel(), y_test.ravel())
        export_file(res, d[1], 'Naive bayes balanced', "")
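# balance_dataset is assumed to oversample the minority class on the training
# split only (the *_res outputs), e.g. with imbalanced-learn's SMOTE, while
# leaving the test split untouched so the evaluation stays honest. A sketch
# under that assumption; the repo's actual resampling strategy may differ.
from imblearn.over_sampling import SMOTE


def balance_dataset_sketch(df, target):
    X, y = split_dataset_transformed(df, target)
    X_train, X_test, y_train, y_test = split_train_test(X, y)
    X_train_res, y_train_res = SMOTE().fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test, X_train_res, y_train_res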
clf_RF = RandomForestClassifier(n_estimators=800, max_depth=4)
X_train, X_test, y_train, y_test, X_train_res, y_train_res = balance_dataset(data[0][0], 'consensus')
res = classifier_statistics(clf_RF, X_train, X_test, y_train, y_test)
pprint(res)
print_dict(clf_stats, ['predicted'])
print()
print()

diabetes = pd.read_csv('diabetes.csv')
X, y = split_dataset_transformed(diabetes, 'class')
X_train, X_test, y_train, y_test = split_train_test(X, y)

# 3.a: discretise each feature into 10 one-hot encoded bins and compare a
# random forest on the binned data against one on the raw data.
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)
X_binned = X_binned.toarray()
X_binned_train, X_binned_test, y_binned_train, y_binned_test = split_train_test(X_binned, y)

print('With discretization')
clf1 = RandomForestClassifier()
clf_stats = classifier_statistics(clf1, X_binned_train, X_binned_test, y_binned_train, y_binned_test)
print_dict(clf_stats, ['predicted'])

print('Without discretization')
clf2 = RandomForestClassifier()
clf_stats = classifier_statistics(clf2, X_train, X_test, y_train, y_test)
print_dict(clf_stats, ['predicted'])
aps_train = aps_train.dropna()
aps_test = aps_test.dropna()

X_train, y_train = split_dataset(aps_train, 'class')
X_test, y_test = split_dataset(aps_test, 'class')

y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})

X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

neigh = KNeighborsClassifier(n_neighbors=10)
res = classifier_statistics(neigh, X_train, X_test, y_train, y_test)
pprint(res)

"""
Recorded results:

k=2   accuracy: 0.936  sensibility: 0.5
k=3   accuracy: 0.946  sensibility: 0.658
k=10  accuracy: 0.892  sensibility: 0.224
"""