def knn_variation(x_train, y_train):
    """10-fold CV accuracy of k-nearest-neighbours for k = 1..24.

    Evaluates both the 'uniform' and 'distance' weighting schemes on every
    fold and returns a pair of DataFrames ``(uniform_df, distance_df)``:
    one row per fold, one column per neighbour count.
    """
    result_df_uniform = pd.DataFrame()
    result_df_distance = pd.DataFrame()
    foldnum = 0
    for train, val in cross_validation.KFold(len(x_train), shuffle=True,
                                             n_folds=10, random_state=0):
        foldnum += 1
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()
        for n in range(1, 25):
            for w in ["uniform", "distance"]:
                clf = neighbors.KNeighborsClassifier(n_neighbors=n, weights=w)
                clf.fit(tr_data, tr_targets)
                prediction = clf.predict(val_data)
                # accuracy_score signature is (y_true, y_pred); accuracy is
                # symmetric so the value is unchanged from the original.
                accuracy = metrics.accuracy_score(val_targets, prediction)
                # Column-label typo fixed: was "neigbhours={0}".
                col = "neighbours={0}".format(n)
                if w == "uniform":
                    result_df_uniform.loc[foldnum, col] = accuracy
                else:
                    result_df_distance.loc[foldnum, col] = accuracy
    return result_df_uniform, result_df_distance
def dtree_variations(x_train, y_train):
    """Compare a hand-tuned decision tree against the default settings.

    Runs 10-fold cross-validation and returns a DataFrame with one row per
    fold and the columns "default setting" / "best setting".
    NOTE(review): an identical definition appears later in this file and
    shadows this one at import time.
    """
    scores = pd.DataFrame()
    fold = 0
    kfold = cross_validation.KFold(len(x_train), shuffle=True, n_folds=10,
                                   random_state=0)
    for train_idx, val_idx in kfold:
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        # Fit both variants on the same training fold (tuned first, then
        # default, matching the original call order).
        tuned = tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=10)
        tuned.fit(tr_x, tr_y)
        default = tree.DecisionTreeClassifier()
        default.fit(tr_x, tr_y)
        pred_default = default.predict(val_x)
        pred_tuned = tuned.predict(val_x)
        scores.loc[fold, "default setting"] = metrics.accuracy_score(
            pred_default, val_y)
        scores.loc[fold, "best setting"] = metrics.accuracy_score(
            pred_tuned, val_y)
    return scores
def dtree_variations(x_train, y_train):
    """Measure 10-fold CV accuracy of a tuned vs. a default decision tree.

    Returns a DataFrame indexed by fold number with the columns
    "default setting" and "best setting".
    """
    accuracies = pd.DataFrame()
    fold_id = 0
    folds = cross_validation.KFold(len(x_train), shuffle=True, n_folds=10,
                                   random_state=0)
    for train_ix, val_ix in folds:
        fold_id += 1
        split = helper.folds_to_split(x_train, y_train, train_ix, val_ix)
        tr_x, val_x, tr_y, val_y = split
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        # Same fit/predict order as the original (tuned fit first) so any
        # global-RNG consumption is unchanged.
        best_tree = tree.DecisionTreeClassifier(max_depth=7,
                                                min_samples_leaf=10)
        best_tree.fit(tr_x, tr_y)
        plain_tree = tree.DecisionTreeClassifier()
        plain_tree.fit(tr_x, tr_y)
        plain_pred = plain_tree.predict(val_x)
        best_pred = best_tree.predict(val_x)
        accuracies.loc[fold_id, "default setting"] = metrics.accuracy_score(
            plain_pred, val_y)
        accuracies.loc[fold_id, "best setting"] = metrics.accuracy_score(
            best_pred, val_y)
    return accuracies
def lr_variation(x_train, y_train):
    """10-fold CV of logistic regression over a grid of C for L1 and L2.

    Returns ``(result_df1, result_df2, weight_df)``: the first two frames
    hold accuracies (one row per fold, one column per C) for the L1 and L2
    penalties respectively; ``weight_df`` stacks the fitted coefficient
    vectors (one row per fold x penalty x C, columns = x_train's columns).
    """
    c_grid = [0.1, 1, 5, 10, 100, 10**3]
    result_df1 = pd.DataFrame()  # L1 accuracies
    result_df2 = pd.DataFrame()  # L2 accuracies
    frame_for = {"l1": result_df1, "l2": result_df2}
    weight_row_list = []
    foldnum = 0
    for train, val in cross_validation.KFold(len(x_train), shuffle=True,
                                             n_folds=10, random_state=0):
        foldnum += 1
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()
        # The original had two copy-pasted loops (L1 then L2); this nested
        # loop is equivalent and keeps weight_row_list's row order intact.
        for penalty in ["l1", "l2"]:
            for C in c_grid:
                clf = linear_model.LogisticRegression(C=C, penalty=penalty)
                clf.fit(tr_data, tr_targets)
                prediction = clf.predict(val_data)
                accuracy = metrics.accuracy_score(val_targets, prediction)
                frame_for[penalty].loc[
                    foldnum, "C={0}".format(C)] = accuracy
                # coef_[0]: the single coefficient row of a binary problem
                # (presumably binary targets — confirm against callers).
                weight_row_list.append(
                    dict(zip(x_train.columns, clf.coef_[0])))
    weight_df = pd.DataFrame.from_dict(weight_row_list)
    return result_df1, result_df2, weight_df
def tenfold_cross_validation(x_train, y_train, classifiers):
    """Score every estimator in ``classifiers`` (name -> estimator) via
    10-fold cross-validation.

    Returns a DataFrame with one row per fold and one column per
    classifier name.
    """
    result_df = pd.DataFrame()
    fold = 0
    kf = cross_validation.KFold(len(x_train), shuffle=True, n_folds=10,
                                random_state=0)
    for train_idx, val_idx in kf:
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        for name, estimator in classifiers.iteritems():
            estimator.fit(tr_x, tr_y)
            predicted = estimator.predict(val_x)
            result_df.loc[fold, name] = metrics.accuracy_score(
                predicted, val_y)
    return result_df
def perceptron_variation(x_train, y_train):
    """10-fold CV accuracy of a Perceptron for several iteration counts.

    Returns a DataFrame with one row per fold and one "n_iter=..." column
    per tried value.
    """
    result_df = pd.DataFrame()
    fold = 0
    iteration_counts = [1, 5, 10, 50, 100, 500, 1000]
    for train_idx, val_idx in cross_validation.KFold(
            len(x_train), shuffle=True, n_folds=10, random_state=0):
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        for count in iteration_counts:
            model = linear_model.Perceptron(n_iter=count)
            model.fit(tr_x, tr_y)
            acc = metrics.accuracy_score(model.predict(val_x), val_y)
            result_df.loc[fold, "n_iter={0}".format(count)] = acc
    return result_df
def svm_variation(x_train, y_train):
    """Sweep the RBF-kernel SVM penalty C over 10-fold cross-validation.

    NOTE(review): an identical definition appears later in this file and
    shadows this one at import time.
    """
    c_grid = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5,
              10, 100, 10**3, 10**5, 10**7, 10**9]
    result_df = pd.DataFrame()
    fold = 0
    for train_idx, val_idx in cross_validation.KFold(
            len(x_train), shuffle=True, n_folds=10, random_state=0):
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        for penalty in c_grid:
            model = svm.SVC(kernel="rbf", C=penalty)
            model.fit(tr_x, tr_y)
            acc = metrics.accuracy_score(model.predict(val_x), val_y)
            result_df.loc[fold, "C={0}".format(penalty)] = acc
    return result_df
def majority_vote(x_train, y_train):
    """10-fold CV accuracy of a voting ensemble built from FINAL_ALGOS.

    FINAL_ALGOS is a module-level mapping of estimator name -> estimator;
    each fold fits a fresh VotingClassifier over all of them and records
    its validation accuracy in the "Voting" column (one row per fold).
    """
    result_df = pd.DataFrame()
    foldnum = 0
    for train, val in cross_validation.KFold(len(x_train), shuffle=True,
                                             n_folds=10, random_state=0):
        foldnum += 1
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()
        # items() yields the same (name, estimator) pairs, in the same
        # order, as the original keys()-then-index comprehension.
        final_estimators = list(FINAL_ALGOS.items())
        clf = ensemble.VotingClassifier(estimators=final_estimators)
        clf.fit(tr_data, tr_targets)
        prediction = clf.predict(val_data)
        accuracy = metrics.accuracy_score(val_targets, prediction)
        result_df.loc[foldnum, "Voting"] = accuracy
    return result_df
def ada_boost_variation(x_train, y_train):
    """10-fold CV accuracy of AdaBoost for several ensemble sizes.

    Returns a DataFrame with one row per fold and one "n estimator=..."
    column per tried size.
    """
    result_df = pd.DataFrame()
    fold = 0
    ensemble_sizes = [5, 10, 50, 100, 500, 1000]
    for train_idx, val_idx in cross_validation.KFold(
            len(x_train), shuffle=True, n_folds=10, random_state=0):
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        for size in ensemble_sizes:
            booster = ensemble.AdaBoostClassifier(n_estimators=size)
            booster.fit(tr_x, tr_y)
            acc = metrics.accuracy_score(booster.predict(val_x), val_y)
            result_df.loc[fold, "n estimator={0}".format(size)] = acc
    return result_df
def boosting_cv(x_train, y_train, classifiers):
    """Score each boosting meta-estimator class via 10-fold CV.

    ``classifiers`` maps a display name to an ensemble class; every fold
    instantiates each class with 50 estimators around a shared decision-tree
    base learner (sklearn ensembles clone the base estimator, so sharing
    the instance is safe) and records validation accuracy per name.
    """
    base_tree = tree.DecisionTreeClassifier()
    result_df = pd.DataFrame()
    fold = 0
    for train_idx, val_idx in cross_validation.KFold(
            len(x_train), shuffle=True, n_folds=10, random_state=0):
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        for name, booster_cls in classifiers.iteritems():
            booster = booster_cls(n_estimators=50, base_estimator=base_tree)
            booster.fit(tr_x, tr_y)
            acc = metrics.accuracy_score(booster.predict(val_x), val_y)
            result_df.loc[fold, "{0}".format(name)] = acc
    return result_df
def Random_forest_variation(x_train, y_train):
    """Compare a constrained random forest against default settings.

    10-fold CV; returns a DataFrame with one row per fold and the columns
    "best" (max_depth=6, 50 trees) and "default".
    """
    result_df = pd.DataFrame()
    fold = 0
    for train_idx, val_idx in cross_validation.KFold(
            len(x_train), shuffle=True, n_folds=10, random_state=0):
        fold += 1
        tr_x, val_x, tr_y, val_y = helper.folds_to_split(
            x_train, y_train, train_idx, val_idx)
        tr_y = tr_y.as_matrix().ravel()
        val_y = val_y.as_matrix().ravel()
        # Tuned forest first, then the out-of-the-box one, matching the
        # original fit/predict order.
        variants = [
            ("best", ensemble.RandomForestClassifier(max_depth=6,
                                                     n_estimators=50)),
            ("default", ensemble.RandomForestClassifier()),
        ]
        for label, forest in variants:
            forest.fit(tr_x, tr_y)
            acc = metrics.accuracy_score(forest.predict(val_x), val_y)
            result_df.loc[fold, label] = acc
    return result_df
def knn_variation(x_train, y_train):
    """Grid 10-fold CV accuracy of KNN over n_neighbors = 1..24 for the
    'uniform' and 'distance' weighting schemes.

    Returns ``(uniform_df, distance_df)``: one row per fold, one column
    per neighbour count.
    """
    result_df_uniform = pd.DataFrame()
    result_df_distance = pd.DataFrame()
    # Route each accuracy to the frame matching its weighting scheme.
    frames = {"uniform": result_df_uniform, "distance": result_df_distance}
    foldnum = 0
    for train, val in cross_validation.KFold(len(x_train), shuffle=True,
                                             n_folds=10, random_state=0):
        foldnum += 1
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()
        for n in range(1, 25):
            for w in ["uniform", "distance"]:
                clf = neighbors.KNeighborsClassifier(n_neighbors=n, weights=w)
                clf.fit(tr_data, tr_targets)
                accuracy = metrics.accuracy_score(val_targets,
                                                  clf.predict(val_data))
                # Column-label typo fixed: was "neigbhours={0}".
                frames[w].loc[foldnum, "neighbours={0}".format(n)] = accuracy
    return result_df_uniform, result_df_distance
def svm_variation(x_train, y_train):
    """10-fold CV accuracy of an RBF-kernel SVM across a wide C range.

    Returns a DataFrame with one row per fold and one "C=..." column per
    tried penalty value.
    """
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = cross_validation.KFold(len(x_train), shuffle=True, n_folds=10,
                                   random_state=0)
    penalty_values = [0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5,
                      10, 100, 10**3, 10**5, 10**7, 10**9]
    for train, val in kfold:
        foldnum += 1
        split = helper.folds_to_split(x_train, y_train, train, val)
        tr_data, val_data, tr_targets, val_targets = split
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()
        for c_value in penalty_values:
            classifier = svm.SVC(kernel="rbf", C=c_value)
            classifier.fit(tr_data, tr_targets)
            score = metrics.accuracy_score(classifier.predict(val_data),
                                           val_targets)
            result_df.loc[foldnum, "C={0}".format(c_value)] = score
    return result_df