Example #1
def knn_variation(x_train, y_train):
    result_df_uniform = pd.DataFrame()
    result_df_distance = pd.DataFrame()
    foldnum = 0
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # model_selection.KFold is the current API.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        # DataFrame.as_matrix() was removed from pandas; use .to_numpy().
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Sweep k for both weighting schemes.
        for n in range(1, 25):
            for w in ["uniform", "distance"]:
                clf = neighbors.KNeighborsClassifier(n_neighbors=n, weights=w)
                clf.fit(tr_data, tr_targets)
                prediction = clf.predict(val_data)
                accuracy = metrics.accuracy_score(val_targets, prediction)
                if w == "uniform":
                    result_df_uniform.loc[
                        foldnum, "neighbours={0}".format(n)] = accuracy
                elif w == "distance":
                    result_df_distance.loc[
                        foldnum, "neighbours={0}".format(n)] = accuracy

    return result_df_uniform, result_df_distance
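
All of the snippets on this page are excerpts from a larger project: pd, the scikit-learn modules, helper.folds_to_split, and (later) FINAL_ALGOS are defined elsewhere. A minimal sketch of the preamble they appear to assume, with the helper body guessed purely from how it is called (hypothetical, not the project's actual code):

import pandas as pd
from sklearn import (ensemble, linear_model, metrics, model_selection,
                     neighbors, svm, tree)


def folds_to_split(data, targets, train_idx, val_idx):
    # Slice the feature and target DataFrames into train/validation folds
    # by row position, matching the call sites on this page.
    return [data.iloc[train_idx], data.iloc[val_idx],
            targets.iloc[train_idx], targets.iloc[val_idx]]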
Example #3
def dtree_variations(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Tuned tree vs. out-of-the-box defaults.
        clf_best = tree.DecisionTreeClassifier(max_depth=7,
                                               min_samples_leaf=10)
        clf_best.fit(tr_data, tr_targets)
        clf_default = tree.DecisionTreeClassifier()
        clf_default.fit(tr_data, tr_targets)

        prediction_default = clf_default.predict(val_data)
        prediction_best = clf_best.predict(val_data)
        accuracy_default = metrics.accuracy_score(val_targets,
                                                  prediction_default)
        accuracy_best = metrics.accuracy_score(val_targets, prediction_best)
        result_df.loc[foldnum, "default setting"] = accuracy_default
        result_df.loc[foldnum, "best setting"] = accuracy_best

    return result_df
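
Each of these functions returns a DataFrame with one row per fold and one column per setting, so the per-column mean is the cross-validated accuracy. A quick way to read off the winner (assuming x_train and y_train are already loaded):

result_df = dtree_variations(x_train, y_train)
print(result_df.mean())           # mean validation accuracy per setting
print(result_df.mean().idxmax())  # name of the best-performing column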
Example #11
def tenfold_cross_validation(x_train, y_train, classifiers):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # dict.iteritems() is Python 2 only; .items() works everywhere.
        for classifier_name, clf in classifiers.items():
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, classifier_name] = accuracy
    return result_df
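
tenfold_cross_validation takes ready-made estimator instances keyed by a display name; a sketch of a typical call (the names and hyperparameters here are illustrative):

classifiers = {
    "kNN (k=11)": neighbors.KNeighborsClassifier(n_neighbors=11),
    "RBF SVM": svm.SVC(kernel="rbf", C=1.0),
    "logistic": linear_model.LogisticRegression(max_iter=1000),
}
scores = tenfold_cross_validation(x_train, y_train, classifiers)
print(scores.mean())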
Example #12
def ada_boost_variation(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Sweep the ensemble size.
        for n in [5, 10, 50, 100, 500, 1000]:
            clf = ensemble.AdaBoostClassifier(n_estimators=n)
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, "n_estimators={0}".format(n)] = accuracy
    return result_df
Example #14
def majority_vote(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # VotingClassifier wants (name, estimator) pairs.
        final_estimators = list(FINAL_ALGOS.items())
        clf = ensemble.VotingClassifier(estimators=final_estimators)
        clf.fit(tr_data, tr_targets)
        prediction = clf.predict(val_data)
        accuracy = metrics.accuracy_score(val_targets, prediction)
        result_df.loc[foldnum, "Voting"] = accuracy
    return result_df
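
FINAL_ALGOS is a module-level dict that is not shown on this page. Judging from how majority_vote uses it, any mapping of name to unfitted classifier would do; a hypothetical stand-in:

# Hypothetical stand-in for the project's FINAL_ALGOS; the real entries
# are defined elsewhere in the source module.
FINAL_ALGOS = {
    "logreg": linear_model.LogisticRegression(max_iter=1000),
    "dtree": tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=10),
    "knn": neighbors.KNeighborsClassifier(n_neighbors=11),
}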
Example #15
def perceptron_variation(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Perceptron's n_iter was renamed max_iter; tol=None disables early
        # stopping so each model really runs the requested number of epochs.
        for n in [1, 5, 10, 50, 100, 500, 1000]:
            clf = linear_model.Perceptron(max_iter=n, tol=None)
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, "max_iter={0}".format(n)] = accuracy
    return result_df
Example #16
def boosting_cv(x_train, y_train, classifiers):
    base_learner = tree.DecisionTreeClassifier()
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        for classifier_name, classifier in classifiers.items():
            # base_estimator was renamed estimator in scikit-learn 1.2.
            clf = classifier(n_estimators=50, estimator=base_learner)
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, classifier_name] = accuracy
    return result_df
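
Unlike tenfold_cross_validation, boosting_cv expects classifier classes rather than instances, since it instantiates each one itself around the shared decision-tree base learner. A sketch with two ensembles that accept an estimator argument (scikit-learn >= 1.2):

boosters = {
    "AdaBoost": ensemble.AdaBoostClassifier,
    "Bagging": ensemble.BaggingClassifier,
}
boost_scores = boosting_cv(x_train, y_train, boosters)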
Example #17
def lr_variation(x_train, y_train):
    result_df1 = pd.DataFrame()
    result_df2 = pd.DataFrame()
    weight_row_list = []

    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # L1 regularization (liblinear is a solver that supports the l1
        # penalty; it was also the default in older scikit-learn versions).
        for C in [0.1, 1, 5, 10, 100, 10**3]:
            clf = linear_model.LogisticRegression(C=C, penalty="l1",
                                                  solver="liblinear")
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df1.loc[foldnum, "C={0}".format(C)] = accuracy
            weight_dict = dict(zip(x_train.columns, clf.coef_[0]))
            weight_row_list.append(weight_dict)

        # L2 regularization (same solver, for a like-for-like comparison).
        for C in [0.1, 1, 5, 10, 100, 10**3]:
            clf = linear_model.LogisticRegression(C=C, penalty="l2",
                                                  solver="liblinear")
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df2.loc[foldnum, "C={0}".format(C)] = accuracy
            weight_dict = dict(zip(x_train.columns, clf.coef_[0]))
            weight_row_list.append(weight_dict)

    # One row of coefficients per fitted model.
    weight_df = pd.DataFrame(weight_row_list)

    return result_df1, result_df2, weight_df
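
lr_variation returns one accuracy table per penalty plus the per-fit coefficient rows; putting the two penalties side by side makes the comparison easy to scan (a usage sketch):

acc_l1, acc_l2, weights = lr_variation(x_train, y_train)
print(pd.concat({"L1": acc_l1.mean(), "L2": acc_l2.mean()}, axis=1))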
Example #19
def svm_variation(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Sweep C over several orders of magnitude.
        for C in [
                0.1, 0.3, 0.5, 0.7, 1, 2, 3, 4, 5, 10, 100, 10**3, 10**5,
                10**7, 10**9
        ]:
            clf = svm.SVC(kernel="rbf", C=C)
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, "C={0}".format(C)] = accuracy
    return result_df
Example #20
def Random_forest_variation(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kfold.split(x_train):
        foldnum += 1
        [tr_data, val_data, tr_targets,
         val_targets] = helper.folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()

        # Tuned forest...
        clf = ensemble.RandomForestClassifier(max_depth=6, n_estimators=50)
        clf.fit(tr_data, tr_targets)
        prediction = clf.predict(val_data)
        accuracy = metrics.accuracy_score(val_targets, prediction)
        result_df.loc[foldnum, "best"] = accuracy

        # ...vs. default settings.
        clf = ensemble.RandomForestClassifier()
        clf.fit(tr_data, tr_targets)
        prediction = clf.predict(val_data)
        accuracy = metrics.accuracy_score(val_targets, prediction)
        result_df.loc[foldnum, "default"] = accuracy
    return result_df