def run_once(df_feature, df_label):
    """Train a decision tree on RFE-selected features and persist it.

    Pipeline: one-hot encode features, 70/30 split, over-sample the
    training split, select the top ``best_nof_feature`` features with RFE,
    fit the tree, report accuracy, then dump the model and the selected
    column names to disk.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.

    NOTE(review): a second ``run_once`` defined later in this file shadows
    this one at import time — confirm which variant callers expect.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over-sample only the training split so the test set stays untouched
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy',
                                     min_samples_split=20,
                                     random_state=99)
    # FIX: pass n_features_to_select by keyword — positional use was
    # deprecated in scikit-learn 0.24 and removed in 1.0 (TypeError there).
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)
    # print the selected feature names (rfe.support_ is a boolean mask
    # over the one-hot-encoded columns)
    cols = list(df_ohe.columns)
    support_mask = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = support_mask[support_mask].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
# Ejemplo n.º 2
# 0
def run_once(df_feature, df_label):
    """Train a logistic-regression model on RFE-selected features and persist it.

    Pipeline: one-hot encode features, 70/30 split, over-sample the
    training split, select the top ``best_nof_feature`` features with RFE,
    fit the model, threshold class-1 probabilities at ``lg_threshold``,
    report accuracy, then dump the model and the selected column names.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.

    NOTE(review): this def shadows the earlier ``run_once`` in this file —
    confirm which variant callers expect.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over-sample only the training split so the test set stays untouched
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    # FIX: pass n_features_to_select by keyword — positional use was
    # deprecated in scikit-learn 0.24 and removed in 1.0 (TypeError there).
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict class-1 probability, then apply the custom decision threshold
    # (presumably this is a binary problem — column 1 is the positive class)
    test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
    test_y_predict_prob = test_y_predict_probs[:, 1]
    prob_df = pd.DataFrame(test_y_predict_prob)
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y,
                 prob_df['predict'], labels)
    # print the selected feature names (rfe.support_ is a boolean mask
    # over the one-hot-encoded columns)
    cols = list(df_ohe.columns)
    support_mask = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = support_mask[support_mask].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
# Ejemplo n.º 3
# 0
def build_random_forest(df_feature, df_label):
    """Fit a 1000-tree random forest and report accuracy on a held-out 30% split.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.
    """
    encoded = dynamic_get_dummies(df_feature)
    split = train_test_split(encoded, df_label, test_size=0.3, random_state=99)
    train_x, test_x, train_y, test_y = split
    # balance classes on the training split only
    train_x, train_y = over_sample(train_x, train_y)
    forest = RandomForestClassifier(n_estimators=1000)
    forest.fit(train_x, train_y)
    predictions = forest.predict(test_x)
    get_accuracy("random forest", test_y, predictions, df_label.unique())
# Ejemplo n.º 4
# 0
def build_nearest_neighbors(df_feature, df_label):
    """Fit a k-nearest-neighbors classifier and report accuracy on a 30% split.

    Uses the module-level ``num_of_neighbors`` and ``knn_weights`` settings.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.
    """
    encoded = dynamic_get_dummies(df_feature)
    split = train_test_split(encoded, df_label, test_size=0.3, random_state=99)
    train_x, test_x, train_y, test_y = split
    # balance classes on the training split only
    train_x, train_y = over_sample(train_x, train_y)
    model = KNeighborsClassifier(num_of_neighbors, weights=knn_weights)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    get_accuracy("nearest neighbors", test_y, predictions, df_label.unique())
# Ejemplo n.º 5
# 0
def build_ada_boosting(df_feature, df_label):
    """Fit a 1000-estimator AdaBoost classifier and report accuracy on a 30% split.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.
    """
    encoded = dynamic_get_dummies(df_feature)
    split = train_test_split(encoded, df_label, test_size=0.3, random_state=99)
    train_x, test_x, train_y, test_y = split
    # balance classes on the training split only
    train_x, train_y = over_sample(train_x, train_y)
    booster = AdaBoostClassifier(n_estimators=1000)
    booster.fit(train_x, train_y)
    predictions = booster.predict(test_x)
    get_accuracy("ada boosting", test_y, predictions, df_label.unique())
# Ejemplo n.º 6
# 0
def run_rfe(df_feature, df_label):
    """Sweep the RFE feature count for logistic regression and plot P/R curves.

    For each candidate feature count 1..max_feature_try_numbers: select
    features with RFE, fit logistic regression, threshold class-1
    probabilities at ``lg_threshold``, and collect class-1 precision/recall.
    Finally plots the precision/recall curves against the feature count.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.

    NOTE(review): a second ``run_rfe`` defined later in this file shadows
    this one at import time — confirm which variant callers expect.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over-sample only the training split so the test set stays untouched
    train_x, train_y = over_sample(train_x, train_y)

    # labels are loop-invariant: compute once instead of per iteration
    labels = df_label.unique()
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    # iterate the candidate counts directly instead of indexing by range(len(...))
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        # FIX: pass n_features_to_select by keyword — positional use was
        # deprecated in scikit-learn 0.24 and removed in 1.0.
        rfe = RFE(lg_regression, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)
        # predict class-1 probability, then apply the custom decision threshold
        test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
        test_y_predict_prob = test_y_predict_probs[:, 1]
        prob_df = pd.DataFrame(test_y_predict_prob)
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'],
            labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'logistic regression')
def run_rfe(df_feature, df_label):
    """Sweep the RFE feature count for a decision tree and plot P/R curves.

    For each candidate feature count 1..max_feature_try_numbers: select
    features with RFE, fit the decision tree, predict on the test split, and
    collect class-1 precision/recall. Finally plots the precision/recall
    curves against the feature count.

    Args:
        df_feature: raw feature DataFrame (encoded via dynamic_get_dummies).
        df_label: label Series aligned with ``df_feature``.

    NOTE(review): this def shadows the earlier ``run_rfe`` in this file —
    confirm which variant callers expect.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over-sample only the training split so the test set stays untouched
    train_x, train_y = over_sample(train_x, train_y)

    # labels are loop-invariant: compute once instead of per iteration
    labels = df_label.unique()
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    # iterate the candidate counts directly instead of indexing by range(len(...))
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        dc_tree = DecisionTreeClassifier(criterion='entropy',
                                         min_samples_split=20,
                                         random_state=99)
        # FIX: pass n_features_to_select by keyword — positional use was
        # deprecated in scikit-learn 0.24 and removed in 1.0.
        rfe = RFE(dc_tree, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)
        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'decision tree')