def main(train_file_name, valid_file_name, test_file_name):
    """Fit a genetic-programming classifier and report split metrics.

    Loads the train/validation/test splits, trains a SymbolicClassifier,
    tunes the decision threshold on the validation probabilities, then
    prints accuracy, precision, recall and f-beta for train and validation.
    """
    X_train, y_train, X_validation, y_validation, X_test = \
        load_process_data(train_file_name, valid_file_name, test_file_name)

    gp_classifier = SymbolicClassifier(
        population_size=20,
        generations=65,
        tournament_size=3,
        const_range=None,
        init_depth=(4, 12),
        # Tiny but non-zero: keeps an (almost negligible) bloat penalty.
        parsimony_coefficient=1e-32,
        function_set=('add', 'sub', 'mul', 'div'),
        transformer='sigmoid',
        p_crossover=0.85,
        p_subtree_mutation=0.04,
        p_hoist_mutation=0.01,
        p_point_mutation=0.04,
        p_point_replace=0.005,
        max_samples=1.0,
        feature_names=None,
        warm_start=False,
        low_memory=True,
        n_jobs=8,
        verbose=1,
        random_state=None,
    )

    gp_classifier.fit(X_train, y_train)

    proba_validation = gp_classifier.predict_proba(X_validation)
    proba_train = gp_classifier.predict_proba(X_train)
    # Choose the probability cut-off on the validation split only.
    best_threshold = get_best_threshold(proba_validation, y_validation)

    y_train_pred = np.where(proba_train[:, 1] > best_threshold, 1, 0)
    y_val_pred = np.where(proba_validation[:, 1] > best_threshold, 1, 0)

    str_header = "$" * 78
    print(str_header)
    print(str_header)
    print('Train accuracy', accuracy_score(y_train, y_train_pred))
    print('Validation accuracy', accuracy_score(y_validation, y_val_pred))

    print('Train precision', precision_score(y_train, y_train_pred))
    print('Validation precision', precision_score(y_validation, y_val_pred))

    print('Train recall', recall_score(y_train, y_train_pred))
    print('Validation recall', recall_score(y_validation, y_val_pred))

    print('Train f-beta score', fbeta_score(y_train, y_train_pred, beta=0.25))
    validation_beta_score = fbeta_score(y_validation, y_val_pred, beta=0.25)
    print(f'Validation f-beta score {validation_beta_score}')
    print(str_header)
    print(str_header)
# Example #2
def test_symbolic_classifier():
    """Check that SymbolicClassifier example works"""

    random_state = check_random_state(0)
    dataset = load_breast_cancer()
    # Shuffle samples and labels with the same permutation.
    shuffle_idx = random_state.permutation(dataset.target.size)
    dataset.data = dataset.data[shuffle_idx]
    dataset.target = dataset.target[shuffle_idx]

    clf = SymbolicClassifier(parsimony_coefficient=.01,
                             feature_names=dataset.feature_names,
                             random_state=1)
    # Train on the first 400 samples, evaluate on the remainder.
    clf.fit(dataset.data[:400], dataset.target[:400])

    labels = dataset.target[400:]
    scores = clf.predict_proba(dataset.data[400:])[:, 1]
    assert_almost_equal(roc_auc_score(labels, scores), 0.96937869822485212)

    # The evolved program must render to this exact graphviz description.
    dot_data = clf._program.export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="div", fillcolor="#136ed4"] '
                ';\n2 [label="worst fractal dimension", fillcolor="#60a6f6"] '
                ';\n3 [label="mean concave points", fillcolor="#60a6f6"] '
                ';\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", fillcolor="#136ed4"] '
                ';\n5 [label="mean concave points", fillcolor="#60a6f6"] ;\n6 '
                '[label="area error", fillcolor="#60a6f6"] ;\n4 -> 6 ;\n4 -> '
                '5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
# Example #3
def main():
    """End-to-end example: preprocess, engineer features, fit a
    SymbolicClassifier and print the resulting profit on the test split."""
    seed = 0
    np.random.seed(seed)

    raw = Dataset('ml_project1_data.xlsx').rm_df
    target = raw['Response']
    features = raw.drop(columns='Response')
    training, testing, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42)

    # Re-attach the target so the processing steps see full frames.
    training['Response'] = y_train
    testing['Response'] = y_test

    processor = Processor(training, testing, seed=0)
    engineer = FeatureEngineer(processor.training, processor.unseen, seed=0)
    training = engineer.training
    testing = engineer.unseen

    est = SymbolicClassifier(generations=200, random_state=0)
    est.fit(training.drop('Response', axis=1), training['Response'])
    assess_generalization_auroc(est, testing, True)
    y_pred = est.predict_proba(testing.drop('Response', axis=1))[:, 1]
    y_true = testing['Response']
    print(profit(y_true, y_pred))

    # +++++++++++++++++ 5) modelling (disabled)
    # mlp_param_grid = {'mlpc__hidden_layer_sizes': [(3), (6), (3, 3), (5, 5)],
    #                   'mlpc__learning_rate_init': [0.001, 0.01]}
    # mlp_gscv = bayes_optimization_MLP(fe.training, mlp_param_grid, cv=5, seed=0)
    # print("Best parameter set: ", mlp_gscv.best_params_)
    #
    # +++++++++++++++++ 6) retraining & assessment of generalization ability
    # auprc, precision, recall = assess_generalization_auroc(
    #     mlp_gscv.best_estimator_, testing)
    # print("AUPRC: {:.2f}".format(auprc))

    plt.show()
import pandas as pd
from gplearn.genetic import SymbolicClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

if __name__ == '__main__':
    # Load pre-split, space-separated data; each file carries a "Target" column.
    train_set = pd.read_csv("training.txt", sep=" ")
    test_set = pd.read_csv("test.txt", sep=" ")

    x_train = train_set.drop("Target", axis=1)
    y_train = train_set["Target"]
    x_test = test_set.drop("Target", axis=1)
    y_test = test_set["Target"]

    est = SymbolicClassifier(parsimony_coefficient=.01,
                             stopping_criteria=0.01,
                             feature_names=list(x_train.columns.values),
                             random_state=3)

    est.fit(x_train, y_train)

    y_true = y_test
    # Probability of the positive class, used as a ranking score.
    y_score = est.predict_proba(x_test)[:, 1]

    # FIX: the metric printed is roc_auc_score, not accuracy — the old
    # "Accuracy:" label misreported what the number means.
    print("ROC AUC:", roc_auc_score(y_true, y_score), "Program:",
          est._program)