def main(train_file_name, valid_file_name, test_file_name):
    """Train a genetic-programming classifier and print train/validation metrics.

    Loads the three splits via ``load_process_data``, fits a gplearn
    ``SymbolicClassifier``, selects a probability threshold on the
    validation set, then prints accuracy, precision, recall and
    f-beta (beta=0.25) for both the train and validation splits.

    Parameters
    ----------
    train_file_name, valid_file_name, test_file_name : str
        Paths handed straight to ``load_process_data``.
    """
    # X_test is loaded for interface parity with load_process_data but is
    # not evaluated here (no test labels are available in this function).
    X_train, y_train, X_validation, y_validation, X_test = \
        load_process_data(train_file_name, valid_file_name, test_file_name)

    # parsimony_coefficient=1e-32: effectively disables the program-size
    # penalty while keeping it strictly positive.
    gp_classifier = SymbolicClassifier(population_size=20,
                                       generations=65,
                                       tournament_size=3,
                                       const_range=None,
                                       init_depth=(4, 12),
                                       parsimony_coefficient=1e-32,
                                       function_set=('add', 'sub', 'mul', 'div'),
                                       transformer='sigmoid',
                                       p_crossover=0.85,
                                       p_subtree_mutation=0.04,
                                       p_hoist_mutation=0.01,
                                       p_point_mutation=0.04,
                                       p_point_replace=0.005,
                                       max_samples=1.0,
                                       feature_names=None,
                                       warm_start=False,
                                       low_memory=True,
                                       n_jobs=8,
                                       verbose=1,
                                       random_state=None)
    gp_classifier.fit(X_train, y_train)

    y_val_proba = gp_classifier.predict_proba(X_validation)
    y_train_proba = gp_classifier.predict_proba(X_train)

    # Pick the cut-off on validation data, then apply the same threshold
    # to both splits so the reported metrics are comparable.
    best_threshold = get_best_threshold(y_val_proba, y_validation)
    y_train_pred = np.where(y_train_proba[:, 1] > best_threshold, 1, 0)
    y_val_pred = np.where(y_val_proba[:, 1] > best_threshold, 1, 0)

    str_header = "$" * 78
    print(str_header)
    print(str_header)
    print('Train accuracy', accuracy_score(y_train, y_train_pred))
    print('Validation accuracy', accuracy_score(y_validation, y_val_pred))
    print('Train precision', precision_score(y_train, y_train_pred))
    print('Validation precision', precision_score(y_validation, y_val_pred))
    print('Train recall', recall_score(y_train, y_train_pred))
    print('Validation recall', recall_score(y_validation, y_val_pred))
    print('Train f-beta score', fbeta_score(y_train, y_train_pred, beta=0.25))
    validation_beta_score = fbeta_score(y_validation, y_val_pred, beta=0.25)
    print(f'Validation f-beta score {validation_beta_score}')
    print(str_header)
    print(str_header)
def test_symbolic_classifier():
    """Check that SymbolicClassifier example works"""
    # Deterministically shuffle the breast-cancer dataset.
    rand = check_random_state(0)
    dataset = load_breast_cancer()
    order = rand.permutation(dataset.target.size)
    dataset.data = dataset.data[order]
    dataset.target = dataset.target[order]

    # Fit on the first 400 shuffled samples, score on the remainder.
    clf = SymbolicClassifier(parsimony_coefficient=.01,
                             feature_names=dataset.feature_names,
                             random_state=1)
    clf.fit(dataset.data[:400], dataset.target[:400])

    holdout_labels = dataset.target[400:]
    holdout_scores = clf.predict_proba(dataset.data[400:])[:, 1]
    assert_almost_equal(roc_auc_score(holdout_labels, holdout_scores),
                        0.96937869822485212)

    # The evolved program must export exactly this graphviz source.
    dot_data = clf._program.export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="div", fillcolor="#136ed4"] '
                ';\n2 [label="worst fractal dimension", fillcolor="#60a6f6"] '
                ';\n3 [label="mean concave points", fillcolor="#60a6f6"] '
                ';\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", fillcolor="#136ed4"] '
                ';\n5 [label="mean concave points", fillcolor="#60a6f6"] ;\n6 '
                '[label="area error", fillcolor="#60a6f6"] ;\n4 -> 6 ;\n4 -> '
                '5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
def main():
    """Run the end-to-end modelling pipeline.

    Loads the project workbook, splits it into train/test, applies the
    project's preprocessing and feature-engineering stages, fits a
    ``SymbolicClassifier``, reports the generalization AUROC plot and
    prints the profit metric on the held-out split.
    """
    seed = 0
    np.random.seed(seed)

    df = Dataset('ml_project1_data.xlsx').rm_df
    y = df['Response']
    X = df.drop(columns='Response')

    # NOTE(review): the split uses random_state=42 while `seed` is 0 —
    # presumably intentional, but worth confirming for reproducibility.
    training, testing, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    # Re-attach the label so downstream stages receive complete frames.
    training['Response'] = y_train
    testing['Response'] = y_test

    pr = Processor(training, testing, seed=0)
    fe = FeatureEngineer(pr.training, pr.unseen, seed=0)
    training = fe.training
    testing = fe.unseen

    est = SymbolicClassifier(generations=200, random_state=0)
    est.fit(training.drop('Response', axis=1), training['Response'])

    assess_generalization_auroc(est, testing, True)

    y_pred = est.predict_proba(testing.drop('Response', axis=1))[:, 1]
    y_true = testing['Response']
    print(profit(y_true, y_pred))

    plt.show()
import pandas as pd
from gplearn.genetic import SymbolicClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle  # NOTE(review): imported but unused here

if __name__ == '__main__':
    # Load space-separated train/test files; "Target" is the label column.
    train_set = pd.read_csv("training.txt", sep=" ")
    test_set = pd.read_csv("test.txt", sep=" ")
    x_train = train_set.drop("Target", axis=1)
    y_train = train_set["Target"]
    x_test = test_set.drop("Target", axis=1)
    y_test = test_set["Target"]

    est = SymbolicClassifier(parsimony_coefficient=.01,
                             stopping_criteria=0.01,
                             feature_names=list(x_train.columns.values),
                             random_state=3)
    est.fit(x_train, y_train)

    y_true = y_test
    y_score = est.predict_proba(x_test)[:, 1]
    # Fix: the reported metric is ROC AUC (area under the ROC curve),
    # not accuracy — label the output correctly.
    print("ROC AUC:", roc_auc_score(y_true, y_score),
          "Program:", est._program)