def genetic_feat(df, num_gen=20, num_comp=10, target="close"):
    """Append genetic-programming features to ``df``.

    A gplearn ``SymbolicTransformer`` is fitted with every column except
    ``target`` as input and the ``target`` column as the supervision signal.
    The evolved components are returned alongside the original columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain the ``target`` column.
    num_gen : int
        Number of generations passed to the transformer.
    num_comp : int
        Number of components (new feature columns) requested.
    target : str
        Column used as the fitting target. Defaults to ``"close"``, which
        was previously hard-coded.

    Returns
    -------
    pd.DataFrame
        ``df`` with the generated columns ``gen_0``, ``gen_1``, ... appended,
        sharing ``df``'s index.
    """
    from gplearn.genetic import SymbolicTransformer

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]
    gp = SymbolicTransformer(generations=num_gen,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=num_comp,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    gen_feats = gp.fit_transform(df.drop(target, axis=1), df[target])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    # Re-use the caller's index so the column-wise concat aligns row by row.
    gen_feats.index = df.index
    return pd.concat((df, gen_feats), axis=1)
def Genetic_P(dataset, target):
    """Extend ``dataset`` with genetic-programming features.

    The transformer is fitted on all columns except ``target`` and the
    ``'mean_per_hour'`` column, using ``dataset[target]`` as the target.
    The generated features are concatenated to the *full* original frame
    (including the two excluded columns) and rows containing any NaN are
    dropped.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input frame; must contain ``target`` and ``'mean_per_hour'``.
    target : str
        Name of the column to fit against.

    Returns
    -------
    pd.DataFrame
        ``dataset`` plus the generated feature columns (labelled with the
        default integer column names), with NaN rows removed.

    Notes
    -----
    ``random_seed`` and ``SymbolicTransformer`` are read from the enclosing
    module scope.
    """
    append = 'mean_per_hour'
    y = dataset[target]
    # ``drop`` already returns a new frame, so no explicit copy is required.
    X = dataset.drop(target, axis=1).drop(append, axis=1)

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv',
        'max', 'min', 'sin', 'cos', 'tan'
    ]
    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=15,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=random_seed,
                             n_jobs=3)
    gp_features = gp.fit_transform(X, y)
    print(f'Number of features created out of genetic programing: {gp_features.shape}')

    # Give the generated block the same row labels as the input so that the
    # column-wise concat lines rows up instead of producing NaN padding.
    n = pd.DataFrame(gp_features).set_index(dataset.index.values)
    new_X = pd.concat([dataset, n], axis=1)
    return new_X.dropna()
def pd_colcat_symbolic(df, col, pars):
    """Generate symbolic (genetic-programming) features from ``df[col]``.

    https://github.com/arita37/deltapy
    pip install deltapy

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the input columns and the target column.
    col : list
        Names of the input columns fed to the transformer.
    pars : dict
        Must contain ``'coly'`` (target column name). When it contains both
        ``'path_features_store'`` and ``'path_pipeline_export'``, the
        generated features, the fitted model, the parameters and the new
        column list are persisted under those paths.

    Returns
    -------
    tuple
        ``(dfnew, col_pars)`` where ``dfnew`` holds the ``gen_<i>`` columns
        indexed like ``df`` and ``col_pars`` is
        ``{'model': <fitted transformer>, 'cols_new': {'col_genetic': [...]}}``.
    """
    from gplearn.genetic import SymbolicTransformer

    # Shallow-copy so that adding 'cols' does not mutate the caller's dict.
    pars_encoder = dict(pars)
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/col_genetic_pars.pkl')
            # NOTE(review): the exported model and column list are read but
            # never reused -- the transformer below is always refit. The
            # calls are retained so any side effects of `load` are unchanged.
            load(pars['path_pipeline_export'] + '/col_genetic_model.pkl')
            load(pars['path_pipeline_export'] + '/col_genetic.pkl')
        except Exception:
            # Best effort: a missing or unreadable export is expected on a
            # first run. Unlike a bare ``except``, this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

    ###################################################################################
    coly = pars['coly']
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]
    gp = SymbolicTransformer(generations=20,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    dfnew = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    dfnew.index = df.index

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew, pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {
        'model': gp,
        'cols_new': {
            'col_genetic': colnew  ### list
        },
    }
    return dfnew, col_pars
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Fit a SymbolicTransformer on the training split and extend three splits.

    The transformer is fitted on ``train_X`` / ``train_y``; the resulting
    programs are then applied to ``test_X`` and ``val_X``. Each split gets
    the generated ``gen_<i>`` columns concatenated on the right.

    NOTE(review): ``df``, ``col`` and ``pars`` are accepted but never read.
    The body resolves ``train_X``, ``train_y``, ``test_X`` and ``val_X`` from
    an outer scope that is not visible here -- confirm against callers.

    Returns
    -------
    tuple
        ``(train_X_all, test_X_all, val_X_all)``.
    """
    n_generations = 20
    n_components = 10
    operators = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                 'inv', 'tan']

    transformer = SymbolicTransformer(generations=n_generations,
                                      population_size=200,
                                      hall_of_fame=100,
                                      n_components=n_components,
                                      function_set=operators,
                                      parsimony_coefficient=0.0005,
                                      max_samples=0.9,
                                      verbose=1,
                                      random_state=0,
                                      n_jobs=6)

    def _with_generated(frame, values):
        # Wrap the raw component matrix in a frame aligned to `frame`'s index
        # and attach it column-wise.
        names = ["gen_" + str(i) for i in range(values.shape[1])]
        generated = pd.DataFrame(values, columns=names)
        generated.index = frame.index
        return pd.concat((frame, generated), axis=1)

    # Fit only on the training split; the other splits are transform-only.
    train_X_all = _with_generated(train_X,
                                  transformer.fit_transform(train_X, train_y))
    test_X_all = _with_generated(test_X, transformer.transform(test_X))
    val_X_all = _with_generated(val_X, transformer.transform(val_X))

    return train_X_all, test_X_all, val_X_all
def symbolic_features(p_x, p_y, p_train_end='01-01-2019'):
    """Create non-linear regressors with a genetic-programming transformer.

    The transformer is fitted only on rows up to ``p_train_end`` and then
    applied to the whole of ``p_x``.

    Parameters
    ----------
    p_x : pd.DataFrame
        Regressors / predictor variables. Its index must support label
        slicing with ``p_train_end``.
    p_y : pd.DataFrame or pd.Series
        Variable to predict, indexed like ``p_x``.
    p_train_end : label
        Upper bound of the label slice used for fitting. Defaults to
        ``'01-01-2019'``, the value that was previously hard-coded.

    Returns
    -------
    dict
        ``'fit'``: ``p_x`` horizontally stacked with the generated features
        (NumPy array); ``'params'``: the transformer's parameters;
        ``'model'``: the fitted transformer.
    """
    model = SymbolicTransformer(function_set=[
        "sub", "add", 'inv', 'mul', 'div', 'abs', 'log', "max", "min", "sin",
        "cos"
    ],
                                population_size=5000,
                                hall_of_fame=100,
                                n_components=20,
                                generations=20,
                                tournament_size=20,
                                stopping_criteria=.05,
                                const_range=None,
                                init_depth=(4, 12),
                                metric='pearson',
                                parsimony_coefficient=0.001,
                                p_crossover=0.4,
                                p_subtree_mutation=0.2,
                                p_hoist_mutation=0.1,
                                p_point_mutation=0.3,
                                p_point_replace=.05,
                                verbose=1,
                                random_state=None,
                                n_jobs=-1,
                                feature_names=p_x.columns,
                                warm_start=True)

    # Only the fitted state is needed here: the training-window transform was
    # previously computed and thrown away, so fit without transforming.
    model.fit(p_x[:p_train_end], p_y[:p_train_end])
    model_params = model.get_params()

    gp_features = model.transform(p_x)
    model_fit = np.hstack((p_x, gp_features))

    return {'fit': model_fit, 'params': model_params, 'model': model}
def symbolic_features(p_x, p_y):
    """Create non-linear regressors with a genetic-programming transformer.

    Parameters
    ----------
    p_x : pd.DataFrame
        Regressors / predictor variables,
        e.g. ``data_features.iloc[0:30, 3:]``.
    p_y : pd.DataFrame or pd.Series
        Variable to predict, e.g. ``data_features.iloc[0:30, 1]``.

    Returns
    -------
    dict
        ``'fit'``: generated features (array returned by ``fit_transform``);
        ``'params'``: the transformer's parameters;
        ``'model'``: the fitted transformer;
        ``'data'``: the generated features rounded to 6 decimals, as a
        DataFrame;
        ``'best_programs'``: one row per distinct program expression, sorted
        by ``reg_fitness`` in descending order;
        ``'details'``: the transformer's ``run_details_``.
    """
    # Transformer that generates the symbolic variables.
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=12000,
        hall_of_fame=300,
        n_components=30,
        generations=4,
        tournament_size=600,
        stopping_criteria=.75,
        const_range=None,
        init_method='half and half',
        init_depth=(4, 20),
        metric='pearson',
        parsimony_coefficient=0.001,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        p_point_replace=0.2,
        verbose=1,
        random_state=None,
        n_jobs=-1,
        feature_names=p_x.columns,
        warm_start=True)

    # Fit and obtain the generated features in one pass.
    model_fit = model.fit_transform(p_x, p_y)

    # Output data of the model.
    data = pd.DataFrame(np.round(model_fit, 6))

    # Parameters of the model.
    model_params = model.get_params()

    # Best programs, keyed by position. ``enumerate`` avoids the quadratic
    # ``list.index`` lookup and keeps keys unique even if an entry repeats.
    best_programs = {
        'sym_' + str(position): {
            'raw_fitness': program.raw_fitness_,
            'reg_fitness': program.fitness_,
            'expression': str(program),
            'depth': program.depth_,
            'length': program.length_
        }
        for position, program in enumerate(model._best_programs)
    }

    # Formatting, drop duplicated expressions and sort by reg_fitness.
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.drop_duplicates(subset=['expression'])
    best_programs = best_programs.sort_values(by='reg_fitness',
                                              ascending=False)

    return {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }
def symbolic_features(p_x, p_y, p_params):
    """
    Feature engineering process with symbolic variables by using genetic
    programming.

    Parameters
    ----------
    p_x: pd.DataFrame / np.array / list
        with regressors or predictor variables
        p_x = data_features.iloc[:, 1:]
        When ``p_x`` has a ``columns`` attribute it is used for the
        transformer's ``feature_names``; otherwise ``None`` is passed.

    p_y: pd.DataFrame / np.array / list
        with variable to predict
        p_y = data_features.iloc[:, 0]

    p_params: dict
        with parameters for the genetic programming function
        p_params = {'functions': ["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
                    'population': 5000, 'tournament': 20, 'hof': 20,
                    'generations': 5, 'n_features': 20,
                    'init_depth': (4, 8), 'init_method': 'half and half',
                    'parsimony': 0.1, 'constants': None,
                    'metric': 'pearson', 'metric_goal': 0.65,
                    'prob_cross': 0.4, 'prob_mutation_subtree': 0.3,
                    'prob_mutation_hoist': 0.1, 'prob_mutation_point': 0.2,
                    'max_samples': 1.0,
                    'verbose': True, 'random_cv': None,
                    'parallelization': True, 'warm_start': True}
        ``'max_samples'`` is optional and falls back to 1.0 when absent.
        ``'random_cv'`` is not read by this function; ``random_state`` is
        fixed to 123.

    Returns
    -------
    results: dict
        With response information
        {'fit': generated features (output of fit_transform),
         'params': model parameters,
         'model': fitted model,
         'data': generated features as a DataFrame,
         'best_programs': best programs sorted by raw_fitness (descending),
         'details': the model's run_details_}

    References
    ----------
    https://gplearn.readthedocs.io/en/stable/reference.html#gplearn.genetic.SymbolicTransformer

    **** NOTE ****

    simplified internal calculation for correlation (assuming w=1)

    y_pred_demean = y_pred - np.average(y_pred)
    y_demean = y - np.average(y)

                          np.sum(y_pred_demean * y_demean)
    pearson = ---------------------------------------------------------------
               np.sqrt((np.sum(y_pred_demean ** 2) * np.sum(y_demean ** 2)))

    """
    # Function to produce Symbolic Features
    model = SymbolicTransformer(
        function_set=p_params['functions'],
        population_size=p_params['population'],
        tournament_size=p_params['tournament'],
        hall_of_fame=p_params['hof'],
        generations=p_params['generations'],
        n_components=p_params['n_features'],
        init_depth=p_params['init_depth'],
        init_method=p_params['init_method'],
        parsimony_coefficient=p_params['parsimony'],
        const_range=p_params['constants'],
        metric=p_params['metric'],
        stopping_criteria=p_params['metric_goal'],
        p_crossover=p_params['prob_cross'],
        p_subtree_mutation=p_params['prob_mutation_subtree'],
        p_hoist_mutation=p_params['prob_mutation_hoist'],
        p_point_mutation=p_params['prob_mutation_point'],
        # The documented parameter dict has no 'max_samples' entry, so a
        # strict lookup raised KeyError; 1.0 is the library default.
        max_samples=p_params.get('max_samples', 1.0),
        verbose=p_params['verbose'],
        warm_start=p_params['warm_start'],
        random_state=123,
        n_jobs=-1 if p_params['parallelization'] else 1,
        # Arrays and lists carry no column labels.
        feature_names=getattr(p_x, 'columns', None))

    # SymbolicTransformer fit
    model_fit = model.fit_transform(p_x, p_y)

    # output data of the model
    data = pd.DataFrame(model_fit)

    # parameters of the model
    model_params = model.get_params()

    # Best programs keyed by position. ``enumerate`` avoids the quadratic
    # ``list.index`` lookup and keeps keys unique even if an entry repeats.
    best_programs = {
        'sym' + str(position): {
            'raw_fitness': program.raw_fitness_,
            'reg_fitness': program.fitness_,
            'expression': str(program),
            'depth': program.depth_,
            'length': program.length_
        }
        for position, program in enumerate(model._best_programs)
    }

    # format and sorting
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.sort_values(by='raw_fitness',
                                              ascending=False)

    # results
    return {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }
def symbolic_features(p_x, p_y):
    """Create non-linear regressors with a genetic-programming transformer.

    Parameters
    ----------
    p_x : pd.DataFrame
        Regressors / predictor variables,
        e.g. ``data_features.iloc[0:30, 3:]``.
    p_y : pd.DataFrame or pd.Series
        Variable to predict, e.g. ``data_features.iloc[0:30, 1]``.

    Returns
    -------
    results : dict
        ``'fit'``: ``p_x`` horizontally stacked with the generated features;
        ``'params'``: the transformer's parameters;
        ``'model'``: the fitted transformer;
        ``'features'``: the generated features alone.
    best_p_dict : pd.DataFrame
        One row per best program (``alpha_1``, ``alpha_2``, ...) with its
        fitness, expression, depth and length, sorted by ascending fitness.
    """
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=5000,
        hall_of_fame=20,
        n_components=10,
        tournament_size=20,
        generations=5,
        init_depth=(4, 8),
        init_method='half and half',
        parsimony_coefficient=0.1,
        const_range=None,
        metric='pearson',
        stopping_criteria=0.65,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        verbose=True,
        warm_start=True,
        n_jobs=-1,
        feature_names=p_x.columns)

    # fit_transform already yields the transform of p_x, so keep its result
    # instead of discarding it and transforming the same data a second time.
    gp_features = model.fit_transform(p_x, p_y)
    model_params = model.get_params()
    model_fit = np.hstack((p_x, gp_features))

    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        "features": gp_features
    }

    # Labels are 1-based. ``enumerate`` avoids the quadratic ``list.index``
    # lookup and keeps keys unique even if an entry repeats.
    best_p_dict = {
        'alpha_' + str(position): {
            'fitness': program.fitness_,
            "expression": str(program),
            'depth': program.depth_,
            "length": program.length_
        }
        for position, program in enumerate(model._best_programs, start=1)
    }

    best_p_dict = pd.DataFrame(best_p_dict).T
    best_p_dict = best_p_dict.sort_values(by="fitness")

    return results, best_p_dict
stopping_criteria=1.0, const_range=(-1., 1.), init_depth=(2, 6), init_method='half and half', function_set=('add', 'sub', 'mul', 'div'), metric=mape, parsimony_coefficient=0.001, p_crossover=0.9, p_subtree_mutation=0.01, p_hoist_mutation=0.01, p_point_mutation=0.01, p_point_replace=0.05, max_samples=1.0, feature_names=None, warm_start=False, low_memory=False, n_jobs=1, verbose=0, random_state=None) tran = sr.fit_transform(X, y) print(sr._best_programs[0]) print(sr._best_programs[0].fitness_) # print(sr._program) # pre = sr.predict(X) # # # # bp = BasePlot() # bp.scatter(y, pre, strx='y_true', stry='y_predict') # plt.show()