Example #1
def genetic_feat(df, num_gen=20, num_comp=10):
    import pandas as pd
    from gplearn.genetic import SymbolicTransformer

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]

    gp = SymbolicTransformer(generations=num_gen,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=num_comp,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    # Evolve new features using every column except "close" as predictors
    # and "close" as the target.
    gen_feats = gp.fit_transform(df.drop("close", axis=1), df["close"])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    return pd.concat((df, gen_feats), axis=1)
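A quick smoke test of genetic_feat on synthetic data. Everything below is illustrative: the column names are arbitrary, and the function only assumes a numeric DataFrame that contains a "close" column.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(300, 4)),
                  columns=["open", "high", "low", "close"])

enriched = genetic_feat(df, num_gen=5, num_comp=4)
print(enriched.columns.tolist())  # original columns plus gen_0 .. gen_3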
Example #2
def Genetic_P(dataset, target, random_seed=0):
    import pandas as pd
    from gplearn.genetic import SymbolicTransformer

    # 'mean_per_hour' is held out of the predictors along with the target.
    append = 'mean_per_hour'
    y = dataset[target]
    X = dataset.drop([target, append], axis=1)
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
        'min', 'sin', 'cos', 'tan'
    ]
    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=15,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=random_seed,
                             n_jobs=3)
    gp_features = gp.fit_transform(X, y)
    print('Shape of the features created by genetic programming: {}'.format(
        gp_features.shape))
    n = pd.DataFrame(gp_features)
    n = n.set_index(dataset.index.values)
    new_X = pd.concat([dataset, n], axis=1)
    new_X = new_X.dropna()
    return new_X
Example #3
def pd_colcat_symbolic(df, col, pars):
    """
       Generate genetic-programming features for columns `col` of `df`.

       https://github.com/arita37/deltapy

       pip install deltapy

    """
    pars_encoder = dict(pars)  # copy so the caller's dict is not mutated
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] +
                                 '/col_genetic_model.pkl')
            col_encoder = load(pars['path_pipeline_export'] +
                               '/col_genetic.pkl')
        except Exception:
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]

    gp = SymbolicTransformer(generations=20,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew = gen_feats

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew, pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew  ### list
    }
    return dfnew, col_pars
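The save, load and save_features helpers come from the surrounding repository and are not defined in this snippet. Minimal pickle-based stand-ins, written here only so the example runs end to end (the real helpers may behave differently), could look like this:

import os
import pickle

def save(obj, path):
    """Pickle an object to disk, creating parent directories as needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load(path):
    """Load a pickled object from disk."""
    with open(path, "rb") as f:
        return pickle.load(f)

def save_features(df, name, path):
    """Persist a feature DataFrame under the feature-store folder."""
    os.makedirs(path, exist_ok=True)
    df.to_pickle(os.path.join(path, name + ".pkl"))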
Example #4
def pd_col_genetic_transform(train_X, train_y, test_X, val_X):
    import pandas as pd
    from gplearn.genetic import SymbolicTransformer

    num_gen = 20
    num_comp = 10
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]

    gp = SymbolicTransformer(generations=num_gen,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=num_comp,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    # Fit on the training split; the same evolved programs are reused to
    # transform the test and validation splits below.
    gen_feats = gp.fit_transform(train_X, train_y)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = train_X.index
    train_X_all = pd.concat((train_X, gen_feats), axis=1)
    gen_feats = gp.transform(test_X)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = test_X.index
    test_X_all = pd.concat((test_X, gen_feats), axis=1)

    gen_feats = gp.transform(val_X)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = val_X.index
    val_X_all = pd.concat((val_X, gen_feats), axis=1)
    return train_X_all, test_X_all, val_X_all
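A sketch of the three-way flow this function expects; the data and split sizes below are synthetic and purely illustrative:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = pd.DataFrame(rng.normal(size=(500, 5)),
                 columns=["f" + str(i) for i in range(5)])
y = X["f0"] * X["f1"] - X["f2"]

train_X, train_y = X.iloc[:300], y.iloc[:300]
test_X, val_X = X.iloc[300:400], X.iloc[400:]

train_all, test_all, val_all = pd_col_genetic_transform(
    train_X, train_y, test_X, val_X)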
Example #5
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors (symbolic features).

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables

    p_y: pd.DataFrame
        with variable to predict

    Returns
    -------
    results: dict
        with the stacked feature matrix ('fit'), the model parameters
        ('params') and the fitted model ('model')

    """
    import numpy as np
    from gplearn.genetic import SymbolicTransformer
    model = SymbolicTransformer(function_set=[
        "sub", "add", 'inv', 'mul', 'div', 'abs', 'log', "max", "min", "sin",
        "cos"
    ],
                                population_size=5000,
                                hall_of_fame=100,
                                n_components=20,
                                generations=20,
                                tournament_size=20,
                                stopping_criteria=.05,
                                const_range=None,
                                init_depth=(4, 12),
                                metric='pearson',
                                parsimony_coefficient=0.001,
                                p_crossover=0.4,
                                p_subtree_mutation=0.2,
                                p_hoist_mutation=0.1,
                                p_point_mutation=0.3,
                                p_point_replace=.05,
                                verbose=1,
                                random_state=None,
                                n_jobs=-1,
                                feature_names=p_x.columns,
                                warm_start=True)

    # Fit only on observations up to 2019-01-01 (requires a DatetimeIndex),
    # then generate features for the full history.
    model.fit_transform(p_x[:'01-01-2019'], p_y[:'01-01-2019'])
    model_params = model.get_params()
    gp_features = model.transform(p_x)
    model_fit = np.hstack((p_x, gp_features))
    results = {'fit': model_fit, 'params': model_params, 'model': model}

    return results
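The date slice p_x[:'01-01-2019'] only works when both inputs carry a DatetimeIndex. A minimal sketch of suitable inputs (shapes, dates and column names are illustrative, and the heavy settings above make the fit slow):

import numpy as np
import pandas as pd

idx = pd.date_range("2018-01-01", periods=500, freq="D")
p_x = pd.DataFrame(np.random.randn(500, 3),
                   columns=["x0", "x1", "x2"], index=idx)
p_y = pd.Series(np.random.randn(500), index=idx)

results = symbolic_features(p_x, p_y)  # fits on dates up to 2019-01-01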
Example #6
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors (symbolic features).

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        with the fitted output ('fit'), parameters ('params'), model,
        generated data ('data'), best programs and run details

    """
    import numpy as np
    import pandas as pd
    from gplearn.genetic import SymbolicTransformer

    # symbolic-variable generation model
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=12000,
        hall_of_fame=300,
        n_components=30,
        generations=4,
        tournament_size=600,
        stopping_criteria=.75,
        const_range=None,
        init_method='half and half',
        init_depth=(4, 20),
        metric='pearson',
        parsimony_coefficient=0.001,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        p_point_replace=0.2,
        verbose=1,
        random_state=None,
        n_jobs=-1,
        feature_names=p_x.columns,
        warm_start=True)

    # SymbolicTransformer fit
    model_fit = model.fit_transform(p_x, p_y)

    # output data of the model
    data = pd.DataFrame(np.round(model_fit, 6))

    # parameters of the model
    model_params = model.get_params()

    # best programs dataframe
    best_programs = {}
    for i, p in enumerate(model._best_programs):
        factor_name = 'sym_' + str(i)
        best_programs[factor_name] = {
            'raw_fitness': p.raw_fitness_,
            'reg_fitness': p.fitness_,
            'expression': str(p),
            'depth': p.depth_,
            'length': p.length_
        }

    # formatting, drop duplicates and sort by reg_fitness
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.drop_duplicates(subset=['expression'])
    best_programs = best_programs.sort_values(by='reg_fitness',
                                              ascending=False)

    # results
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }

    return results
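The 'details' entry of the result wraps gplearn's run_details_, a per-generation log of the evolution. Given p_x and p_y built as in the docstring, it can be inspected like this (the keys used are the ones gplearn records):

import pandas as pd

results = symbolic_features(p_x, p_y)
log = pd.DataFrame(results['details'])
print(log[['generation', 'average_fitness', 'best_fitness']])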
Example #7
def symbolic_features(p_x, p_y, p_params):
    """
    Feature engineering process with symbolic variables by using genetic programming.
    Parameters
    ----------
    p_x: pd.DataFrame / np.array / list
        with regressors or predictor variables
        p_x = data_features.iloc[:, 1:]
    p_y: pd.DataFrame / np.array / list
        with variable to predict
        p_y = data_features.iloc[:, 0]
    p_params: dict
        with parameters for the genetic programming function
        p_params = {'functions': ["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        'population': 5000, 'tournament': 20, 'hof': 20, 'generations': 5, 'n_features': 20,
        'init_depth': (4, 8), 'init_method': 'half and half', 'parsimony': 0.1, 'constants': None,
        'metric': 'pearson', 'metric_goal': 0.65, 'max_samples': 0.9,
        'prob_cross': 0.4, 'prob_mutation_subtree': 0.3,
        'prob_mutation_hoist': 0.1, 'prob_mutation_point': 0.2,
        'verbose': True, 'random_cv': None, 'parallelization': True, 'warm_start': True}
    Returns
    -------
    results: dict
        With response information
        {'fit': model fitted, 'params': model parameters, 'model': model,
         'data': generated data with variables, 'best_programs': models best programs}
    References
    ----------
    https://gplearn.readthedocs.io/en/stable/reference.html#gplearn.genetic.SymbolicTransformer


    **** NOTE ****
    Simplified internal calculation of the correlation metric (assuming w=1):

    y_pred_demean = y_pred - np.average(y_pred)
    y_demean = y - np.average(y)
    pearson = np.sum(y_pred_demean * y_demean) / np.sqrt(
        np.sum(y_pred_demean ** 2) * np.sum(y_demean ** 2))
    """

    import pandas as pd
    from gplearn.genetic import SymbolicTransformer

    # Function to produce Symbolic Features
    model = SymbolicTransformer(
        function_set=p_params['functions'],
        population_size=p_params['population'],
        tournament_size=p_params['tournament'],
        hall_of_fame=p_params['hof'],
        generations=p_params['generations'],
        n_components=p_params['n_features'],
        init_depth=p_params['init_depth'],
        init_method=p_params['init_method'],
        parsimony_coefficient=p_params['parsimony'],
        const_range=p_params['constants'],
        metric=p_params['metric'],
        stopping_criteria=p_params['metric_goal'],
        p_crossover=p_params['prob_cross'],
        p_subtree_mutation=p_params['prob_mutation_subtree'],
        p_hoist_mutation=p_params['prob_mutation_hoist'],
        p_point_mutation=p_params['prob_mutation_point'],
        max_samples=p_params['max_samples'],
        verbose=p_params['verbose'],
        warm_start=p_params['warm_start'],
        random_state=123,
        n_jobs=-1 if p_params['parallelization'] else 1,
        feature_names=p_x.columns)

    # SymbolicTransformer fit
    model_fit = model.fit_transform(p_x, p_y)

    # output data of the model
    data = pd.DataFrame(model_fit)

    # parameters of the model
    model_params = model.get_params()

    # best programs dataframe
    best_programs = {}
    for i, p in enumerate(model._best_programs):
        factor_name = 'sym' + str(i)
        best_programs[factor_name] = {
            'raw_fitness': p.raw_fitness_,
            'reg_fitness': p.fitness_,
            'expression': str(p),
            'depth': p.depth_,
            'length': p.length_
        }

    # format and sorting
    best_programs = pd.DataFrame(best_programs).T
    best_programs = best_programs.sort_values(by='raw_fitness',
                                              ascending=False)

    # results
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        'data': data,
        'best_programs': best_programs,
        'details': model.run_details_
    }

    return results
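The simplified pearson formula quoted in the NOTE above can be checked directly against np.corrcoef; this standalone snippet (with made-up data) verifies the two agree:

import numpy as np

rng = np.random.default_rng(42)
y = rng.normal(size=100)
y_pred = y + rng.normal(scale=0.5, size=100)

y_pred_demean = y_pred - np.average(y_pred)
y_demean = y - np.average(y)
pearson = np.sum(y_pred_demean * y_demean) / np.sqrt(
    np.sum(y_pred_demean ** 2) * np.sum(y_demean ** 2))

assert np.isclose(pearson, np.corrcoef(y, y_pred)[0, 1])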
Example #8
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors (symbolic features).

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        with the stacked features ('fit'), parameters ('params'),
        model and generated features ('features')
    best_p_dict: pd.DataFrame
        table of the best programs sorted by fitness

    """
    import numpy as np
    import pandas as pd
    from gplearn.genetic import SymbolicTransformer
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=5000,
        hall_of_fame=20,
        n_components=10,
        tournament_size=20,
        generations=5,
        init_depth=(4, 8),
        init_method='half and half',
        parsimony_coefficient=0.1,
        const_range=None,
        metric='pearson',
        stopping_criteria=0.65,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        verbose=True,
        warm_start=True,
        n_jobs=-1,
        feature_names=p_x.columns)
    model.fit_transform(p_x, p_y)
    model_params = model.get_params()
    gp_features = model.transform(p_x)

    model_fit = np.hstack((p_x, gp_features))
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        "features": gp_features
    }
    best_p = model._best_programs
    best_p_dict = {}

    for i, p in enumerate(best_p):
        factor_name = 'alpha_' + str(i + 1)
        best_p_dict[factor_name] = {
            'fitness': p.fitness_,
            "expression": str(p),
            'depth': p.depth_,
            "length": p.length_
        }

    best_p_dict = pd.DataFrame(best_p_dict).T
    best_p_dict = best_p_dict.sort_values(by="fitness")

    return results, best_p_dict
Example #9
from gplearn.genetic import SymbolicTransformer

# The opening of this call is missing from the snippet. fit_transform and
# _best_programs below imply a SymbolicTransformer, and 'mape' is assumed to
# be a custom fitness object (see the make_fitness sketch after this example).
sr = SymbolicTransformer(stopping_criteria=1.0,
                         const_range=(-1., 1.),
                         init_depth=(2, 6),
                         init_method='half and half',
                         function_set=('add', 'sub', 'mul', 'div'),
                         metric=mape,
                         parsimony_coefficient=0.001,
                         p_crossover=0.9,
                         p_subtree_mutation=0.01,
                         p_hoist_mutation=0.01,
                         p_point_mutation=0.01,
                         p_point_replace=0.05,
                         max_samples=1.0,
                         feature_names=None,
                         warm_start=False,
                         low_memory=False,
                         n_jobs=1,
                         verbose=0,
                         random_state=None)

tran = sr.fit_transform(X, y)
print(sr._best_programs[0])
print(sr._best_programs[0].fitness_)
# print(sr._program)
# pre = sr.predict(X)
#
# #
# bp = BasePlot()
# bp.scatter(y, pre, strx='y_true', stry='y_predict')
# plt.show()
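The call above passes metric=mape, but mape is never defined in the snippet. gplearn supports custom fitness measures through gplearn.fitness.make_fitness; a plausible sketch of such a metric (the _mape helper and its exact formula are assumptions, not part of the original):

import numpy as np
from gplearn.fitness import make_fitness

def _mape(y, y_pred, w):
    """Mean absolute percentage error, weighted by w."""
    return np.average(np.abs((y - y_pred) / y), weights=w) * 100

# Lower MAPE is better, hence greater_is_better=False.
mape = make_fitness(function=_mape, greater_is_better=False)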