def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Append genetic-programming features to the train, test and validation frames.

    A ``SymbolicTransformer`` is fitted on ``train_X`` / ``train_y`` and then
    applied to ``test_X`` and ``val_X``.  Those four names are not parameters
    of this function: they are looked up in the enclosing scope at call time.

    Args:
        df: Not used by this implementation.
        col: Not used by this implementation.
        pars: Not used by this implementation.

    Returns:
        tuple: ``(train_X_all, test_X_all, val_X_all)`` -- each input frame
        with its generated ``gen_<i>`` columns concatenated on the right.
    """
    n_generations = 20
    n_outputs = 10
    operators = ['add', 'sub', 'mul', 'div',
                 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan']

    transformer = SymbolicTransformer(
        generations=n_generations,
        population_size=200,
        hall_of_fame=100,
        n_components=n_outputs,
        function_set=operators,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=0,
        n_jobs=6,
    )

    def _append_generated(frame, values):
        # Label the generated columns gen_0 .. gen_{k-1} and reuse the source
        # frame's index so the column-wise concat lines the rows up.
        labels = ["gen_" + str(pos) for pos in range(values.shape[1])]
        generated = pd.DataFrame(values, columns=labels)
        generated.index = frame.index
        return pd.concat((frame, generated), axis=1)

    # Only the training split is used for fitting; the others are transformed.
    train_X_all = _append_generated(
        train_X, transformer.fit_transform(train_X, train_y))
    test_X_all = _append_generated(test_X, transformer.transform(test_X))
    val_X_all = _append_generated(val_X, transformer.transform(val_X))
    return train_X_all, test_X_all, val_X_all
    def data_prepare(self):
        """Load the two-class digits data, standardise it and append GP features.

        The data is split 80/20 (``random_state=9``), scaled with statistics
        learned on the training part, and a ``SymbolicTransformer`` fitted on
        the training part supplies extra columns that are stacked to the right
        of both the training and the test matrices.
        """
        self.__digists = load_digits(n_class=2)
        self.__X, self.__y = self.__digists.data, self.__digists.target

        (self.__train, self.__test,
         self.__train_label, self.__test_label) = train_test_split(
             self.__X, self.__y, test_size=0.2, random_state=9)

        # Standardise using the training split's statistics only.
        standardizer = StandardScaler()
        standardizer.fit(self.__train)
        self.__train = standardizer.transform(self.__train)
        self.__test = standardizer.transform(self.__test)

        # Operators the genetic programs are allowed to combine.
        operators = ("add", "sub", "mul", "div", "sqrt", "log", "abs",
                     "neg", "inv", "max", "min")

        gp_settings = dict(
            generations=5,
            population_size=2000,
            hall_of_fame=100,
            n_components=10,
            function_set=operators,
            parsimony_coefficient=0.0005,
            max_samples=0.9,
            verbose=1,
            random_state=0,
            n_jobs=3,
        )
        symbolic = SymbolicTransformer(**gp_settings)

        # Obtaining the generic features in a stacking fashion seems more
        # reasonable.
        symbolic.fit(self.__train, self.__train_label)
        train_generated = symbolic.transform(self.__train)
        self.__train_gfeature = np.hstack((self.__train, train_generated))
        test_generated = symbolic.transform(self.__test)
        self.__test_gfeature = np.hstack((self.__test, test_generated))
Exemple #3
0
class GplearnDemo(object):
    """Compare Ridge regression on the Boston data with and without GP features."""

    def __init__(self):
        # Data-preparation state, filled in by data_prepare().
        self.__boston = None
        self.__boston_feature = None
        self.__boston_label = None
        self.__train_feature = None
        self.__test_feature = None
        self.__train_label = None
        self.__test_label = None
        self.__transformer = None
        self.__gp_train_feature = None
        self.__gp_test_feature = None

        # Model state, filled in by model_fit_predict().
        self.__regressor = None

    def data_prepare(self):
        """Load the data, split it in half and compute the GP features."""
        self.__boston = load_boston()
        self.__boston_feature = pd.DataFrame(
            self.__boston.data, columns=self.__boston.feature_names)
        target_frame = pd.Series(self.__boston.target).to_frame("TARGET")
        self.__boston_label = target_frame.squeeze()

        halves = train_test_split(self.__boston_feature,
                                  self.__boston_label,
                                  test_size=0.5,
                                  shuffle=True)
        (self.__train_feature, self.__test_feature,
         self.__train_label, self.__test_label) = halves

        # Missing values are not allowed.
        self.__transformer = SymbolicTransformer(n_jobs=4)
        self.__transformer.fit(self.__train_feature, self.__train_label)
        self.__gp_train_feature = self.__transformer.transform(
            self.__train_feature)
        self.__gp_test_feature = self.__transformer.transform(
            self.__test_feature)

    def model_fit_predict(self):
        """Print the test MSE of Ridge on the raw, then on the augmented features."""
        # Baseline: original columns only.
        self.__regressor = Ridge()
        self.__regressor.fit(self.__train_feature, self.__train_label)
        baseline_prediction = self.__regressor.predict(self.__test_feature)
        print(mean_squared_error(self.__test_label, baseline_prediction))

        # Augmented: original columns followed by the GP columns.
        self.__regressor = Ridge()
        train_augmented = np.hstack(
            (self.__train_feature.values, self.__gp_train_feature))
        self.__regressor.fit(train_augmented, self.__train_label)
        test_augmented = np.hstack(
            (self.__test_feature.values, self.__gp_test_feature))
        augmented_prediction = self.__regressor.predict(test_augmented)
        print(mean_squared_error(self.__test_label, augmented_prediction))
Exemple #4
0
def symbolic_transformer(X, y, encoder=None):
    """Generate symbolic (genetic-programming) features for ``X``.

    When no encoder is supplied, a new ``SymbolicTransformer`` is created and
    fitted on ``X`` and ``y``.  A supplied encoder is used as-is, without
    being refitted.

    Args:
        X (DataFrame): Independent features
        y (Series): Dependent feature or target. Only used when a new encoder
            has to be fitted.
        encoder (obj, optional): Object of the type 'SymbolicTransformer'. Defaults to None.

    Returns:
        tuple: ``(gp_features, encoder)`` -- the result of
        ``encoder.transform(X)`` and the encoder that produced it.
    """
    if encoder is None:
        settings = {
            'generations': 10,
            'population_size': 1000,
            'hall_of_fame': 100,
            'n_components': 12,
            'function_set': ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                             'abs', 'neg', 'inv', 'max', 'min'],
            'parsimony_coefficient': 0.0005,
            'max_samples': 0.9,
            'verbose': 1,
            'random_state': 123,
            'n_jobs': -1,
        }
        encoder = SymbolicTransformer(**settings)
        encoder.fit(X, y)

    return encoder.transform(X), encoder
Exemple #5
0
def test_symbolic_transformer():
    """Check that the SymbolicTransformer example reproduces its reference scores."""

    rng = check_random_state(0)
    boston = load_boston()
    order = rng.permutation(boston.target.size)
    boston.data = boston.data[order]
    boston.target = boston.target[order]

    # The first 300 shuffled rows are for fitting, the remainder for scoring.
    n_fit = 300
    fit_target = boston.target[:n_fit]
    score_target = boston.target[n_fit:]

    baseline = Ridge()
    baseline.fit(boston.data[:n_fit, :], fit_target)
    baseline_score = baseline.score(boston.data[n_fit:, :], score_target)
    assert_almost_equal(baseline_score, 0.759319453049884)

    operators = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                 'abs', 'neg', 'inv', 'max', 'min']
    transformer = SymbolicTransformer(generations=20,
                                      population_size=2000,
                                      hall_of_fame=100,
                                      n_components=10,
                                      function_set=operators,
                                      parsimony_coefficient=0.0005,
                                      max_samples=0.9,
                                      random_state=0)
    transformer.fit(boston.data[:n_fit, :], fit_target)

    # Stack the generated columns to the right of the original ones.
    augmented = np.hstack((boston.data, transformer.transform(boston.data)))

    boosted = Ridge()
    boosted.fit(augmented[:n_fit, :], fit_target)
    boosted_score = boosted.score(augmented[n_fit:, :], score_target)
    assert_almost_equal(boosted_score, 0.8418372105182055)
Exemple #6
0
def test_output_shape():
    """Check that transform() yields one column per requested component."""

    rng = check_random_state(415)
    n_rows, n_cols, n_out = 5, 10, 5
    features = rng.uniform(size=n_rows * n_cols).reshape(n_rows, n_cols)
    target = rng.uniform(size=n_rows)

    # Check the transformer
    transformer = SymbolicTransformer(n_components=n_out,
                                      generations=2,
                                      random_state=0)
    transformer.fit(features, target)
    transformed = transformer.transform(features)
    assert_true(transformed.shape == (n_rows, n_out))
Exemple #7
0
def test_output_shape():
    """Verify the transformed array is (n_samples, n_components)."""

    seeded = check_random_state(415)
    samples = np.reshape(seeded.uniform(size=50), (5, 10))
    labels = seeded.uniform(size=5)

    # Check the transformer
    gp = SymbolicTransformer(n_components=5, generations=2, random_state=0)
    gp.fit(samples, labels)
    output_shape = gp.transform(samples).shape
    assert_true(output_shape == (5, 5))
Exemple #8
0
def getSymbolTrans(train, valid, y, random_state=888):
    """Append symbolic-transformer features to the training and validation frames.

    A ``SymbolicTransformer`` is fitted on copies of ``train`` and ``y`` and
    then applied to both frames.  The generated columns are named ``ST_1``,
    ``ST_2``, ... and joined to the right of each frame; remaining NaNs are
    replaced with 0.

    Args:
        train (DataFrame): Training features; not modified (a copy is used).
        valid (DataFrame): Validation features; not modified (a copy is used).
        y: Training target; copied before fitting.
        random_state: Accepted for interface compatibility only.  It is not
            forwarded: the transformer is always seeded with 0.

    Returns:
        tuple: ``(X_train, X_valid)`` with the generated columns appended.
    """
    X_train = train.copy()
    X_valid = valid.copy()
    y_train = y.copy()
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
        'min'
    ]

    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=0,
                             n_jobs=3)

    gp.fit(X_train, y_train)

    def _append_gp_features(frame):
        # join() aligns on the index, so the generated frame must carry the
        # source frame's index.  With a fresh 0..n-1 index, any frame that is
        # not default-indexed would get NaNs here and fillna(0) would then
        # silently zero out the generated features.
        generated = pd.DataFrame(gp.transform(frame), index=frame.index)
        generated.columns = [
            "ST_" + str(i) for i in range(1, generated.shape[1] + 1)
        ]
        return frame.join(generated).fillna(0)

    X_train = _append_gp_features(X_train)
    X_valid = _append_gp_features(X_valid)

    return (X_train, X_valid)
Exemple #9
0
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering.

        Fits a gplearn ``SymbolicTransformer`` on ``df[col]`` (minus the target
        column ``pars['coly']``) against ``df[pars['coly']]`` and returns the
        generated ``gen_<i>`` columns.

        pars keys read here:
            'coly'                 : name of the target column (required).
            'pars_genetic'         : optional extra keyword arguments for the
                                     transformer; defaults to 20 generations,
                                     10 components, population size 200.
            'path_features_store',
            'path_pipeline_export' : when both are present, the features, the
                                     fitted model, the column list and the
                                     parameters are saved.

        Returns (df_genetic, col_pars), where col_pars holds the fitted model,
        the genetic parameters and the new column names.
    """
    from gplearn.genetic import SymbolicTransformer

    prefix = 'col_genetic'

    target_name = pars['coly']
    feature_names = [name for name in col if name not in (target_name,)]
    train_X = df[feature_names]
    train_y = df[target_name]

    default_genetic = {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    }
    pars_genetic = pars.get('pars_genetic', default_genetic)

    # Fixed settings; the caller-tunable ones come from pars_genetic.
    fixed_settings = dict(
        hall_of_fame=100,
        function_set=[
            'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv',
            'tan'
        ],
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=0,
        n_jobs=6,
    )
    gp = SymbolicTransformer(**fixed_settings, **pars_genetic)

    gp.fit(train_X, train_y)
    generated = gp.transform(train_X)
    col_genetic = ["gen_" + str(pos) for pos in range(generated.shape[1])]
    df_genetic = pd.DataFrame(generated, columns=col_genetic)
    df_genetic.index = train_X.index

    # Optional export of the features and of everything needed to reuse them.
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        export_dir = pars['path_pipeline_export']
        save(gp, export_dir + f"/{prefix}_model.pkl")
        save(col_genetic, export_dir + f"/{prefix}.pkl")
        save(pars_genetic, export_dir + f"/{prefix}_pars.pkl")

    col_pars = {
        'model': gp,
        'pars': pars_genetic,
        'cols_new': {
            'col_genetic': col_genetic  ### list
        },
    }
    return df_genetic, col_pars
Exemple #10
0
def symbolic_features(p_x, p_y):
    """
    Function to create non-linear regressors.

    The transformer is fitted on the rows of ``p_x`` / ``p_y`` up to the
    label '01-01-2019' and then applied to all of ``p_x``.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables

    p_y: pd.DataFrame with variable to predict

    Returns
    -------
    results: dict
        'fit'    : ``p_x`` with the generated features stacked to its right,
        'params' : the transformer's ``get_params()`` output,
        'model'  : the fitted transformer.

    """
    operators = [
        "sub", "add", 'inv', 'mul', 'div', 'abs', 'log', "max", "min", "sin",
        "cos"
    ]
    model = SymbolicTransformer(
        function_set=operators,
        population_size=5000,
        hall_of_fame=100,
        n_components=20,
        generations=20,
        tournament_size=20,
        stopping_criteria=.05,
        const_range=None,
        init_depth=(4, 12),
        metric='pearson',
        parsimony_coefficient=0.001,
        p_crossover=0.4,
        p_subtree_mutation=0.2,
        p_hoist_mutation=0.1,
        p_point_mutation=0.3,
        p_point_replace=.05,
        verbose=1,
        random_state=None,
        n_jobs=-1,
        feature_names=p_x.columns,
        warm_start=True,
    )

    # Fit on the slice ending at the cut-off label; the features it returns
    # for that slice are not kept.
    cutoff = '01-01-2019'
    model.fit_transform(p_x[:cutoff], p_y[:cutoff])

    model_params = model.get_params()
    gp_features = model.transform(p_x)
    model_fit = np.hstack((p_x, gp_features))

    return {'fit': model_fit, 'params': model_params, 'model': model}
Exemple #11
0
def symbolicLearning(df_list, target=None):
    '''
    Generate symbolic-transformer features for ``df_list``.

    :param df_list: tabular data; anything ``pd.DataFrame`` accepts.
    :param target: optional target values.  When given, the transformer is
        fitted on ``(df_list, target)`` before transforming.  When omitted the
        transformer is used without fitting, exactly as before this parameter
        existed, and gplearn rejects ``transform`` on an unfitted estimator.
    :return: DataFrame with one column per generated feature, named
        '1V', '2V', ...
    '''
    df_list = pd.DataFrame(df_list)
    function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']

    gp = SymbolicTransformer(generations=10, population_size=1000,
                              hall_of_fame=100, n_components=10,
                              function_set=function_set,
                              parsimony_coefficient=0.0005,
                              max_samples=0.9, verbose=1,
                              random_state=0, n_jobs=3)
    if target is not None:
        gp.fit(df_list, target)
    gp_feature = gp.transform(df_list)
    # Name the columns after the actual output width.  The number of operators
    # in function_set only matched n_components by coincidence.
    new_feature_name = [str(i) + 'V' for i in range(1, gp_feature.shape[1] + 1)]
    new_feature = pd.DataFrame(gp_feature, columns=new_feature_name)
    return new_feature
Exemple #12
0
def get_feature_symbolic_learning(df, gp_config):
    """
    Append SymbolicTransformer features to ``df``.

    The transformer is fitted on a random 80% of the rows (the split is not
    seeded) and then applied to every row.

    Parameters
    ----------
    df: pd.DataFrame,the input dataFrame.
    gp_config: GPConfig object, the config object of gplearn.SymbolicTransformer.
        Attributes read here: generation, population_size, hall_of_fame,
        n_components, function_set, parsimony_coefficient, max_samples,
        feature_cols, target_col.

    Returns
    -------
    df_t: pd.DataFrame, df with the features of SymbolicTransformer trans.
        The new features named like 'symbolic_component_{0 to n}'(n is the n_components)
    """

    gp = SymbolicTransformer(
        generations=gp_config.generation,
        population_size=gp_config.population_size,
        hall_of_fame=gp_config.hall_of_fame,
        n_components=gp_config.n_components,
        function_set=gp_config.function_set,
        parsimony_coefficient=gp_config.parsimony_coefficient,
        max_samples=gp_config.max_samples,
        verbose=1,
        random_state=0,
        n_jobs=3)

    X = df[gp_config.feature_cols]
    y = df[gp_config.target_col]
    # Only the training part of the split is needed.
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.2,
                                              shuffle=True)
    gp.fit(X_train, y_train)
    names = [
        "symbolic_component_" + str(i) for i in range(gp_config.n_components)
    ]
    # concat(axis=1) aligns on the index.  Reusing df's index keeps every
    # generated row next to the row it was computed from, even when df is not
    # indexed 0..n-1.
    res = pd.DataFrame(gp.transform(X), columns=names, index=df.index)
    df_t = pd.concat([df, res], axis=1)
    return df_t
Exemple #13
0
class GplearnGenerateFeature(object):
    """Generate genetic-programming features from CSV inputs and write them back as CSV.

    Intended call order, as far as the attribute dependencies show:
    ``data_prepare()`` -> ``feature_generate()`` -> ``data_output()``.
    """

    def __init__(self, input_path, output_path):
        """Store the input/output directories and initialise all state to None."""
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__feature_importance = None
        self.__feature_top_column = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]

        self.__categorical_columns = None
        self.__encoder = None
        self.__numeric_columns = None
        self.__filler = None

        # feature generate
        self.__genetic_transformer = None
        self.__genetic_train_feature = None
        self.__genetic_test_feature = None

    def data_prepare(self):
        """Read the inputs, keep the top-ranked columns, encode and impute them."""
        # Average the importance per feature, rank descending and keep the
        # names (column 0 after reset_index) of the first 200 rows.
        self.__feature_importance = pd.read_csv(
            os.path.join(self.__input_path,
                         "feature_importance_feature_data_V5.csv"))
        self.__feature_importance = (self.__feature_importance.groupby([
            "feature"
        ])["importance"].mean().to_frame("importance").reset_index(
            drop=False)).sort_values("importance",
                                     ascending=False).reset_index(drop=True)
        self.__feature_top_column = list(self.__feature_importance.iloc[0:200,
                                                                        0])

        # Load only the selected columns (plus TARGET for the training file).
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"),
            usecols=self.__feature_top_column + ["TARGET"])
        self.__test = pd.read_csv(os.path.join(self.__input_path,
                                               "test_select_feature_df.csv"),
                                  usecols=self.__feature_top_column)

        # Separate the label and give the test frame the training column order.
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # encoder: fitted on the training object-dtype columns and the label,
        # then applied to both frames.
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__train_feature[self.__categorical_columns])
        self.__test_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__test_feature[self.__categorical_columns])

        # filler: median imputation fitted on the training frame.  The column
        # list is computed after the encoding step above.
        # NOTE(review): presumably every column is numeric by now -- confirm
        # against the encoder's output dtypes.
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_columns])
        self.__train_feature[self.__numeric_columns] = self.__filler.transform(
            self.__train_feature[self.__numeric_columns])
        self.__test_feature[self.__numeric_columns] = self.__filler.transform(
            self.__test_feature[self.__numeric_columns])

    def feature_generate(self):
        """Fit the symbolic transformer on the training data and transform both frames."""
        self.__genetic_transformer = SymbolicTransformer(population_size=10000,
                                                         generations=200,
                                                         tournament_size=200,
                                                         metric="spearman",
                                                         n_jobs=-1,
                                                         verbose=1)
        self.__genetic_transformer.fit(self.__train_feature,
                                       self.__train_label)
        self.__genetic_train_feature = self.__genetic_transformer.transform(
            self.__train_feature)
        self.__genetic_test_feature = self.__genetic_transformer.transform(
            self.__test_feature)

    def data_output(self):
        """Wrap the generated arrays in DataFrames and write them to the output directory."""
        # Columns are named Genetic_0 .. Genetic_{k-1}.
        self.__genetic_train_feature = pd.DataFrame(
            self.__genetic_train_feature,
            columns=[
                "Genetic_" + str(i)
                for i in range(self.__genetic_train_feature.shape[1])
            ])
        self.__genetic_test_feature = pd.DataFrame(
            self.__genetic_test_feature,
            columns=[
                "Genetic_" + str(i)
                for i in range(self.__genetic_test_feature.shape[1])
            ])
        # Written without the index column.
        self.__genetic_train_feature.to_csv(os.path.join(
            self.__output_path, "genetic_train_feature.csv"),
                                            index=False)
        self.__genetic_test_feature.to_csv(os.path.join(
            self.__output_path, "genetic_test_feature.csv"),
                                           index=False)
Exemple #14
0
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering.

        Training time (no 'path_pipeline' in pars): fits a gplearn
        ``SymbolicTransformer`` on ``df[col]`` (forward-filled) against
        ``pars['dfy']``.  Inference time ('path_pipeline' in pars): loads the
        previously exported model instead.  In both cases ``df[col]`` is then
        transformed into columns named ``gen_<tag>_<i>``.

        pars keys read here:
            'function_set'         : optional operators for the programs.
            'pars_genetic'         : optional transformer keyword arguments;
                                     they override the defaults built here,
                                     including 'n_components' and
                                     'hall_of_fame'.
            'dfy'                  : target values (training time only).
            'path_pipeline'        : directory holding the exported model.
            'path_features_store',
            'path_pipeline_export' : when both are present at training time,
                                     features, model, columns, parameters and
                                     formulae are saved.

        Returns (df_genetic, col_pars); col_pars carries the prefix, the
        pipeline path and the new column names.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method='ffill').
    train_X = df[colX].ffill()
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  ####   0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    is_inference = 'path_pipeline' in pars
    if is_inference:  #### Inference time
        # Only the model is loaded.  `pars` must stay the caller's dict: the
        # path lookups at the end of the function read it.
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
    else:  ### Training time
        train_y = pars['dfy']
        # Merge the defaults with pars_genetic in one dict.  Passing
        # n_components / hall_of_fame both explicitly and through
        # **pars_genetic raises "multiple values for keyword argument" as soon
        # as the caller sets one of them.
        # NOTE(review): hall_of_fame was flagged "Buggy" by the author.
        # gplearn expects hall_of_fame <= population_size, which the defaults
        # above violate once there are 10 or more input columns -- confirm
        # the intended sizing.
        gp_kwargs = {
            'hall_of_fame': train_X.shape[1] + 1,
            'n_components': train_X.shape[1],
            'feature_names': feature_name_,
            'function_set': function_set,
        }
        gp_kwargs.update(pars_genetic)
        gp = SymbolicTransformer(**gp_kwargs)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    # Random tag in 0..10 used in the column names; it is not guaranteed to be
    # unique and differs between calls.
    tag = random.randint(0, 10)
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    # str(gp) is split on ",\n" after dropping the brackets; each piece is
    # paired with the generated column at the same position.
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    # Artifacts are only written at training time.
    if (not is_inference and 'path_features_store' in pars
            and 'path_pipeline_export' in pars):
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] +
                  f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
# Reorder the targets with `perm`; `boston` and `perm` are defined before this
# excerpt.
boston.target = boston.target[perm]

# Baseline: Ridge fitted on the first 300 rows, scored on the remaining rows.
est = Ridge()
est.fit(boston.data[:300, :], boston.target[:300])
print(est.score(boston.data[300:, :], boston.target[300:]))

del est

# Operators the genetic programs may combine.
function_set = [
    'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
    'min'
]
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=3)
# Fit on the same 300 rows the baseline was fitted on.
gp.fit(boston.data[:300, :], boston.target[:300])

# Transform every row and stack the generated columns to the right of the
# original ones.
gp_features = gp.transform(boston.data)
new_boston = np.hstack((boston.data, gp_features))

# Same fit/score split as the baseline, now on the augmented matrix.
est = Ridge()
est.fit(new_boston[:300, :], boston.target[:300])
print(est.score(new_boston[300:, :], boston.target[300:]))
Exemple #16
0
    # Body of a function whose header is outside this excerpt; `train`,
    # `test`, `y_train`, `gp` and `model_lgb` come from the enclosing code.
    cv = KFold(n_splits=6, shuffle=True, random_state=42)
    results = []
    feature_import = pd.DataFrame()
    sub_array = []
    # feature_import['name'] = train.columns

    y_train = y_train.values

    y_mean = np.mean(y_train)

    for model in [model_lgb]:
        for traincv, testcv in cv.split(train, y_train):
            # Fit the symbolic transformer on this fold's training rows only.
            gp.fit(train[traincv], y_train[traincv])

            # NOTE(review): `train` is rebound with the generated columns
            # appended, so each later fold fits on a matrix that already
            # contains the previous folds' generated features (`test` is
            # widened the same way below) -- confirm this accumulation is
            # intended.
            gp_features = gp.transform(train)
            print(gp_features)
            train = np.hstack((train, gp_features))

            m = model.fit(train[traincv],
                          y_train[traincv],
                          eval_set=[(train[testcv], y_train[testcv])],
                          early_stopping_rounds=150)

            # Score the held-out rows: half the mean squared error.
            y_tmp = m.predict(train[testcv], num_iteration=m.best_iteration)
            res = mean_squared_error(y_train[testcv], (y_tmp)) / 2
            results.append(res)

            t_gp_features = gp.transform(test)
            print(t_gp_features)
            test = np.hstack((test, t_gp_features))
]

# `function_set`, `total_df`, `train_idx` and `y` are defined before this
# excerpt.
gp = SymbolicTransformer(generations=10,
                         population_size=50000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=42,
                         n_jobs=4)

# Fit on the training rows only, then transform every row and store the
# result in a DataFrame with columns Genetic_0 .. Genetic_{k-1}.
# NOTE(review): genetic_df gets a fresh default index rather than
# total_df's -- confirm before combining the two frames by index.
gp.fit(total_df.iloc[train_idx], y)
gp_features = gp.transform(total_df)
genetic_df = pd.DataFrame(
    gp_features, columns=[f'Genetic_{i}' for i in range(gp_features.shape[1])])


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Taken from: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
def symbolic_features(p_x, p_y):
    """
    Function to create non-linear regressors.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        'fit'      : p_x with the generated features stacked to its right,
        'params'   : the transformer's get_params() output,
        'model'    : the fitted transformer,
        'features' : the generated features alone.

    best_p_dict: pd.DataFrame
        one row per selected program ('alpha_1', 'alpha_2', ...) with its
        fitness, expression, depth and length, sorted by ascending fitness.

    """
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=5000,
        hall_of_fame=20,
        n_components=10,
        tournament_size=20,
        generations=5,
        init_depth=(4, 8),
        init_method='half and half',
        parsimony_coefficient=0.1,
        const_range=None,
        metric='pearson',
        stopping_criteria=0.65,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        verbose=True,
        warm_start=True,
        n_jobs=-1,
        feature_names=p_x.columns)
    # fit() is sufficient: the features are computed by transform() just
    # below, so fit_transform() would only compute and discard them.
    model.fit(p_x, p_y)
    model_params = model.get_params()
    gp_features = model.transform(p_x)

    model_fit = np.hstack((p_x, gp_features))
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        "features": gp_features
    }

    # Number the selected programs by position.  enumerate() avoids a linear
    # list.index() search per program and keeps every entry distinct even if
    # the same program object appears more than once.
    best_p_dict = {}
    for rank, program in enumerate(model._best_programs, start=1):
        best_p_dict['alpha_' + str(rank)] = {
            'fitness': program.fitness_,
            "expression": str(program),
            'depth': program.depth_,
            "length": program.length_
        }

    best_p_dict = pd.DataFrame(best_p_dict).T
    best_p_dict = best_p_dict.sort_values(by="fitness")

    return results, best_p_dict
Exemple #19
0
# Combine features using gplearn's genetic method
data = datasets.load_boston()  # load the dataset
x, y = data.data, data.target  # split into x and y
print(x.shape)  # inspect the shape of x
print(x[0])  # inspect the first record of x
model_symbolic = SymbolicTransformer(n_components=5,
                                     generations=18,
                                     function_set=('add', 'sub', 'mul', 'div',
                                                   'sqrt', 'log', 'abs', 'neg',
                                                   'inv', 'max', 'min'),
                                     max_samples=0.9,
                                     metric='pearson',
                                     random_state=0,
                                     n_jobs=2)
model_symbolic.fit(x, y)  # fit on the data
symbolic_features = model_symbolic.transform(x)  # transform the data
print(symbolic_features.shape)  # print the shape
print(symbolic_features[0])  # print the first record
print(model_symbolic)  # print the formulas

# Readers can uncomment the code segment below to run it
#%%
'''
# 本段示例代码将输出重复的重复特征
reg_data = np.loadtxt('data5.txt')
x, y = reg_data[:, :-1], reg_data[:, -1]
model_symbolic = SymbolicTransformer(n_components=5, generations=18,
                                     function_set=(
                                         'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                                         'inv','max', 'min'),
                                     max_samples=0.9, metric='pearson',
Exemple #20
0
# Operators the genetic programs may combine.
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv']

gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=6)

# Fit on the training frame's numeric columns; `train`, `tt` and
# `numeric_feats` are defined before this excerpt.
gp.fit(train[numeric_feats], train['target'])

# Generate features for `tt` and append them as extra, integer-labelled
# columns.
# NOTE(review): the generated frame has a default 0..n-1 index and
# concat(axis=1) aligns on the index -- confirm `tt` is default-indexed too.
gp_feats = gp.transform(tt[numeric_feats])
tt = pd.concat([tt, pd.DataFrame(gp_feats)], axis=1)

### box cox transform
'''
#numeric_feats = tt.dtypes[tt.dtypes != 'object'].index 
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index
for feat in skewed_feats:
    tt[feat] = tt[feat] +10
    (tt[feat], lam) = boxcox(tt[feat])
    
'''