Ejemplo n.º 1
0
    def data_prepare(self):
        """Load the two-class digits data, scale it and append GP features.

        The data is split 80/20 with a fixed seed, both parts are standardised
        with a scaler fitted on the training part only, and a
        ``SymbolicTransformer`` fitted on the scaled training part supplies ten
        extra columns that are stacked to the right of the scaled features.
        Results are stored on the instance; nothing is returned.
        """
        self.__digists = load_digits(n_class=2)
        self.__X, self.__y = self.__digists.data, self.__digists.target

        split = train_test_split(
            self.__X, self.__y, test_size=0.2, random_state=9)
        (self.__train, self.__test,
         self.__train_label, self.__test_label) = split

        # Standardise using statistics learned from the training part only.
        scaler = StandardScaler()
        scaler.fit(self.__train)
        self.__train = scaler.transform(self.__train)
        self.__test = scaler.transform(self.__test)

        # Genetic-programming feature construction.
        gp_settings = dict(
            generations=5,
            population_size=2000,
            hall_of_fame=100,
            n_components=10,
            function_set=("add", "sub", "mul", "div", "sqrt", "log", "abs",
                          "neg", "inv", "max", "min"),
            parsimony_coefficient=0.0005,
            max_samples=0.9,
            verbose=1,
            random_state=0,
            n_jobs=3,
        )
        gp = SymbolicTransformer(**gp_settings)

        # Original author's note: obtaining the generated features in a
        # stacking fashion seems more reasonable.
        gp.fit(self.__train, self.__train_label)

        train_generated = gp.transform(self.__train)
        self.__train_gfeature = np.hstack((self.__train, train_generated))

        test_generated = gp.transform(self.__test)
        self.__test_gfeature = np.hstack((self.__test, test_generated))
Ejemplo n.º 2
0
def test_symbolic_transformer():
    """Check that the SymbolicTransformer documentation example works.

    A Ridge baseline is scored on the shuffled Boston data, then scored again
    after ten evolved features are appended; both scores are pinned.
    """
    rng = check_random_state(0)
    boston = load_boston()
    order = rng.permutation(boston.target.size)
    boston.data = boston.data[order]
    boston.target = boston.target[order]

    n_train = 300
    y_fit = boston.target[:n_train]
    y_eval = boston.target[n_train:]

    # Baseline: Ridge on the raw features.
    baseline = Ridge()
    baseline.fit(boston.data[:n_train, :], y_fit)
    baseline_score = baseline.score(boston.data[n_train:, :], y_eval)
    assert_almost_equal(baseline_score, 0.759319453049884)

    gp = SymbolicTransformer(
        generations=20,
        population_size=2000,
        hall_of_fame=100,
        n_components=10,
        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                      'abs', 'neg', 'inv', 'max', 'min'],
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        random_state=0)
    gp.fit(boston.data[:n_train, :], y_fit)

    # Augmented: raw features followed by the evolved ones.
    augmented = np.hstack((boston.data, gp.transform(boston.data)))

    improved = Ridge()
    improved.fit(augmented[:n_train, :], y_fit)
    improved_score = improved.score(augmented[n_train:, :], y_eval)
    assert_almost_equal(improved_score, 0.8418372105182055)
Ejemplo n.º 3
0
def symbolic_transformer(X, y, encoder=None):
    """Compute genetic-programming features for ``X``.

    When no encoder is supplied, a new ``SymbolicTransformer`` is built with
    fixed settings and fitted on ``(X, y)``. A supplied encoder is used as-is
    and is not refitted, so ``y`` is ignored in that case.

    Args:
        X (DataFrame): Independent features.
        y (Series): Dependent feature or target; only used when fitting.
        encoder (obj, optional): Object exposing ``transform``, normally a
            fitted 'SymbolicTransformer'. Defaults to None.

    Returns:
        tuple: ``(gp_features, encoder)`` - the output of
        ``encoder.transform(X)`` and the encoder that produced it.
    """
    if encoder is None:
        settings = dict(
            generations=10,
            population_size=1000,
            hall_of_fame=100,
            n_components=12,
            function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                          'abs', 'neg', 'inv', 'max', 'min'],
            parsimony_coefficient=0.0005,
            max_samples=0.9,
            verbose=1,
            random_state=123,
            n_jobs=-1,
        )
        encoder = SymbolicTransformer(**settings)
        encoder.fit(X, y)

    return encoder.transform(X), encoder
def test_transformer_iterable():
    """Check that the transformer supports len(), iteration and indexing."""
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)
    est = SymbolicTransformer(
        population_size=500,
        generations=2,
        function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                      'neg', 'inv', 'max', 'min'],
        random_state=0)

    # Before fitting there is nothing to count or iterate over.
    length_before = len(est)
    lengths_before = [program.length_ for program in est]
    assert_true(length_before == 0)
    assert_true(lengths_before == [])

    # After fitting the ten best programs are exposed in order.
    est.fit(X, y)
    length_after = len(est)
    lengths_after = [program.length_ for program in est]
    assert_true(length_after == 10)
    assert_true(lengths_after == [8, 12, 2, 29, 9, 33, 9, 8, 4, 22])

    # Indexing past the last program must raise.
    assert_raises(IndexError, est.__getitem__, 10)
Ejemplo n.º 5
0
def test_transformer_iterable():
    """Check that the transformer supports len(), iteration and indexing."""
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)
    est = SymbolicTransformer(generations=2, random_state=0)

    # Before fitting there is nothing to count or iterate over.
    length_before = len(est)
    lengths_before = [program.length_ for program in est]
    assert_true(length_before == 0)
    assert_true(lengths_before == [])

    # After fitting the ten best programs are exposed in order.
    est.fit(X, y)
    length_after = len(est)
    lengths_after = [program.length_ for program in est]
    assert_true(length_after == 10)
    assert_true(lengths_after == [15, 19, 19, 12, 9, 10, 7, 14, 6, 21])

    # Indexing past the last program must raise.
    assert_raises(IndexError, est.__getitem__, 10)
Ejemplo n.º 6
0
def test_output_shape():
    """Check that transform() yields one column per requested component."""
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)

    # Five samples in, five components requested -> a (5, 5) result.
    transformer = SymbolicTransformer(
        n_components=5, generations=2, random_state=0)
    transformer.fit(X, y)
    transformed = transformer.transform(X)
    assert_true(transformed.shape == (5, 5))
Ejemplo n.º 7
0
def test_output_shape():
    """Check that the transformed array has the expected (rows, components) shape."""
    state = check_random_state(415)
    features = state.uniform(size=50).reshape(5, 10)
    targets = state.uniform(size=5)

    # Fit a small transformer and inspect the shape of its output.
    est = SymbolicTransformer(n_components=5, generations=2, random_state=0)
    est.fit(features, targets)
    output_shape = est.transform(features).shape
    assert_true(output_shape == (5, 5))
Ejemplo n.º 8
0
class GplearnDemo(object):
    """Compare Ridge regression on Boston data with and without GP features.

    Call ``data_prepare()`` first, then ``model_fit_predict()``; the latter
    prints the test mean squared error for the raw features and for the raw
    features extended with ``SymbolicTransformer`` output.
    """

    def __init__(self):
        # Populated by data_prepare().
        self.__boston = None
        self.__boston_feature = None
        self.__boston_label = None
        self.__train_feature = self.__test_feature = None
        self.__train_label = self.__test_label = None
        self.__transformer = None
        self.__gp_train_feature = None
        self.__gp_test_feature = None

        # Populated by model_fit_predict().
        self.__regressor = None

    def data_prepare(self):
        """Load the data, split it in half and compute the GP features."""
        self.__boston = load_boston()
        self.__boston_feature = pd.DataFrame(
            self.__boston.data, columns=self.__boston.feature_names)
        target_frame = pd.Series(self.__boston.target).to_frame("TARGET")
        self.__boston_label = target_frame.squeeze()

        split = train_test_split(self.__boston_feature,
                                 self.__boston_label,
                                 test_size=0.5,
                                 shuffle=True)
        (self.__train_feature, self.__test_feature,
         self.__train_label, self.__test_label) = split

        # Original author's note: the input must not contain missing values.
        self.__transformer = SymbolicTransformer(n_jobs=4)
        self.__transformer.fit(self.__train_feature, self.__train_label)
        self.__gp_train_feature = self.__transformer.transform(
            self.__train_feature)
        self.__gp_test_feature = self.__transformer.transform(
            self.__test_feature)

    def model_fit_predict(self):
        """Print the test MSE of Ridge on raw, then on GP-augmented features."""
        # Baseline on the raw features.
        self.__regressor = Ridge()
        self.__regressor.fit(self.__train_feature, self.__train_label)
        baseline_pred = self.__regressor.predict(self.__test_feature)
        print(mean_squared_error(self.__test_label, baseline_pred))

        # Same model on raw features followed by the GP features.
        self.__regressor = Ridge()
        train_augmented = np.hstack(
            (self.__train_feature.values, self.__gp_train_feature))
        self.__regressor.fit(train_augmented, self.__train_label)
        test_augmented = np.hstack(
            (self.__test_feature.values, self.__gp_test_feature))
        augmented_pred = self.__regressor.predict(test_augmented)
        print(mean_squared_error(self.__test_label, augmented_pred))
Ejemplo n.º 9
0
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Find symbolic formulae for feature engineering.

    Fits a ``SymbolicTransformer`` on every column of ``col`` except the
    target ``pars['coly']`` and returns the generated features.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        col: Iterable of column names to consider.
        pars: Dict; must contain 'coly'. Optional keys: 'pars_genetic'
            (extra keyword arguments for the transformer) and the pair
            'path_features_store' / 'path_pipeline_export' (enables saving).

    Returns:
        tuple: ``(df_genetic, col_pars)`` - the generated features as a
        DataFrame indexed like the input, and a dict holding the fitted
        model, its parameters and the new column names.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    coly = pars['coly']
    colX = [name for name in col if name not in (coly,)]
    train_X, train_y = df[colX], df[coly]

    default_genetic = {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    }
    pars_genetic = pars.get('pars_genetic', default_genetic)

    gp = SymbolicTransformer(
        hall_of_fame=100,
        function_set=[
            'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv',
            'tan'
        ],
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=0,
        n_jobs=6,
        **pars_genetic)
    gp.fit(train_X, train_y)

    generated = gp.transform(train_X)
    col_genetic = ["gen_" + str(a) for a in range(generated.shape[1])]
    df_genetic = pd.DataFrame(generated, columns=col_genetic)
    # Re-attach the caller's index so rows line up with the input frame.
    df_genetic.index = train_X.index

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        export_dir = pars['path_pipeline_export']
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, export_dir + f"/{prefix}_model.pkl")
        save(col_genetic, export_dir + f"/{prefix}.pkl")
        save(pars_genetic, export_dir + f"/{prefix}_pars.pkl")

    col_pars = {
        'model': gp,
        'pars': pars_genetic,
        'cols_new': {
            'col_genetic': col_genetic  ### list
        },
    }
    return df_genetic, col_pars
Ejemplo n.º 10
0
def test_custom_transformer_metrics():
    """Check whether greater_is_better works for SymbolicTransformer.

    A hand-written negated Pearson fitness with ``greater_is_better=False``
    must evolve the same program as the built-in 'pearson' metric.
    """
    shared_settings = dict(generations=2,
                           population_size=100,
                           hall_of_fame=10,
                           n_components=1,
                           random_state=415)

    est_gp = SymbolicTransformer(metric='pearson', **shared_settings)
    est_gp.fit(boston.data, boston.target)
    # n_components=1, so the loop leaves the single best program's formula.
    for program in est_gp:
        formula = str(program)
    expected_formula = ('sub(div(mul(X4, X12), div(X9, X9)), '
                        'sub(div(X11, X12), add(X12, X0)))')
    assert_equal(expected_formula, formula, True)

    def _neg_weighted_pearson(y, y_pred, w):
        """Calculate the negated absolute weighted Pearson correlation."""
        with np.errstate(divide='ignore', invalid='ignore'):
            y_pred_demean = y_pred - np.average(y_pred, weights=w)
            y_demean = y - np.average(y, weights=w)
            covariance = np.sum(w * y_pred_demean * y_demean) / np.sum(w)
            variance_product = (
                (np.sum(w * y_pred_demean**2) * np.sum(w * y_demean**2)) /
                (np.sum(w)**2))
            corr = covariance / np.sqrt(variance_product)
        if not np.isfinite(corr):
            return 0.
        return -1 * np.abs(corr)

    neg_weighted_pearson = make_fitness(function=_neg_weighted_pearson,
                                        greater_is_better=False)

    c_est_gp = SymbolicTransformer(stopping_criteria=-1,
                                   metric=neg_weighted_pearson,
                                   **shared_settings)
    c_est_gp.fit(boston.data, boston.target)
    for program in c_est_gp:
        c_formula = str(program)
    assert_equal(expected_formula, c_formula, True)
def test_function_in_program():
    """Check that using a custom function in a program works"""
    def logic(x1, x2, x3, x4):
        # Element-wise: pick x3 where x1 > x2, otherwise x4.
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)
    est = SymbolicTransformer(
        generations=2,
        population_size=2000,
        hall_of_fame=100,
        n_components=10,
        function_set=['add', 'sub', 'mul', 'div', logical],
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        random_state=0)
    est.fit(boston.data[:300, :], boston.target[:300])

    # A pinned program from the initial generation that uses the custom node.
    program = est._programs[0][906]
    formula = str(program)
    expected_formula = 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)'
    assert_equal(expected_formula, formula, True)
Ejemplo n.º 12
0
def getSymbolTrans(train, valid, y, random_state=888):
    """Append SymbolicTransformer features to the train and validation frames.

    The transformer is fitted on ``train`` / ``y`` only and then applied to
    both frames. The generated columns are named ``ST_1`` .. ``ST_n`` and any
    NaN in the resulting frames is replaced by 0.

    Args:
        train: Training feature DataFrame (copied, not modified).
        valid: Validation feature DataFrame (copied, not modified).
        y: Training target aligned with ``train``.
        random_state: Currently unused - the transformer is seeded with 0.
            NOTE(review): wiring this through would change the generated
            features for existing callers, so it is left as-is.

    Returns:
        tuple: ``(X_train, X_valid)`` with the generated columns appended.
    """
    X_train = train.copy()
    X_valid = valid.copy()
    y_train = y.copy()
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
        'min'
    ]

    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=0,
                             n_jobs=3)

    gp.fit(X_train, y_train)

    def _with_gp_features(frame):
        """Return ``frame`` plus the generated columns, NaNs filled with 0."""
        generated = gp.transform(frame)
        # Reuse the frame's own index: join() aligns on index labels, so a
        # default RangeIndex here would pair rows with the wrong features (or
        # with NaN) whenever ``frame`` is not indexed 0..n-1.
        generated_df = pd.DataFrame(
            generated,
            index=frame.index,
            columns=[
                "ST_" + str(i) for i in range(1, generated.shape[1] + 1)
            ])
        return frame.join(generated_df).fillna(0)

    X_train = _with_gp_features(X_train)
    X_valid = _with_gp_features(X_valid)

    return (X_train, X_valid)
Ejemplo n.º 13
0
def test_custom_functions():
    """Test that the custom-function documentation example works.

    Pins both the string form and the Graphviz export of one program that
    uses the user-defined 'logical' node.
    """
    rng = check_random_state(0)
    boston = load_boston()
    order = rng.permutation(boston.target.size)
    boston.data = boston.data[order]
    boston.target = boston.target[order]

    def logic(x1, x2, x3, x4):
        # Element-wise: pick x3 where x1 > x2, otherwise x4.
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)

    gp = SymbolicTransformer(
        generations=2,
        population_size=2000,
        hall_of_fame=100,
        n_components=10,
        function_set=['add', 'sub', 'mul', 'div', logical],
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        random_state=0)
    gp.fit(boston.data[:300, :], boston.target[:300])

    program = gp._programs[0][906]
    assert_equal(str(program),
                 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)')

    dot_data = program.export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="logical", '
                'fillcolor="#136ed4"] ;\n2 [label="X6", fillcolor="#60a6f6"] '
                ';\n3 [label="add", fillcolor="#136ed4"] ;\n4 [label="X11", '
                'fillcolor="#60a6f6"] ;\n5 [label="0.898", '
                'fillcolor="#60a6f6"] ;\n3 -> 5 ;\n3 -> 4 ;\n6 [label="X10", '
                'fillcolor="#60a6f6"] ;\n7 [label="X2", fillcolor="#60a6f6"] '
                ';\n1 -> 7 ;\n1 -> 6 ;\n1 -> 3 ;\n1 -> 2 ;\n8 [label="X5", '
                'fillcolor="#60a6f6"] ;\n0 -> 8 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
Ejemplo n.º 14
0
def get_feature_symbolic_learning(df, gp_config):
    """Append SymbolicTransformer features to ``df``.

    The transformer is fitted on a shuffled 80% sample of the rows and then
    applied to every row.

    Parameters
    ----------
    df: pd.DataFrame, the input DataFrame.
    gp_config: GPConfig object, the config object of gplearn.SymbolicTransformer.
        The attributes read here are generation, population_size,
        hall_of_fame, n_components, function_set, parsimony_coefficient,
        max_samples, feature_cols and target_col.

    Returns
    -------
    df_t: pd.DataFrame, df with the features of SymbolicTransformer trans.
        The new features named like 'symbolic_component_{0 to n}'(n is the n_components)
    """

    gp = SymbolicTransformer(
        generations=gp_config.generation,
        population_size=gp_config.population_size,
        hall_of_fame=gp_config.hall_of_fame,
        n_components=gp_config.n_components,
        function_set=gp_config.function_set,
        parsimony_coefficient=gp_config.parsimony_coefficient,
        max_samples=gp_config.max_samples,
        verbose=1,
        random_state=0,
        n_jobs=3)

    X = df[gp_config.feature_cols]
    y = df[gp_config.target_col]
    # Only the training part of the split is used; the held-out 20% is
    # deliberately excluded from fitting. No seed is given, so the sample
    # differs between runs.
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.2,
                                              shuffle=True)
    gp.fit(X_train, y_train)
    names = [
        "symbolic_component_" + str(i) for i in range(gp_config.n_components)
    ]
    # Carry over df's index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex would misalign (or add NaN rows) whenever df is not
    # indexed 0..n-1.
    res = pd.DataFrame(gp.transform(X), columns=names, index=df.index)
    df_t = pd.concat([df, res], axis=1)
    return df_t
Ejemplo n.º 15
0
def gp_features(df,
                target,
                random_state,
                generations=5,
                function_set=('add', 'sub', 'mul', 'div')):
    """Fit a SymbolicTransformer on ``df`` and add its features.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target: Name of the target column in ``df``.
        random_state: Seed forwarded to the transformer.
        generations: Number of generations to evolve.
        function_set: Iterable of function names (or gplearn function
            objects) available to the programs. The default is an immutable
            tuple so it cannot be shared and mutated between calls.

    Returns:
        tuple: ``(df, transform)`` - the frame returned by ``gp_transform``
        and the fitted transformer's bound ``transform`` method.
    """
    X = df.loc[:, (df.columns != target)]
    y = df.loc[:, target]

    gp = SymbolicTransformer(generations=generations,
                             population_size=1000,
                             hall_of_fame=100,
                             n_components=12,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=random_state,
                             n_jobs=-1)
    # The model is fitted on the one-hot encoded features.
    gp.fit(pd.get_dummies(X), y)
    # NOTE(review): gp_transform receives the un-encoded X; presumably it
    # applies the same get_dummies encoding internally - confirm.
    df = gp_transform(df, gp.transform, X)

    return df, gp.transform
Ejemplo n.º 16
0
    def fit(self, X, y=None, state=None):
        """Fit a SymbolicTransformer and store it in ``self.state``.

        Args:
            X: Training features passed to ``SymbolicTransformer.fit``.
            y: Training target passed to ``SymbolicTransformer.fit``.
            state: Unused; kept for interface compatibility. The default is
                ``None`` rather than a shared mutable dict.

        Returns:
            self, with the result of ``gp.fit(X, y)`` stored under
            ``self.state['genetic']['fit']``.
        """
        # Custom exponential node wrapping the externally defined `exponent`.
        exponential = make_function(function=exponent, name='exp', arity=1)

        function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
                        'min', 'tan', 'sin', 'cos', exponential]

        gp = SymbolicTransformer(generations=self.generations, population_size=self.population,
                                 hall_of_fame=self.hall_of_fame, n_components=self.components,
                                 function_set=function_set,
                                 parsimony_coefficient='auto',
                                 max_samples=0.6, verbose=1, metric=self.metric,
                                 random_state=0, n_jobs=7)

        # Any previous 'genetic' entry is discarded on every fit.
        self.state['genetic'] = {}
        self.state['genetic']['fit'] = gp.fit(X, y)

        return self
Ejemplo n.º 17
0
# Script fragment: `boston` and `perm` are defined before this point (not
# visible here). The target is reordered with the permutation `perm`.
boston.target = boston.target[perm]

# Baseline: Ridge fitted on the first 300 rows, scored on the remainder.
est = Ridge()
est.fit(boston.data[:300, :], boston.target[:300])
print(est.score(boston.data[300:, :], boston.target[300:]))

del est

function_set = [
    'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
    'min'
]
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=3)
# The transformer is fitted on the same 300 training rows only.
gp.fit(boston.data[:300, :], boston.target[:300])

# Generate features for every row and append them to the raw features.
gp_features = gp.transform(boston.data)
new_boston = np.hstack((boston.data, gp_features))

# Same Ridge setup on the augmented features, for comparison with the
# baseline score printed above.
est = Ridge()
est.fit(new_boston[:300, :], boston.target[:300])
print(est.score(new_boston[300:, :], boston.target[300:]))
Ejemplo n.º 18
0
# Fit a SymbolicTransformer on stock data, pickle it and tabulate its best
# programs. `init_function`, `user_function`, `rankIC_metric`, `fields`,
# `stock_price` and `target` are defined elsewhere.
generations = 3  # number of generations to evolve
population_size = 1000  # number of formulas in each generation
tournament_size = 200  # formulas randomly selected per generation to compute fitness
const_range = (0.0, 10.0)
function_set = init_function + user_function  # function operators
metric = rankIC_metric  # objective function used as fitness
random_state = 200812  # random seed
# NOTE: `metric` is not passed below (it is commented out), so the
# transformer's default metric is used.
factor_gp = SymbolicTransformer(feature_names=fields,
                                function_set=function_set,
                                generations=generations,
                                population_size=population_size,
                                tournament_size=tournament_size,
                                const_range=const_range,
                                random_state=random_state)  #, metric=metric)
factor_gp.fit(stock_price, target)

with open(r'D:\work\back_test_system\FactorBackTest\gp_model.pkl', 'wb') as f:
    pickle.dump(factor_gp, f)

best_programs = factor_gp._best_programs
best_programs_dict = {}
# enumerate() numbers the programs by position directly; the previous
# list.index(p) lookup rescanned the list on every iteration and would give
# two entries the same name if the list ever held equal programs.
for rank, p in enumerate(best_programs, start=1):
    factor_name = 'alpha_' + str(rank)
    best_programs_dict[factor_name] = {
        'fitness': p.fitness_,
        'expression': str(p),
        'depth': p.depth_,
        'length': p.length_
    }
# One row per factor, one column per recorded property.
best_programs_dict = pd.DataFrame(best_programs_dict).T
Ejemplo n.º 19
0
# Combine features using gplearn's genetic method
data = datasets.load_boston()  # load the dataset
x, y = data.data, data.target  # split into x and y
print(x.shape)  # inspect the shape of x
print(x[0])  # inspect the first record of x
model_symbolic = SymbolicTransformer(n_components=5,
                                     generations=18,
                                     function_set=('add', 'sub', 'mul', 'div',
                                                   'sqrt', 'log', 'abs', 'neg',
                                                   'inv', 'max', 'min'),
                                     max_samples=0.9,
                                     metric='pearson',
                                     random_state=0,
                                     n_jobs=2)
model_symbolic.fit(x, y)  # fit on the data
symbolic_features = model_symbolic.transform(x)  # transform the data
print(symbolic_features.shape)  # print the shape
print(symbolic_features[0])  # print the first record
print(model_symbolic)  # print the formulas

# Readers can uncomment and run the code segment below
#%%
'''
# 本段示例代码将输出重复的重复特征
reg_data = np.loadtxt('data5.txt')
x, y = reg_data[:, :-1], reg_data[:, -1]
model_symbolic = SymbolicTransformer(n_components=5, generations=18,
                                     function_set=(
                                         'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                                         'inv','max', 'min'),
Ejemplo n.º 20
0
# Build GP features from the float64 columns of the combined frame `tt`.
# `tt` and `train` are defined elsewhere; the transformer is fitted on
# `train` only and then applied to every row of `tt`.
numeric_feats = tt.dtypes[tt.dtypes == np.float64].index
numeric_feats = numeric_feats.drop('target')
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv']

gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=6)

gp.fit(train[numeric_feats], train['target'])

gp_feats = gp.transform(tt[numeric_feats])
# Give the new frame tt's index: concat(axis=1) aligns on index labels, so a
# default RangeIndex would misalign the rows whenever tt is not indexed
# 0..n-1 (for example after stacking train and test without resetting).
tt = pd.concat([tt, pd.DataFrame(gp_feats, index=tt.index)], axis=1)

### box cox transform
'''
#numeric_feats = tt.dtypes[tt.dtypes != 'object'].index 
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index
for feat in skewed_feats:
    tt[feat] = tt[feat] +10
    (tt[feat], lam) = boxcox(tt[feat])
    
Ejemplo n.º 21
0
                                  min_data_in_leaf=6,
                                  min_sum_hessian_in_leaf=11)

    cv = KFold(n_splits=6, shuffle=True, random_state=42)
    results = []
    feature_import = pd.DataFrame()
    sub_array = []
    # feature_import['name'] = train.columns

    y_train = y_train.values

    y_mean = np.mean(y_train)

    for model in [model_lgb]:
        for traincv, testcv in cv.split(train, y_train):
            gp.fit(train[traincv], y_train[traincv])

            gp_features = gp.transform(train)
            print(gp_features)
            train = np.hstack((train, gp_features))

            m = model.fit(train[traincv],
                          y_train[traincv],
                          eval_set=[(train[testcv], y_train[testcv])],
                          early_stopping_rounds=150)

            y_tmp = m.predict(train[testcv], num_iteration=m.best_iteration)
            res = mean_squared_error(y_train[testcv], (y_tmp)) / 2
            results.append(res)

            t_gp_features = gp.transform(test)
Ejemplo n.º 22
0
# Fit a SymbolicTransformer with a custom metric, pickle it and collect its
# best programs. `init_function`, `user_function`, `my_metric`, `fields`,
# `X_train` and `y_train` are defined elsewhere.
generations = 3  # number of generations to evolve
population_size = 1000  # number of formulas in each generation
tournament_size = 20  # formulas randomly selected per generation to compute fitness
const_range = (0.0, 10.0)
function_set = init_function + user_function  # function operators
metric = my_metric  # objective function used as fitness
random_state = 316  # random seed
est_gp = SymbolicTransformer(feature_names=fields,
                             function_set=function_set,
                             generations=generations,
                             metric=metric,
                             population_size=population_size,
                             tournament_size=tournament_size,
                             const_range=const_range,
                             random_state=random_state)
est_gp.fit(X_train, y_train)

with open(r'D:\work\back_test_system\FactorBackTest\gp_model.pkl', 'wb') as f:
    pickle.dump(est_gp, f)

best_programs = est_gp._best_programs
best_programs_dict = {}
# enumerate() numbers the programs by position directly; the previous
# list.index(p) lookup rescanned the list on every iteration and would give
# two entries the same name if the list ever held equal programs.
for rank, p in enumerate(best_programs, start=1):
    factor_name = 'alpha_' + str(rank)
    best_programs_dict[factor_name] = {
        'fitness': p.fitness_,
        'expression': str(p),
        'depth': p.depth_,
        'length': p.length_
    }
    'min'
]

# Script fragment: `function_set`, `total_df`, `train_idx` and `y` are
# defined before this point (not fully visible here).
gp = SymbolicTransformer(generations=10,
                         population_size=50000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=42,
                         n_jobs=4)

# Fit & save to dataframe
# The fit uses only the rows selected by train_idx; the transform covers all
# rows of total_df.
gp.fit(total_df.iloc[train_idx], y)
gp_features = gp.transform(total_df)
# NOTE(review): genetic_df gets a default RangeIndex rather than
# total_df.index - confirm before joining it back onto total_df.
genetic_df = pd.DataFrame(
    gp_features, columns=[f'Genetic_{i}' for i in range(gp_features.shape[1])])


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Taken from: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
Ejemplo n.º 24
0
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Find symbolic formulae for feature engineering.

    At training time a ``SymbolicTransformer`` is fitted on ``df[col]``
    (forward-filled) against ``pars['dfy']``. At inference time, when
    ``pars`` contains 'path_pipeline', a previously saved model is loaded
    instead. In both cases the model transforms ``df[col]``.

    Args:
        df: DataFrame holding the feature columns.
        col: Column name(s) to use as features.
        pars: Dict of options. Keys read here: 'function_set',
            'pars_genetic', 'path_pipeline', 'dfy', 'path_features_store',
            'path_pipeline_export'.

    Returns:
        tuple: ``(df_genetic, col_pars)`` - the generated features as a
        DataFrame indexed like the input, and a dict with the prefix, the
        pipeline path and the list of new column names.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    # .ffill() is the non-deprecated spelling of fillna(method='ffill').
    train_X = df[colX].ffill()
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  ####   0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        # NOTE(review): this rebinds `pars` to the saved parameter dict, so
        # the path lookups further down no longer see the caller's keys -
        # confirm this is intended.
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        train_y = pars['dfy']
        # Start from the computed defaults and let pars_genetic override
        # them. Passing n_components explicitly together with **pars_genetic
        # raised "got multiple values for keyword argument" as soon as
        # pars_genetic contained 'n_components'.
        gp_kwargs = {
            'hall_of_fame': train_X.shape[1] + 1,  ### Buggy
            'n_components': train_X.shape[1],
            'feature_names': feature_name_,
            'function_set': function_set,
        }
        gp_kwargs.update(pars_genetic)
        gp = SymbolicTransformer(**gp_kwargs)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    # NOTE(review): only 11 possible values, so the tag is not guaranteed to
    # be unique across calls.
    tag = random.randint(0, 10)  #### UNIQUE TAG
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    # str(gp) is parsed as a bracketed list with one program per line.
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] +
                  f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
Ejemplo n.º 25
0
    # Fit a SymbolicTransformer with the MSLE metric and tabulate its best
    # programs. `init_function`, `user_function`, `MSLE`, `fields`, `train_X`
    # and `train_y` are defined outside this fragment.
    generations = 50
    function_set = init_function + user_function
    metric = MSLE
    population_size = 100
    random_state = 0
    est_gp = SymbolicTransformer(
        feature_names=fields,
        function_set=function_set,
        generations=generations,
        metric=metric,
        population_size=population_size,
        tournament_size=20,
        random_state=random_state,
    )

    est_gp.fit(train_X, train_y)

    best_programs = est_gp._best_programs
    best_programs_dict = {}
    # enumerate() numbers the programs by position directly; the previous
    # list.index(p) lookup rescanned the list on every iteration and would
    # give two entries the same key if the list ever held equal programs.
    for rank, p in enumerate(best_programs, start=1):
        factor_name = str(rank)
        best_programs_dict[factor_name] = {
            'fitness': p.fitness_,
            'expression': str(p),
            'depth': p.depth_,
            'length': p.length_
        }

    # One row per program, ordered by ascending fitness.
    best_programs_dict = pd.DataFrame(best_programs_dict).T
    best_programs_dict = best_programs_dict.sort_values(by='fitness')
Ejemplo n.º 26
0
def _rank_expressions_by_return(expressions, feature_df, logret,
                                signal_ref_data, q_lower, q_upper,
                                strat_flag, return_column):
    """Score factor expressions by the summed return of their signals.

    Each expression string is evaluated against the columns of
    ``feature_df``, turned into a signal by ``_generate_signal`` and scored
    as ``np.sum(signal * logret)``.

    Args:
        expressions: Iterable of expression strings to evaluate.
        feature_df: DataFrame whose columns are exposed, by name, as the
            local variables of each expression.
        logret: Array of returns multiplied element-wise with the signal.
        signal_ref_data: Forwarded to ``_generate_signal`` as ``n``.
        q_lower: Forwarded to ``_generate_signal``.
        q_upper: Forwarded to ``_generate_signal``.
        strat_flag: Forwarded to ``_generate_signal`` as ``flag``.
        return_column: Name of the score column in the result.

    Returns:
        DataFrame indexed by ``'expression'`` with one column named
        ``return_column``, in the order the expressions were given.
    """
    # The name -> Series mapping is identical for every expression, so it is
    # built once rather than once per expression.
    namespace = feature_df.to_dict(orient="series")
    rows = []
    for expr in expressions:
        # NOTE(security): eval() executes the expression text. This is only
        # acceptable because the strings are produced by the fitted model in
        # this process; never route externally supplied text through here.
        factor_values = eval(expr, function_set_dict, namespace)
        signal = _generate_signal(factor_values,
                                  n=signal_ref_data,
                                  q_lower=q_lower,
                                  q_upper=q_upper,
                                  flag=strat_flag)
        rows.append([expr, np.sum(signal * logret)])
    return pd.DataFrame(rows, columns=['expression',
                                       return_column]).set_index('expression')


def run_ga_industry(industry_sym,
                    industry_code,
                    data_directory,
                    start_date_int,
                    end_date_int,
                    metric,
                    signal_ref_data,
                    q_lower,
                    q_upper,
                    strat_flag,
                    population_size,
                    tournament_size,
                    generations,
                    hall_of_fame,
                    n_components,
                    factor_filter,
                    n_jobs=1,
                    verbose=0):
    """Evolve factors for one industry, backtest them and return opinions.

    Reads ``{data_directory}/{industry_code}.parq``, fits a
    ``SymbolicTransformer`` on all but the last 10% of usable rows, ranks the
    resulting expressions by in-sample return, re-ranks the best of those on
    the held-out tail, and finally evaluates the surviving expressions on the
    most recent rows to obtain their latest signal values.

    Relies on module-level ``function_set``, ``function_set_dict``,
    ``clean_gplearn_programs`` and ``_generate_signal``.

    Args:
        industry_sym: Suffix of the ``'close_' + industry_sym`` column used
            to build the target; also echoed in the output.
        industry_code: File stem of the parquet file to load.
        data_directory: Directory containing the parquet file.
        start_date_int: Lower label bound for the ``.loc`` row slice.
        end_date_int: Upper label bound for the ``.loc`` row slice.
        metric: Fitness metric passed to ``SymbolicTransformer``.
        signal_ref_data: Passed to ``_generate_signal`` as ``n``; also sizes
            the look-back added to the test window and the final reference
            window (``2 * signal_ref_data`` rows).
        q_lower: Forwarded to ``_generate_signal``.
        q_upper: Forwarded to ``_generate_signal``.
        strat_flag: Forwarded to ``_generate_signal`` as ``flag``.
        population_size: Forwarded to ``SymbolicTransformer``.
        tournament_size: Forwarded to ``SymbolicTransformer``.
        generations: Forwarded to ``SymbolicTransformer``.
        hall_of_fame: Forwarded to ``SymbolicTransformer``.
        n_components: Forwarded to ``SymbolicTransformer``.
        factor_filter: Indexable pair; ``[0]`` is how many top in-sample
            expressions advance to the test stage and ``[1]`` how many top
            out-of-sample expressions are kept. NOTE(review): a value of 0
            makes the ``iloc[-0:]`` slice keep every row, not none.
        n_jobs: Forwarded to ``SymbolicTransformer``.
        verbose: Forwarded to ``SymbolicTransformer``.

    Returns:
        ``[industry_sym, ew_opinion, best_opinions, best_factors]`` where
        ``best_factors`` are the surviving expression strings,
        ``best_opinions`` the last signal value of each, and ``ew_opinion``
        their sum.

    Raises:
        ValueError: If too few rows remain after ``dropna`` to hold out at
            least one test row.
    """
    industry_data = pd.read_parquet(f'{data_directory}/{industry_code}.parq')
    industry_data = industry_data.loc[start_date_int:end_date_int, :].copy()

    # Work on a copy so `industry_data` keeps its original columns for the
    # final reference window below.
    data0 = industry_data.copy()
    # Target: log-return of the close column, shifted so that each row
    # carries the return to the following row.
    data0['pct1'] = np.log(data0['close_' + industry_sym]).diff().shift(-1)
    dataset = data0.dropna()
    data = dataset.drop('pct1', axis=1).values
    ga_train_fields = dataset.drop('pct1', axis=1).columns

    target = dataset['pct1'].values
    test_size = 0.1
    test_num = int(len(data) * test_size)
    if test_num < 1:
        # With test_num == 0 every `[:-test_num]` slice below would be empty
        # (`x[:-0]` is `x[:0]`), so fail here with a clear message instead.
        raise ValueError(
            f"not enough rows to hold out a test window: {len(data)} usable "
            f"row(s) with test_size={test_size}")

    X_train = data[:-test_num].copy()
    X_train_df = dataset[ga_train_fields].iloc[:-test_num].copy()
    y_train = np.nan_to_num(target[:-test_num].copy())

    # The test window reaches back `signal_ref_data - 1` extra rows before
    # the held-out tail (presumably look-back for the signal - confirm
    # against _generate_signal).
    test_backward_i0 = test_num + signal_ref_data - 1
    X_test_df = dataset[ga_train_fields].iloc[-test_backward_i0:].copy()

    # ================================================================================
    # Fitting
    # --------------------------------------------------------------------------------
    est_gp = SymbolicTransformer(
        population_size=population_size,
        tournament_size=tournament_size,
        generations=generations,
        hall_of_fame=hall_of_fame,
        n_components=n_components,
        # NOTE(review): presumably chosen so the stopping threshold is never
        # reached and all generations run - confirm against gplearn docs.
        stopping_criteria=np.inf,
        const_range=None,
        function_set=function_set,
        metric=metric,
        parsimony_coefficient=0.0001,
        max_samples=1.0,
        feature_names=ga_train_fields,
        n_jobs=n_jobs,
        verbose=verbose,
        random_state=10,
    )
    est_gp.fit(X_train, y_train, sample_weight=None)

    # ================================================================================
    # Process programs
    # --------------------------------------------------------------------------------
    program_df = clean_gplearn_programs(est_gp._programs, verbose=0)
    function_expressions = program_df['expression'].values

    # ================================================================================
    # Backtest Overview
    # --------------------------------------------------------------------------------
    # In-sample: score every expression on the training rows and keep the
    # factor_filter[0] highest totals.
    train_ov = _rank_expressions_by_return(
        function_expressions,
        X_train_df,
        dataset.iloc[:-test_num]['pct1'].values,
        signal_ref_data,
        q_lower,
        q_upper,
        strat_flag,
        'totret_is')
    best_train_factor = train_ov.sort_values(
        'totret_is').iloc[-factor_filter[0]:].index.tolist()

    # Out-of-sample: re-score the survivors on the test window and keep the
    # factor_filter[1] highest totals.
    test_ov = _rank_expressions_by_return(
        best_train_factor,
        X_test_df,
        dataset.iloc[-test_backward_i0:]['pct1'].values,
        signal_ref_data,
        q_lower,
        q_upper,
        strat_flag,
        'totret_oos')
    best_factors = test_ov.sort_values(
        "totret_oos").iloc[-factor_filter[1]:].index.tolist()

    # Latest opinion: evaluate each surviving factor on the most recent
    # 2 * signal_ref_data rows and take the final signal value.
    _ref_data = industry_data.iloc[-signal_ref_data *
                                   2:].to_dict(orient="series")
    # NOTE(security): eval() on model-generated expression text only; see the
    # matching note in _rank_expressions_by_return.
    best_opinions = [
        _generate_signal(eval(expr, function_set_dict, _ref_data),
                         n=signal_ref_data,
                         q_lower=q_lower,
                         q_upper=q_upper,
                         flag=strat_flag)[-1] for expr in best_factors
    ]
    ew_opinion = np.sum(best_opinions)
    output = [industry_sym, ew_opinion, best_opinions, best_factors]
    return output
Ejemplo n.º 27
0
class GplearnGenerateFeature(object):
    """Build genetic-programming features with a ``SymbolicTransformer``.

    Intended call order: ``data_prepare()`` -> ``feature_generate()`` ->
    ``data_output()``. Inputs are read from ``input_path`` and the generated
    feature tables are written as CSV files to ``output_path``.
    """

    def __init__(self, input_path, output_path):
        """Store the I/O directories and initialise all state to ``None``.

        Args:
            input_path: Directory holding the input CSV files.
            output_path: Directory the generated CSV files are written to.
        """
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__feature_importance = None
        self.__feature_top_column = None
        self.__train, self.__test = None, None
        self.__train_label = None
        self.__train_feature, self.__test_feature = None, None

        self.__categorical_columns = None
        self.__encoder = None
        self.__numeric_columns = None
        self.__filler = None

        # feature generate
        self.__genetic_transformer = None
        self.__genetic_train_feature = None
        self.__genetic_test_feature = None

    @staticmethod
    def __as_genetic_frame(values):
        """Wrap a 2-D result in a DataFrame with ``Genetic_<i>`` columns."""
        return pd.DataFrame(
            values,
            columns=["Genetic_" + str(i) for i in range(values.shape[1])])

    def data_prepare(self):
        """Load train/test data, target-encode and median-impute them.

        Reads ``feature_importance_feature_data_V5.csv``,
        ``train_select_feature_df.csv`` and ``test_select_feature_df.csv``
        from the input directory. Only the 200 features with the highest
        mean importance are loaded (plus ``TARGET`` for the train set).
        The encoder and the imputer are fit on the train features only and
        then applied to both sets.
        """
        importance = pd.read_csv(
            os.path.join(self.__input_path,
                         "feature_importance_feature_data_V5.csv"))
        # Average the importance per feature, then rank descending.
        self.__feature_importance = (
            importance.groupby(["feature"])["importance"].mean()
            .to_frame("importance")
            .reset_index(drop=False)
            .sort_values("importance", ascending=False)
            .reset_index(drop=True))
        # Column 0 is "feature" after the reset_index above.
        self.__feature_top_column = list(
            self.__feature_importance.iloc[0:200, 0])

        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"),
            usecols=self.__feature_top_column + ["TARGET"])
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"),
            usecols=self.__feature_top_column)

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        # Align the test columns with the train order. The explicit copy
        # makes this an independent frame, so the column assignments below
        # cannot trigger chained-assignment warnings or write through to
        # self.__test.
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()].copy()

        # encoder
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__encoder = TargetEncoder()
        # Nothing to encode when there are no object-dtype columns; skip
        # rather than fitting the encoder on a zero-column frame.
        if self.__categorical_columns:
            self.__encoder.fit(
                self.__train_feature[self.__categorical_columns],
                self.__train_label)
            self.__train_feature[
                self.__categorical_columns] = self.__encoder.transform(
                    self.__train_feature[self.__categorical_columns])
            self.__test_feature[
                self.__categorical_columns] = self.__encoder.transform(
                    self.__test_feature[self.__categorical_columns])

        # filler
        # Selected after encoding, so any column the encoder turned into a
        # non-object dtype is included here as well.
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_columns])
        self.__train_feature[self.__numeric_columns] = self.__filler.transform(
            self.__train_feature[self.__numeric_columns])
        self.__test_feature[self.__numeric_columns] = self.__filler.transform(
            self.__test_feature[self.__numeric_columns])

    def feature_generate(self):
        """Fit the symbolic transformer on train and transform both sets."""
        self.__genetic_transformer = SymbolicTransformer(population_size=10000,
                                                         generations=200,
                                                         tournament_size=200,
                                                         metric="spearman",
                                                         n_jobs=-1,
                                                         verbose=1)
        self.__genetic_transformer.fit(self.__train_feature,
                                       self.__train_label)
        self.__genetic_train_feature = self.__genetic_transformer.transform(
            self.__train_feature)
        self.__genetic_test_feature = self.__genetic_transformer.transform(
            self.__test_feature)

    def data_output(self):
        """Write the generated features to CSV in the output directory.

        Both results are converted to DataFrames with ``Genetic_<i>`` column
        names (the attributes are replaced by those frames) and saved,
        without the index, as ``genetic_train_feature.csv`` and
        ``genetic_test_feature.csv``.
        """
        self.__genetic_train_feature = self.__as_genetic_frame(
            self.__genetic_train_feature)
        self.__genetic_test_feature = self.__as_genetic_frame(
            self.__genetic_test_feature)

        self.__genetic_train_feature.to_csv(
            os.path.join(self.__output_path, "genetic_train_feature.csv"),
            index=False)
        self.__genetic_test_feature.to_csv(
            os.path.join(self.__output_path, "genetic_test_feature.csv"),
            index=False)