def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Append gplearn symbolic features to the train, test and validation sets.

    NOTE(review): ``df``, ``col`` and ``pars`` are not used by the body; it
    reads ``train_X``, ``train_y``, ``test_X`` and ``val_X`` from the enclosing
    scope -- confirm those names exist where this function is defined.

    Returns:
        tuple: ``(train_X_all, test_X_all, val_X_all)``; each is the matching
        input frame with the generated ``gen_<i>`` columns concatenated
        column-wise.
    """
    num_gen = 20
    num_comp = 10
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt',
                    'log', 'abs', 'neg', 'inv', 'tan']

    gp = SymbolicTransformer(
        generations=num_gen,
        population_size=200,
        hall_of_fame=100,
        n_components=num_comp,
        function_set=function_set,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=0,
        n_jobs=6,
    )

    def _with_generated(frame, generated):
        # Wrap the raw component matrix in a frame that shares ``frame``'s
        # index so the column-wise concat lines up row for row.
        names = ["gen_" + str(a) for a in range(generated.shape[1])]
        extra = pd.DataFrame(generated, columns=names)
        extra.index = frame.index
        return pd.concat((frame, extra), axis=1)

    # The transformer is fitted on the training data only and then reused.
    train_X_all = _with_generated(train_X, gp.fit_transform(train_X, train_y))
    test_X_all = _with_generated(test_X, gp.transform(test_X))
    val_X_all = _with_generated(val_X, gp.transform(val_X))
    return train_X_all, test_X_all, val_X_all
def data_prepare(self):
    """Load the two-class digits data, standardise it and add gplearn features.

    Stores on ``self``: the loaded dataset, its data/target arrays, an 80/20
    train/test split (``random_state=9``), the standardised train/test
    matrices, and those matrices widened with the ``SymbolicTransformer``
    components.
    """
    digits = load_digits(n_class=2)
    self.__digists = digits
    self.__X = digits.data
    self.__y = digits.target

    split = train_test_split(self.__X, self.__y, test_size=0.2, random_state=9)
    self.__train, self.__test, self.__train_label, self.__test_label = split

    # standard scaler: statistics come from the training split only
    scaler = StandardScaler()
    scaler.fit(self.__train)
    self.__train = scaler.transform(self.__train)
    self.__test = scaler.transform(self.__test)

    # gp feature
    function_set = ("add", "sub", "mul", "div", "sqrt", "log",
                    "abs", "neg", "inv", "max", "min")
    gp = SymbolicTransformer(
        generations=5,
        population_size=2000,
        hall_of_fame=100,
        n_components=10,
        function_set=function_set,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=0,
        n_jobs=3,
    )

    # Author's note: obtaining the generated features in a stacking fashion
    # would arguably be more reasonable.
    gp.fit(self.__train, self.__train_label)

    train_generated = gp.transform(self.__train)
    self.__train_gfeature = np.hstack((self.__train, train_generated))
    test_generated = gp.transform(self.__test)
    self.__test_gfeature = np.hstack((self.__test, test_generated))
class GplearnDemo(object):
    """Compare Ridge regression with and without gplearn features on Boston data."""

    def __init__(self):
        # data prepare
        self.__boston = None
        self.__boston_feature = None
        self.__boston_label = None
        self.__train_feature = None
        self.__test_feature = None
        self.__train_label = None
        self.__test_label = None
        self.__transformer = None
        self.__gp_train_feature = None
        self.__gp_test_feature = None

        # model fit
        self.__regressor = None

    def data_prepare(self):
        """Load the data, split it 50/50 and compute the symbolic features."""
        self.__boston = load_boston()
        self.__boston_feature = pd.DataFrame(
            self.__boston.data, columns=self.__boston.feature_names)
        target = pd.Series(self.__boston.target)
        self.__boston_label = target.to_frame("TARGET").squeeze()

        split = train_test_split(
            self.__boston_feature,
            self.__boston_label,
            test_size=0.5,
            shuffle=True,
        )
        (self.__train_feature, self.__test_feature,
         self.__train_label, self.__test_label) = split

        # Author's note: missing values are not allowed here.
        self.__transformer = SymbolicTransformer(n_jobs=4)
        self.__transformer.fit(self.__train_feature, self.__train_label)
        self.__gp_train_feature = self.__transformer.transform(
            self.__train_feature)
        self.__gp_test_feature = self.__transformer.transform(
            self.__test_feature)

    def model_fit_predict(self):
        """Print the test MSE of Ridge on the raw, then on the widened, features."""
        # Baseline: original columns only.
        self.__regressor = Ridge()
        self.__regressor.fit(self.__train_feature, self.__train_label)
        baseline_pred = self.__regressor.predict(self.__test_feature)
        print(mean_squared_error(self.__test_label, baseline_pred))

        # Original columns plus the generated components.
        self.__regressor = Ridge()
        train_wide = np.hstack(
            (self.__train_feature.values, self.__gp_train_feature))
        self.__regressor.fit(train_wide, self.__train_label)
        test_wide = np.hstack(
            (self.__test_feature.values, self.__gp_test_feature))
        wide_pred = self.__regressor.predict(test_wide)
        print(mean_squared_error(self.__test_label, wide_pred))
def symbolic_transformer(X, y, encoder=None):
    """Generate symbolic features for ``X`` with a ``SymbolicTransformer``.

    Args:
        X (DataFrame): Independent features.
        y (Series): Dependent feature (target).
        encoder (obj, optional): A ``SymbolicTransformer``-like object exposing
            ``transform``. When omitted, a new transformer is created and
            fitted on ``(X, y)``. Defaults to None.
            NOTE(review): a supplied encoder is used as-is and is presumably
            already fitted -- confirm against callers.

    Returns:
        tuple: ``(gp_features, encoder)`` -- the output of
        ``encoder.transform(X)`` and the encoder that produced it.
    """
    if encoder is not None:
        return encoder.transform(X), encoder

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'max', 'min']
    encoder = SymbolicTransformer(
        generations=10,
        population_size=1000,
        hall_of_fame=100,
        n_components=12,
        function_set=function_set,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=123,
        n_jobs=-1,
    )
    encoder.fit(X, y)
    return encoder.transform(X), encoder
def test_symbolic_transformer():
    """Check that SymbolicTransformer example works"""
    n_fit = 300

    rng = check_random_state(0)
    boston = load_boston()
    perm = rng.permutation(boston.target.size)
    boston.data = boston.data[perm]
    boston.target = boston.target[perm]
    y_fit = boston.target[:n_fit]
    y_eval = boston.target[n_fit:]

    # Baseline: Ridge on the raw features.
    baseline = Ridge()
    baseline.fit(boston.data[:n_fit, :], y_fit)
    baseline_score = baseline.score(boston.data[n_fit:, :], y_eval)
    assert_almost_equal(baseline_score, 0.759319453049884)

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'max', 'min']
    gp = SymbolicTransformer(
        generations=20,
        population_size=2000,
        hall_of_fame=100,
        n_components=10,
        function_set=function_set,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        random_state=0,
    )
    gp.fit(boston.data[:n_fit, :], y_fit)

    # Ridge again, this time with the generated components appended.
    new_boston = np.hstack((boston.data, gp.transform(boston.data)))
    augmented = Ridge()
    augmented.fit(new_boston[:n_fit, :], y_fit)
    augmented_score = augmented.score(new_boston[n_fit:, :], y_eval)
    assert_almost_equal(augmented_score, 0.8418372105182055)
def test_output_shape():
    """Check output shape is as expected"""
    # NOTE(review): this file defines test_output_shape twice with the same
    # body; the later definition shadows the earlier one.
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)

    # Five components requested for five samples -> a (5, 5) output.
    transformer = SymbolicTransformer(n_components=5,
                                      generations=2,
                                      random_state=0)
    transformer.fit(X, y)
    transformed = transformer.transform(X)
    assert_true(transformed.shape == (5, 5))
def test_output_shape():
    """Check output shape is as expected"""
    # NOTE(review): this file defines test_output_shape twice with the same
    # body; this later definition shadows the earlier one.
    n_samples, n_features, n_components = 5, 10, 5
    rng = check_random_state(415)
    X = np.reshape(rng.uniform(size=n_samples * n_features),
                   (n_samples, n_features))
    y = rng.uniform(size=n_samples)

    # Check the transformer
    est = SymbolicTransformer(n_components=n_components,
                              generations=2,
                              random_state=0)
    est.fit(X, y)
    output_shape = est.transform(X).shape
    assert_true(output_shape == (n_samples, n_components))
def getSymbolTrans(train, valid, y, random_state=888):
    """Append gplearn symbolic features to the train and validation frames.

    The transformer is fitted on ``train``/``y`` only and then applied to both
    frames. Generated columns are named ``ST_1`` .. ``ST_n``.

    Args:
        train (DataFrame): Training features; copied, never modified.
        valid (DataFrame): Validation features; copied, never modified.
        y: Training target; copied before fitting.
        random_state: Seed forwarded to ``SymbolicTransformer``. It was
            previously accepted but ignored (the seed was hard-coded to 0).

    Returns:
        tuple: ``(X_train, X_valid)`` -- copies of the inputs with the
        generated columns joined on, and every NaN replaced by 0.
    """
    X_train = train.copy()
    X_valid = valid.copy()
    y_train = y.copy()

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
        'min'
    ]
    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=random_state,
                             n_jobs=3)
    gp.fit(X_train, y_train)

    def _append_features(frame):
        generated = gp.transform(frame)
        names = ["ST_" + str(i) for i in range(1, generated.shape[1] + 1)]
        # Reuse the frame's own index: ``join`` aligns on index labels, so a
        # default RangeIndex would misalign rows whenever ``frame`` is not
        # 0..n-1 indexed, and the resulting NaNs would be hidden by fillna(0).
        extra = pd.DataFrame(generated, columns=names, index=frame.index)
        return frame.join(extra).fillna(0)

    return (_append_features(X_train), _append_features(X_valid))
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Fit a gplearn ``SymbolicTransformer`` and return the generated columns.

    Args:
        df: DataFrame holding the feature columns and the target column.
        col: Column names to consider; the target ``pars['coly']`` is removed
            from this list before fitting.
        pars: dict with the required key ``'coly'`` and the optional keys
            ``'pars_genetic'`` (extra ``SymbolicTransformer`` keyword
            arguments), ``'path_features_store'`` and
            ``'path_pipeline_export'`` (artifacts are saved only when both
            paths are present).

    Returns:
        tuple: ``(df_genetic, col_pars)`` -- the generated ``gen_<i>`` frame
        indexed like ``df``, and a dict with the fitted model, the parameters
        used and the new column names.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer

    coly = pars['coly']
    colX = [name for name in col if name not in (coly,)]
    train_X, train_y = df[colX], df[coly]

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]
    default_genetic = {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    }
    pars_genetic = pars.get('pars_genetic', default_genetic)

    gp = SymbolicTransformer(hall_of_fame=100,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6,
                             **pars_genetic)
    gp.fit(train_X, train_y)

    generated = gp.transform(train_X)
    col_genetic = ["gen_" + str(a) for a in range(generated.shape[1])]
    df_genetic = pd.DataFrame(generated,
                              columns=col_genetic,
                              index=train_X.index)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        export_dir = pars['path_pipeline_export']
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, export_dir + f"/{prefix}_model.pkl")
        save(col_genetic, export_dir + f"/{prefix}.pkl")
        save(pars_genetic, export_dir + f"/{prefix}_pars.pkl")

    col_pars = {
        'model': gp,
        'pars': pars_genetic,
        'cols_new': {
            'col_genetic': col_genetic  ### list
        },
    }
    return df_genetic, col_pars
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors with a gplearn SymbolicTransformer.

    The transformer is fitted on the rows of ``p_x``/``p_y`` up to the label
    '01-01-2019' and then applied to all of ``p_x``.
    NOTE(review): the label slice presumably relies on a date-like index --
    confirm against callers.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables

    p_y: pd.DataFrame
        with variable to predict

    Returns
    -------
    results: dict
        'fit' (``p_x`` stacked column-wise with the generated features),
        'params' (``model.get_params()``) and 'model' (the fitted transformer)

    """
    operators = [
        "sub", "add", 'inv', 'mul', 'div', 'abs', 'log', "max", "min", "sin",
        "cos"
    ]
    model = SymbolicTransformer(
        function_set=operators,
        population_size=5000,
        hall_of_fame=100,
        n_components=20,
        generations=20,
        tournament_size=20,
        stopping_criteria=.05,
        const_range=None,
        init_depth=(4, 12),
        metric='pearson',
        parsimony_coefficient=0.001,
        p_crossover=0.4,
        p_subtree_mutation=0.2,
        p_hoist_mutation=0.1,
        p_point_mutation=0.3,
        p_point_replace=.05,
        verbose=1,
        random_state=None,
        n_jobs=-1,
        feature_names=p_x.columns,
        warm_start=True,
    )

    # Fit on the training window only; the transformed window is not needed.
    model.fit_transform(p_x[:'01-01-2019'], p_y[:'01-01-2019'])

    model_params = model.get_params()
    gp_features = model.transform(p_x)
    return {
        'fit': np.hstack((p_x, gp_features)),
        'params': model_params,
        'model': model,
    }
def symbolicLearning(df_list, y=None):
    '''
    Generate gplearn symbolic features for ``df_list``.

    :param df_list: data convertible to a ``pandas.DataFrame`` (rows are
        samples, columns are features).
    :param y: optional target values. When given, the transformer is fitted on
        ``(df_list, y)`` before transforming. The original code never fitted
        the transformer, so ``transform`` was always called on an unfitted
        estimator; omitting ``y`` keeps that behaviour (gplearn is expected to
        reject the call as not fitted).
    :return: DataFrame with one column per generated component, named
        ``'1V'``, ``'2V'``, ...
    '''
    df_list = pd.DataFrame(df_list)
    function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg',
                    'max', 'min']
    gp = SymbolicTransformer(generations=10,
                             population_size=1000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=3)
    if y is not None:
        gp.fit(df_list, y)

    gp_feature = gp.transform(df_list)
    # Name the columns after the actual output width; it previously used
    # len(function_set), which only matched n_components by coincidence.
    new_feature_name = [
        str(i) + 'V' for i in range(1, gp_feature.shape[1] + 1)
    ]
    new_feature = pd.DataFrame(gp_feature, columns=new_feature_name)
    return new_feature
def get_feature_symbolic_learning(df, gp_config):
    """
    Append SymbolicTransformer features to ``df``.

    The transformer is fitted on a shuffled 80% sample of the rows and then
    applied to every row.

    Parameters
    ----------
    df: pd.DataFrame,the input dataFrame.
    gp_config: GPConfig object, the config object of gplearn.SymbolicTransformer.
        The attributes read here are ``generation``, ``population_size``,
        ``hall_of_fame``, ``n_components``, ``function_set``,
        ``parsimony_coefficient``, ``max_samples``, ``feature_cols`` and
        ``target_col``.

    Returns
    -------
    df_t: pd.DataFrame, df with the features of SymbolicTransformer trans.
        The new features named like 'symbolic_component_{0 to n}'(n is the n_components)

    """
    gp = SymbolicTransformer(
        generations=gp_config.generation,
        population_size=gp_config.population_size,
        hall_of_fame=gp_config.hall_of_fame,
        n_components=gp_config.n_components,
        function_set=gp_config.function_set,
        parsimony_coefficient=gp_config.parsimony_coefficient,
        max_samples=gp_config.max_samples,
        verbose=1,
        random_state=0,
        n_jobs=3)

    X = df[gp_config.feature_cols]
    y = df[gp_config.target_col]
    # Only the training part of the split is used for fitting.
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.2,
                                              shuffle=True)
    gp.fit(X_train, y_train)

    names = [
        "symbolic_component_" + str(i) for i in range(gp_config.n_components)
    ]
    # Carry df's index over: concat(axis=1) aligns on index labels, so the
    # default RangeIndex misaligned rows (and added NaN rows) whenever df was
    # not indexed 0..n-1.
    res = pd.DataFrame(gp.transform(X), columns=names, index=df.index)
    df_t = pd.concat([df, res], axis=1)
    return df_t
class GplearnGenerateFeature(object):
    """Generate gplearn ``SymbolicTransformer`` features from CSV inputs.

    The visible methods are ``data_prepare`` (load, encode, impute),
    ``feature_generate`` (fit and transform) and ``data_output`` (write CSVs).
    """

    def __init__(self, input_path, output_path):
        self.__input_path = input_path
        self.__output_path = output_path

        # data prepare
        self.__feature_importance = None
        self.__feature_top_column = None
        self.__train = None
        self.__test = None
        self.__train_label = None
        self.__train_feature = None
        self.__test_feature = None
        self.__categorical_columns = None
        self.__encoder = None
        self.__numeric_columns = None
        self.__filler = None

        # feature generate
        self.__genetic_transformer = None
        self.__genetic_train_feature = None
        self.__genetic_test_feature = None

    def data_prepare(self):
        """Load the top-200 features, target-encode and median-impute them."""
        # Rank features by their mean importance, highest first.
        raw_importance = pd.read_csv(
            os.path.join(self.__input_path,
                         "feature_importance_feature_data_V5.csv"))
        mean_importance = (raw_importance.groupby(["feature"])["importance"]
                           .mean()
                           .to_frame("importance")
                           .reset_index(drop=False))
        self.__feature_importance = mean_importance.sort_values(
            "importance", ascending=False).reset_index(drop=True)
        self.__feature_top_column = list(
            self.__feature_importance.iloc[0:200, 0])

        train_path = os.path.join(self.__input_path,
                                  "train_select_feature_df.csv")
        test_path = os.path.join(self.__input_path,
                                 "test_select_feature_df.csv")
        self.__train = pd.read_csv(
            train_path, usecols=self.__feature_top_column + ["TARGET"])
        self.__test = pd.read_csv(test_path,
                                  usecols=self.__feature_top_column)

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        # Give the test frame the same column order as the training frame.
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # encoder
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        for frame in (self.__train_feature, self.__test_feature):
            frame[self.__categorical_columns] = self.__encoder.transform(
                frame[self.__categorical_columns])

        # filler
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_columns])
        for frame in (self.__train_feature, self.__test_feature):
            frame[self.__numeric_columns] = self.__filler.transform(
                frame[self.__numeric_columns])

    def feature_generate(self):
        """Fit the transformer on the training data and transform both sets."""
        self.__genetic_transformer = SymbolicTransformer(
            population_size=10000,
            generations=200,
            tournament_size=200,
            metric="spearman",
            n_jobs=-1,
            verbose=1,
        )
        self.__genetic_transformer.fit(self.__train_feature,
                                       self.__train_label)
        self.__genetic_train_feature = self.__genetic_transformer.transform(
            self.__train_feature)
        self.__genetic_test_feature = self.__genetic_transformer.transform(
            self.__test_feature)

    def data_output(self):
        """Write the generated features to CSV as ``Genetic_<i>`` columns."""

        def _as_frame(values):
            names = ["Genetic_" + str(i) for i in range(values.shape[1])]
            return pd.DataFrame(values, columns=names)

        self.__genetic_train_feature = _as_frame(self.__genetic_train_feature)
        self.__genetic_test_feature = _as_frame(self.__genetic_test_feature)

        self.__genetic_train_feature.to_csv(
            os.path.join(self.__output_path, "genetic_train_feature.csv"),
            index=False)
        self.__genetic_test_feature.to_csv(
            os.path.join(self.__output_path, "genetic_test_feature.csv"),
            index=False)
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find Symbolic formulae for feature engineering

    Training (no ``'path_pipeline'`` in ``pars``): fits a gplearn
    ``SymbolicTransformer`` on ``df[col]`` against ``pars['dfy']``.
    Inference (``'path_pipeline'`` in ``pars``): loads the saved model and its
    saved parameters from that directory instead of fitting.

    Args:
        df: DataFrame holding the feature columns; gaps are forward-filled.
        col: list of feature column names.
        pars: dict. Keys read here: ``'function_set'``, ``'pars_genetic'``,
            ``'path_pipeline'``, ``'dfy'`` (training only),
            ``'path_features_store'`` and ``'path_pipeline_export'`` (artifacts
            are saved only when both are present).

    Returns:
        tuple: ``(df_genetic, col_pars)`` -- the generated feature frame
        indexed like ``df``, and a dict with the prefix, the pipeline path and
        the new column names.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    # ``.ffill()`` replaces the deprecated ``fillna(method='ffill')``.
    train_X = df[colX].ffill()
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])

    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  #### 0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    saved_cols = None
    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        # Keep the caller's ``pars`` intact: overwriting it with the loaded
        # file used to drop 'path_pipeline' / 'path_pipeline_export', so the
        # returned path was None and the export branch could never run.
        # Assumes the file holds the dict saved below -- TODO confirm.
        pars_saved = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl") or {}
        pars_genetic = pars_saved.get('pars_genetic', pars_genetic)
        function_set = pars_saved.get('function_set', function_set)
        # The formulae dict is keyed by the training-time column names.
        saved_cols = list(pars_saved.get('formulae_dict', {}))
    else:  ### Training time
        train_y = pars['dfy']
        # Merge instead of passing keywords twice: an 'n_components' or
        # 'hall_of_fame' entry in pars_genetic used to raise
        # "got multiple values for keyword argument".
        gp_kwargs = dict(pars_genetic)
        gp_kwargs.setdefault('hall_of_fame', train_X.shape[1] + 1)  ### Buggy
        gp_kwargs.setdefault('n_components', train_X.shape[1])
        gp = SymbolicTransformer(feature_names=feature_name_,
                                 function_set=function_set,
                                 **gp_kwargs)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    if saved_cols and len(saved_cols) == df_genetic.shape[1]:
        # Inference: reuse the training names so downstream consumers see the
        # same columns instead of a freshly randomised tag.
        col_genetic = saved_cols
    else:
        # NOTE(review): only 11 possible tags, so this is not actually unique.
        tag = random.randint(0, 10)  #### UNIQUE TAG
        col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Exrraction #####################################
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        export_dir = pars['path_pipeline_export']
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, export_dir + f"/{prefix}_model.pkl")
        save(col_genetic, export_dir + f"/{prefix}.pkl")
        save(pars_gen_all, export_dir + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict,
                  export_dir + f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
# Script fragment: compare Ridge on the raw features with Ridge on the raw
# features plus SymbolicTransformer components. ``boston`` and ``perm`` are
# defined before this fragment.
boston.target = boston.target[perm]

# Baseline: fit on the first 300 rows, print the score on the remaining rows.
est = Ridge()
est.fit(boston.data[:300, :], boston.target[:300])
print(est.score(boston.data[300:, :], boston.target[300:]))
del est

function_set = [
    'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
    'min'
]
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=3)

# The transformer is fitted on the same 300 training rows, then applied to
# every row so the held-out rows get their generated columns too.
gp.fit(boston.data[:300, :], boston.target[:300])
gp_features = gp.transform(boston.data)
new_boston = np.hstack((boston.data, gp_features))

# Same Ridge model, now on the widened feature matrix.
est = Ridge()
est.fit(new_boston[:300, :], boston.target[:300])
print(est.score(new_boston[300:, :], boston.target[300:]))
# Script fragment: 6-fold cross-validation that refits the symbolic
# transformer on each fold. ``train``, ``test``, ``y_train``, ``gp`` and
# ``model_lgb`` are defined before this fragment.
# NOTE(review): the source was whitespace-collapsed; the nesting below is
# reconstructed. Everything through the ``test`` update is placed inside the
# inner loop because ``train`` and ``test`` must gain columns in lock step for
# ``gp.transform(test)`` to match the refitted transformer -- confirm against
# the original file.
cv = KFold(n_splits=6, shuffle=True, random_state=42)
results = []
feature_import = pd.DataFrame()
sub_array = []
# feature_import['name'] = train.columns
y_train = y_train.values
y_mean = np.mean(y_train)
for model in [model_lgb]:
    for traincv, testcv in cv.split(train, y_train):
        # Fit on this fold's training rows only, then transform every row.
        gp.fit(train[traincv], y_train[traincv])
        gp_features = gp.transform(train)
        print(gp_features)
        # NOTE(review): ``train`` is rebound with the new columns appended, so
        # it grows on every fold and later folds fit on earlier folds' output.
        train = np.hstack((train, gp_features))
        m = model.fit(train[traincv], y_train[traincv], eval_set=[(train[testcv], y_train[testcv])], early_stopping_rounds=150)
        y_tmp = m.predict(train[testcv], num_iteration=m.best_iteration)
        # Half of the mean squared error on the held-out rows.
        res = mean_squared_error(y_train[testcv], (y_tmp)) / 2
        results.append(res)
        # Apply the same transformer to the test matrix and widen it as well.
        t_gp_features = gp.transform(test)
        print(t_gp_features)
        test = np.hstack((test, t_gp_features))
] gp = SymbolicTransformer(generations=10, population_size=50000, hall_of_fame=100, n_components=10, function_set=function_set, parsimony_coefficient=0.0005, max_samples=0.9, verbose=1, random_state=42, n_jobs=4) # Fit & save to dataframe gp.fit(total_df.iloc[train_idx], y) gp_features = gp.transform(total_df) genetic_df = pd.DataFrame( gp_features, columns=[f'Genetic_{i}' for i in range(gp_features.shape[1])]) def series_to_supervised(data, n_in=1, n_out=1, dropnan=True): """ Frame a time series as a supervised learning dataset. Taken from: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/ Arguments: data: Sequence of observations as a list or NumPy array. n_in: Number of lag observations as input (X). n_out: Number of observations as output (y). dropnan: Boolean whether or not to drop rows with NaN values. Returns: Pandas DataFrame of series framed for supervised learning.
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors with a gplearn SymbolicTransformer.

    Parameters
    ----------
    p_x: pd.DataFrame
        with regressors or predictor variables
        p_x = data_features.iloc[0:30, 3:]

    p_y: pd.DataFrame
        with variable to predict
        p_y = data_features.iloc[0:30, 1]

    Returns
    -------
    results: dict
        'fit' (``p_x`` stacked column-wise with the generated features),
        'params' (``model.get_params()``), 'model' (the fitted transformer)
        and 'features' (the generated features alone)

    best_p_dict: pd.DataFrame
        one row per program in ``model._best_programs``, indexed
        ``alpha_1`` .. ``alpha_n`` with columns fitness, expression, depth and
        length, sorted by ascending fitness

    """
    model = SymbolicTransformer(
        function_set=["sub", "add", 'inv', 'mul', 'div', 'abs', 'log'],
        population_size=5000,
        hall_of_fame=20,
        n_components=10,
        tournament_size=20,
        generations=5,
        init_depth=(4, 8),
        init_method='half and half',
        parsimony_coefficient=0.1,
        const_range=None,
        metric='pearson',
        stopping_criteria=0.65,
        p_crossover=0.4,
        p_subtree_mutation=0.3,
        p_hoist_mutation=0.1,
        p_point_mutation=0.2,
        verbose=True,
        warm_start=True,
        n_jobs=-1,
        feature_names=p_x.columns)

    # Keep the result of fit_transform instead of discarding it and running a
    # second, identical transform over the same data.
    gp_features = model.fit_transform(p_x, p_y)
    model_params = model.get_params()
    model_fit = np.hstack((p_x, gp_features))
    results = {
        'fit': model_fit,
        'params': model_params,
        'model': model,
        "features": gp_features
    }

    # enumerate() gives each program its own position; list.index() rescanned
    # the list for every program and returned the first match, so a repeated
    # entry would have collided on one 'alpha_<k>' key.
    programs = {}
    for position, program in enumerate(model._best_programs, start=1):
        programs['alpha_' + str(position)] = {
            'fitness': program.fitness_,
            "expression": str(program),
            'depth': program.depth_,
            "length": program.length_
        }

    best_p_dict = pd.DataFrame(programs).T
    best_p_dict = best_p_dict.sort_values(by="fitness")
    return results, best_p_dict
# 使用gplearn的genetic方法组合特征 data = datasets.load_boston() # 加载数据集 x, y = data.data, data.target # 分割形成x和y print(x.shape) # 查看x的形状 print(x[0]) # 查看x的第一条数据 model_symbolic = SymbolicTransformer(n_components=5, generations=18, function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min'), max_samples=0.9, metric='pearson', random_state=0, n_jobs=2) model_symbolic.fit(x, y) # 训练数据 symbolic_features = model_symbolic.transform(x) # 转换数据 print(symbolic_features.shape) # 打印形状 print(symbolic_features[0]) # 打印第1条数据 print(model_symbolic) # 输出公式 #读者可取消注释执行下面的代码段 #%% ''' # 本段示例代码将输出重复的重复特征 reg_data = np.loadtxt('data5.txt') x, y = reg_data[:, :-1], reg_data[:, -1] model_symbolic = SymbolicTransformer(n_components=5, generations=18, function_set=( 'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv','max', 'min'), max_samples=0.9, metric='pearson',
# Script fragment: fit a SymbolicTransformer on the training rows and append
# its components to ``tt``. ``train``, ``tt`` and ``numeric_feats`` are defined
# before this fragment.
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv']
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=6)
gp.fit(train[numeric_feats], train['target'])
gp_feats = gp.transform(tt[numeric_feats])
# Give the generated frame tt's index: concat(axis=1) aligns on index labels,
# so a default RangeIndex misaligns rows whenever tt is not indexed 0..n-1.
tt = pd.concat([tt, pd.DataFrame(gp_feats, index=tt.index)], axis=1)

### box cox transform
'''
#numeric_feats = tt.dtypes[tt.dtypes != 'object'].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index
for feat in skewed_feats:
    tt[feat] = tt[feat] +10
    (tt[feat], lam) = boxcox(tt[feat])
'''