def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin  = df[col]
    colnum_bin = col

    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################")
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                colonehot=colnum_onehot,
                                                return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot, pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'        : col,             ### list
        'colnum_onehot' : colnum_onehot      ### list
    }
    return dfnum_hot, col_pars
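########################################################################################
### Illustrative sketch only (hypothetical helper, not part of the pipeline):
### util_feature.pd_col_to_onehot implements the same idea as pandas get_dummies
### applied to already-binned numeric columns.
def example_colnum_binto_onehot():
    import pandas as pd
    df  = pd.DataFrame({'age_bin': [0, 1, 2, 1, 0]})
    hot = pd.get_dummies(df['age_bin'], prefix='age_bin')   # one 0/1 column per bin value
    return hot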
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    log("### colnum normalize  ###############################################################")
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    ### Keep the normalization pipe in a local dict; overwriting the caller's pars
    ### would make the pars.get('path_features_store') check below always fail.
    pars_pipe = {'pipe_list': [{'name': 'fillna', 'naval': 0.0}, {'name': 'minmax'}]}
    dfnum_norm, colnum_norm = pd_normalize_fun(df, colname=colnum, pars=pars_pipe,
                                               suffix="_norm", return_val="dataframe,param")
    log(colnum_norm)

    ### Return col and colnum_norm in a dictionary for the next step in run_preprocess/preprocess
    col_pars = {}
    col_pars['cols_new'] = {
        'colnum'      : col,           ### list
        'colnum_norm' : colnum_norm    ### list
    }

    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)

    return dfnum_norm, col_pars
def pd_coly(df, col, pars):
    ##### Filtering / cleaning rows  ################################################
    coly = col

    def isfloat(x):
        try:
            float(x)
            return 1
        except Exception:
            return 0

    df['_isfloat'] = df[coly].apply(lambda x: isfloat(x))
    df             = df[df['_isfloat'] > 0]
    df[coly]       = df[coly].astype('float64')
    del df['_isfloat']
    log("----------df[coly]------------", df[coly])

    ymin, ymax = pars.get('ymin', -9999999999.0), pars.get('ymax', 999999999.0)
    df = df[df[coly] > ymin]
    df = df[df[coly] < ymax]

    ##### Label processing  #########################################################
    ### Target coly processing: normalization, customizable per model
    log("y_norm_fun preprocess_pars")
    y_norm_fun = pars.get('y_norm_fun', None)
    if y_norm_fun is not None:
        df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
        # save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl')

    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(df[coly], 'dfy', path_features_store)

    return df, col
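########################################################################################
### Illustrative sketch only (hypothetical values): a typical pars dict for pd_coly.
### Any callable works for 'y_norm_fun'; np.log1p is a common choice for skewed targets.
def example_pd_coly_pars():
    import numpy as np
    return {'ymin': 0.0, 'ymax': 1e6, 'y_norm_fun': np.log1p}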
def pd_coldate(df, col, pars):
    log("##### Coldate processing   ##########################################")
    from utils import util_date
    coldate = col
    dfdate  = None
    for coldate_i in coldate:
        dfdate_i = util_date.pd_datestring_split(df[[coldate_i]], coldate_i,
                                                 fmt="auto", return_val="split")
        dfdate   = pd.concat((dfdate, dfdate_i), axis=1) if dfdate is not None else dfdate_i
        # if 'path_features_store' in pars :
        #    path_features_store = pars['path_features_store']
        #    save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)

    if 'path_features_store' in pars:
        save_features(dfdate, 'dfdate', pars['path_features_store'])

    col_pars = {}
    col_pars['cols_new'] = {
        # 'colcross_single' : col,           ### list
        'dfdate' : list(dfdate.columns)      ### list
    }
    return dfdate, col_pars
def pd_colcat_encoder_generic(df, col, pars):
    """
        Create a Class or decorator
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix     = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:   ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model     = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        # model        = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")

    ####### Custom Code ###############################################################
    from category_encoders import HashingEncoder, WOEEncoder
    pars_model         = pars.get('model_pars', {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name         = pars.get('model_name', 'HashingEncoder')

    ### Only HashingEncoder is wired in so far; extend this dict to enable
    ### the other encoders listed in the docstring (WOEEncoder needs y at fit time).
    model_class   = {'HashingEncoder': HashingEncoder}[model_name]
    model         = model_class(**pars_model)
    dfcat_encoder = model.fit_transform(df[col])

    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder        = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder', pars['path_features_store'])
        save(model,          pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model,     pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder   ### list
    }
    return dfcat_encoder, col_pars
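########################################################################################
### Illustrative sketch only (hypothetical helper): direct category_encoders usage,
### the library call that pd_colcat_encoder_generic delegates to.
def example_colcat_encoder_generic():
    import pandas as pd
    from category_encoders import HashingEncoder
    df  = pd.DataFrame({'city': ['tokyo', 'paris', 'tokyo', 'lima']})
    enc = HashingEncoder(cols=['city'], n_components=4)   # hashing trick: 4 output columns
    return enc.fit_transform(df)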
def pd_colcat_bin(df, col=None, pars=None):
    # dfnum_bin = df[col]
    path_pipeline  = pars.get('path_pipeline', False)
    colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat         = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(df[colcat], colname=colcat,
                                                             colcat_map=colcat_bin_map,
                                                             suffix="_int")
    colcat_bin = list(dfcat_bin.columns)

    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map, pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin,     pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat'     : col,          ### list
        'colcat_bin' : colcat_bin    ### list
    }
    return dfcat_bin, col_pars
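########################################################################################
### Illustrative sketch only (hypothetical helper): the integer encoding performed by
### util_feature.pd_colcat_toint is equivalent in spirit to pandas factorize.
def example_colcat_bin():
    import pandas as pd
    df = pd.DataFrame({'color': ['red', 'blue', 'red', 'green']})
    codes, uniques  = pd.factorize(df['color'])
    df['color_int'] = codes                         # red=0, blue=1, green=2
    return df, dict(enumerate(uniques))             # the mapping to persist for inference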
def pd_colcross(df, col, pars):
    """  cross_feature_new = feat1 X feat2  (pair feature)
    """
    log("##### Cross Features From OneHot Features   ######################################")
    prefix = 'colcross_onehot'

    # params_check(pars, [('dfcat_hot', pd.DataFrame), 'colid', ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid     = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except Exception:
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model      = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:   #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model    = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []   ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot,
                                                           colcross_single_onehot_select,
                                                           **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)
    model = None

    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot', pars['path_features_store'])
        save(colcross_single_onehot_select, pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,        pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot, pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model,                pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single' : col,                ### list
        'colcross_pair' : colcross_pair_onehot    ### list
    }
    return dfcross_hot, col_pars
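########################################################################################
### Illustrative sketch only (hypothetical helper): pairwise crossing of one-hot columns,
### the core operation that util_feature.pd_feature_generate_cross performs with
### frequency thresholding on top.
def example_feature_cross():
    import itertools
    import pandas as pd
    df = pd.DataFrame({'a_1': [1, 0, 1], 'b_1': [1, 1, 0]})
    for c1, c2 in itertools.combinations(df.columns, 2):
        df[f"{c1}-{c2}"] = df[c1] * df[c2]   # AND of the two binary indicators
    return df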
def pd_ts_deltapy2(df=None, col=None, pars={}):
    """
        Delta py
        pars : {  'name' : "robust_scaler",
                  'pars' : {}
        }
    """
    prefix = 'colts_deltapy'

    ###### Custom code ################################################################
    dfin       = df.fillna(method='ffill')
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:   #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars  = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    ##### Transform Data ############################################################
    df_out = model(dfin, **model_pars)

    ### Use the sanitized name ("::" -> "-") for column labels.
    ### Extract transforms return a single value, so there are no columns to loop over.
    model_name2 = model_name.replace("::", "-")
    if 'extract' in model_name:
        col_out = ["0_" + model_name2]
    else:
        col_out = [coli + "_" + model_name2 for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index   = dfin.index
    col_new        = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list of columns
    }
    return df_out, col_pars
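########################################################################################
### Illustrative sketch only (hypothetical helper): the dynamic-transform pattern above,
### with a plain callable standing in for a deltapy function (load_function_uri resolves
### the real one from its URI string).
def example_ts_transform():
    import pandas as pd
    df = pd.DataFrame({'close': [1.0, 2.0, 4.0, 8.0]})

    def zscore(d):
        return (d - d.mean()) / d.std()     # stand-in for the loaded transform

    df_out = zscore(df)
    df_out.columns = [c + "_zscore" for c in df_out.columns]
    return df_out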
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Normalize float columns into [0,1].
        Available pipe steps: 'quantile_cutoff', 'quantile_cutoff_2', 'minmax',
                              {'name': 'fillna', 'na_val': 0.0}
    """
    prefix = 'colnum_norm'   ### == cols_out
    df     = df[col]
    log2("### colnum normalize  #############################################################")
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    ### Keep the pipe definition separate from the caller's pars, so the export
    ### checks on pars below still see path_features_store / path_pipeline_export.
    pars = {} if pars is None else pars
    pars_pipe = {'pipe_list': pars['pipe_list']} if 'pipe_list' in pars else {
        'pipe_list': [
            {'name': 'quantile_cutoff'},
            # {'name': 'fillna', 'na_val': 0.0},
        ]
    }
    if 'path_pipeline' in pars:   #### Load existing pipe definition
        pars_pipe = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df, colname=colnum, pars=pars_pipe,
                                               suffix="_norm", return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norm NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3("##### Export ######################################################################")
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars_pipe, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew   ### list
    }
    return dfnum_norm, col_pars
def pd_sample_imblearn(df=None, col=None, pars=None):
    """  Re-sample rows with imbalanced-learn (over / under / combined sampling)
    """
    params_check(pars, ['model_name', 'pars_resample', 'coly'])   # , 'dfy'
    prefix = '_sample_imblearn'

    ######################################################################################
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTEENN, SMOTETomek
    from imblearn.under_sampling import NearMiss

    ### Explicit lookup table, more robust than indexing into locals()
    model_resample = {'SMOTE': SMOTE, 'SMOTEENN': SMOTEENN, 'SMOTETomek': SMOTETomek,
                      'NearMiss': NearMiss}[pars.get("model_name", 'SMOTEENN')]

    pars_resample = pars.get('pars_resample',
                             {'sampling_strategy': 'auto', 'random_state': 0})  # , 'n_jobs': 2

    if 'path_pipeline' in pars:   #### Inference time
        return df, {'col_new': col}

    else:                         ### Training time
        colX    = col   # [col_ for col_ in col if col_ not in coly]
        coly    = pars['coly']
        train_y = pars['dfy']   ## df[coly]
        train_X = df[colX].fillna(method='ffill')
        gp      = model_resample(**pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        col_new   = [t + f"_{prefix}" for t in col]
        df2       = pd.DataFrame(X_resample, columns=col_new)   # , index=train_X.index
        df2[coly] = y_resample

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df2, prefix.replace("col_", "df_"), pars['path_features_store'])
        save(gp,            pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col,           pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_resample, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### for training input data
    }
    return df2, col_pars
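########################################################################################
### Illustrative sketch only (hypothetical helper): direct imblearn usage, equivalent
### to the training branch of pd_sample_imblearn.
def example_sample_imblearn():
    import pandas as pd
    from sklearn.datasets import make_classification
    from imblearn.over_sampling import SMOTE
    X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
    X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
    return pd.Series(y_res).value_counts()   # classes are balanced after resampling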
def pd_coly_clean(df, col, pars):
    path_features_store = pars['path_features_store']
    # path_pipeline_export = pars['path_pipeline_export']
    coly = col

    ### Target coly processing: normalization, customizable per model
    log("y_norm_fun preprocess_pars")
    y_norm_fun = pars.get('y_norm_fun', None)
    if y_norm_fun is not None:
        df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
        # save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl')
    save_features(df[coly], 'dfy', path_features_store)
    return df, coly
def pd_colcat_to_onehot(df, col=None, pars=None):
    """
    """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col

    if len(col) == 1:   ### Single column: alias it directly, no encoding needed
        colnew     = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars   = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'       : col,     ### list
            'colcat_onehot' : colnew    ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(df[colcat], colname=colcat,
                                                             colonehot=colcat_onehot,
                                                             return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot, pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat,        pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'       : col,            ### list
        'colcat_onehot' : colcat_onehot    ### list
    }
    log("ok colcat to onehot")
    return dfcat_hot, col_pars
def pd_colnum_bin(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Bin float columns into categorical bin columns
    :param df:
    :param col:
    :param pars:
    :return:
    """
    from util_feature import pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap = load(f'{path_pipeline}/colnum_binmap.pkl') if path_pipeline else None
    log2(colnum_binmap)

    colnum = col
    log2("### colnum Map numerics to Category bin  ###########################################")
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None,
                                               colbinmap=colnum_binmap,
                                               bins=10, suffix="_bin", method="uniform",
                                               return_val="dataframe,param")
    log3(colnum_binmap)

    ### Renaming column_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log3(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol, pars['path_features_store'])
        save(colnum_binmap, pars['path_pipeline_export'] + "/colnum_binmap.pkl")
        save(colnum_bin,    pars['path_pipeline_export'] + "/colnum_bin.pkl")

    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
        'colnum'     : col,          ### list
        'colnum_bin' : colnum_bin    ### list
    }
    return dfnum_bin, col_pars
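########################################################################################
### Illustrative sketch only (hypothetical helper): the uniform binning that
### util_feature.pd_colnum_tocat applies is close to pandas.cut with integer labels.
def example_colnum_bin():
    import pandas as pd
    df = pd.DataFrame({'price': [1.0, 2.5, 7.0, 9.9, 5.1]})
    df['price_bin'] = pd.cut(df['price'], bins=10, labels=False)   # uniform bins 0..9
    return df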
def pd_colcat_minhash(df, col, pars):
    """ MinHash Algo for category
        https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087
    """
    prefix = 'colcat_minhash'
    colcat = col

    pars_minhash = {'n_component': [4, 2], 'model_pretrain_dict': None}
    if 'path_pipeline_export' in pars:
        try:
            pars_minhash = load(pars['path_pipeline_export'] + '/colcat_minhash_pars.pkl')
        except Exception:
            pass

    log("#### Colcat to Hash encoding  #############################################")
    from utils import util_text
    dfcat_bin, col_hash_model = util_text.pd_coltext_minhash(df[colcat], colcat,
                                                             return_val="dataframe,param",
                                                             **pars_minhash)
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, prefix, pars['path_features_store'])
        save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_minhash,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(col_hash_model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
        'colcat_minhash': colcat_minhash   ### list
    }
    return dfcat_bin, col_pars
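########################################################################################
### Illustrative sketch only (hypothetical helper): the hashing-trick idea behind the
### minhash encoding, shown with sklearn's FeatureHasher; util_text.pd_coltext_minhash
### is the project's own implementation.
def example_colcat_hashing():
    from sklearn.feature_extraction import FeatureHasher
    hasher = FeatureHasher(n_features=8, input_type='string')
    X = hasher.transform([['tokyo'], ['paris'], ['tokyo']])   # one token list per row
    return X.toarray()   # shape (3, 8): fixed-width vectors regardless of vocabulary size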
def pd_colnum_normalize(df, col, pars):
    log("### colnum normalize  ###############################################################")
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    ### Local pipe definition; do not overwrite the caller's pars (it holds the export paths)
    pars_pipe = {'pipe_list': [{'name': 'fillna', 'naval': 0.0}, {'name': 'minmax'}]}
    dfnum_norm, colnum_norm = pd_normalize_fun(df, colname=colnum, pars=pars_pipe,
                                               suffix="_norm", return_val="dataframe,param")
    log(colnum_norm)

    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)

    return dfnum_norm, colnum_norm
def pd_coltext(df, col, pars={}):
    """   df      : Dataframe
          col     : list of columns
          pars    : dict of pars
    """
    from utils import util_text, util_model

    #### Load pars ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(path_pipeline + "/word_tokeep_dict_all.pkl") if path_pipeline is not None else {}
    # dftext_tdidf_all    = load(f'{path_pipeline}/dftext_tdidf.pkl') if path_pipeline else None
    # dftext_svd_list_all = load(f'{path_pipeline}/dftext_svd.pkl')   if path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process  ####################################################################
    stopwords = nlp_get_stopwords()
    dftext    = pd_coltext_clean(df, col, stopwords=stopwords, pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### Process each text column: create/load the bag of words -> tf-idf -> SVD
    for col_ in col:
        if path_pipeline is not None:
            ### Inference step: reuse the saved bag of words for column `col_`
            word_tokeep = word_tokeep_dict_all[col_]
        else:
            ### Training step: create the bag of words
            coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)  ## nb of words to keep
            word_tokeep_dict_all[col_] = word_tokeep   ## save the bag of words for `col_` in a dict

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(dftext, coltext=col_,
                                                                         word_minfreq=word_minfreq,
                                                                         word_tokeep=word_tokeep,
                                                                         return_val="dataframe,param")
        dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else \
                           pd.concat((dftext_tdidf_all, pd.DataFrame(dftext_tdidf_dict)), axis=1)
        log(word_tokeep_dict)

        ###  Dimension reduction for the sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(dftext_tdidf_dict, colname=None,
                                                                model_pretrain=None,
                                                                colprefix=col_ + "_svd",
                                                                method="svd", dimpca=dimpca,
                                                                return_val="dataframe,param")
        dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else \
                              pd.concat((dftext_svd_list_all, dftext_svd_list), axis=1)

    #################################################################################
    ###### Save and Export ##########################################################
    if 'path_features_store' in pars:
        save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
        # save(dftext_svd_list_all, pars['path_pipeline_export'] + "/dftext_svd.pkl")
        # save(dftext_tdidf_all,    pars['path_pipeline_export'] + "/dftext_tdidf.pkl")
        save(word_tokeep_dict_all, pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl")

    col_pars = {}
    col_pars['cols_new'] = {
        # 'coltext_tdidf' : dftext_tdidf_all.columns.tolist(),   ### list
        'coltext_svd' : dftext_svd_list_all.columns.tolist()     ### list
    }
    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all, dftext_svd_list_all), axis=1), col_pars
    return dftext_svd_list_all, col_pars
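########################################################################################
### Illustrative sketch only (hypothetical helper): the tf-idf -> truncated-SVD chain
### above, expressed with plain scikit-learn; the project's util_text/util_model helpers
### wrap the same steps.
def example_coltext_tfidf_svd():
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    docs  = ["cheap flight to tokyo", "hotel in tokyo", "cheap hotel deal"]
    tfidf = TfidfVectorizer(max_features=100).fit_transform(docs)   # sparse tf-idf matrix
    svd   = TruncatedSVD(n_components=2, random_state=0).fit_transform(tfidf)
    return pd.DataFrame(svd, columns=['text_svd_0', 'text_svd_1'])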
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """  Find symbolic formulae for feature engineering
    """
    prefix = 'col_genetic'

    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX          = col   # [col_ for col_ in col if col_ not in coly]
    train_X       = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def squaree(x):
        return x * x
    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                                             'abs', 'neg', 'inv', 'tan', square_])
    pars_genetic = pars.get('pars_genetic', {
        'generations'           : 5,
        'population_size'       : 10,     ### Higher than nb_features
        'metric'                : 'spearman',
        'tournament_size'       : 20,
        'stopping_criteria'     : 1.0,
        'const_range'           : (-1., 1.),
        'p_crossover'           : 0.9,
        'p_subtree_mutation'    : 0.01,
        'p_hoist_mutation'      : 0.01,
        'p_point_mutation'      : 0.01,
        'p_point_replace'       : 0.05,
        'parsimony_coefficient' : 0.005,  #### 0.00005 Control Complexity
        'max_samples'           : 0.9,
        'verbose'               : 1,
        # 'n_components'                  ### Controls number of output features
        'random_state'          : 0,
        'n_jobs'                : 4,
    })

    if 'path_pipeline' in pars:   #### Inference time
        gp   = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time
        coly    = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(hall_of_fame=train_X.shape[1] + 1,   ### Buggy
                                 n_components=pars_genetic.get('n_components', train_X.shape[1]),
                                 feature_names=feature_name_,
                                 function_set=function_set,
                                 **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic  = gp.transform(train_X)
    tag         = random.randint(0, 10)   #### random tag to reduce name collisions
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic  = pd.DataFrame(df_genetic, columns=col_genetic, index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction  #####################################
    formula   = str(gp).replace("[", "").replace("]", "")
    flist     = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))
    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic,  pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,  pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] + f"/{prefix}_formula.json")   ### Human readable

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list
    }
    return df_genetic, col_pars
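########################################################################################
### Illustrative sketch only (hypothetical helper): minimal direct gplearn usage for
### the symbolic-feature idea wrapped by pd_col_genetic_transform.
def example_genetic_transform():
    import pandas as pd
    from sklearn.datasets import make_regression
    from gplearn.genetic import SymbolicTransformer
    X, y = make_regression(n_samples=100, n_features=4, random_state=0)
    gp = SymbolicTransformer(generations=3, population_size=200, n_components=3,
                             function_set=['add', 'sub', 'mul', 'div'], random_state=0)
    X_new = gp.fit_transform(X, y)   # 3 evolved feature columns
    return pd.DataFrame(X_new, columns=[f"gen_{i}" for i in range(X_new.shape[1])])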
def pd_coltext_universal_google(df, col, pars={}):
    """ Universal sentence encoding from Tensorflow:  Text ---> Vectors
        from source.preprocessors import pd_coltext_universal_google
        https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

        # latest Tensorflow that supports sentencepiece is 1.13.1
        pip uninstall --quiet --yes tensorflow
        pip install --quiet tensorflow-gpu==1.13.1
        pip install --quiet tensorflow-hub
        pip install --quiet tf-sentencepiece simpleneighbors

        df   : dataframe
        col  : list of text column names
        pars : dict of pars
    """
    prefix = "coltext_universal_google"
    if 'path_pipeline' in pars:   ### Load during Inference
        coltext_embed = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model    = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    ####### Custom Code ###############################################################
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text   # noqa: registers the sentencepiece ops the model needs
    # from tqdm import tqdm  # progress bar

    url_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    url         = pars.get("model_uri", url_default)
    model       = hub.load(url)
    pars_model  = {}
    dfall       = None
    for coli in col[:1]:
        X = []
        for r in df[coli]:
            if pd.isnull(r):
                r = ""
            emb        = model(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi   = pd.DataFrame(X, columns=[coli + "_" + str(i) for i in range(len(X[0]))],
                             index=df.index)
        dfall = pd.concat((dfall, dfi), axis=1) if dfall is not None else dfi

    coltext_embed = list(dfall.columns)

    ##### Export ####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfall, 'dftext_embed', pars['path_features_store'])
        save(coltext_embed, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_model,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(model,       pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        # model_uri = pars['path_pipeline_export'] + f"/{prefix}_model.pkl"
        # col_pars  = {'model_uri': model_uri, 'pars': pars_model}

    col_pars = {'model_uri': url, 'pars': pars_model}   # model_uri
    col_pars['cols_new'] = {
        'coltext_universal_google': coltext_embed   ### list
    }
    return dfall, col_pars
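########################################################################################
### Illustrative sketch only (hypothetical helper): minimal direct tensorflow_hub usage
### for sentence embeddings; downloads the model on first call and requires tensorflow,
### tensorflow_hub and tensorflow_text to be installed.
def example_universal_sentence_encoder():
    import tensorflow_hub as hub
    import tensorflow_text   # noqa: needed for the multilingual model's ops
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
    emb   = model(["what a nice hotel", "quel bel hotel"])   # (2, 512) embedding matrix
    return emb.numpy()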
def pd_colnum_quantile_norm(df, col, pars={}):
    """  Normalize numerical columns by quantile / IQR clipping
    """
    prefix  = "colnum_quantile_norm"
    df      = df[col]
    num_col = col

    ##### Grab previously computed params ################################################
    pars2              = {}
    lower_bound_sparse = None
    upper_bound_sparse = None
    lower_bound        = None
    upper_bound        = None
    sparse_col         = ['capital-gain', 'capital-loss']

    if 'path_pipeline' in pars:   #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model                = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2                = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')
        lower_bound_sparse   = pars2.get('lower_bound_sparse', None)
        upper_bound_sparse   = pars2.get('upper_bound_sparse', None)
        lower_bound          = pars2.get('lower_bound', None)
        upper_bound          = pars2.get('upper_bound', None)
        sparse_col           = pars2.get('colsparse', sparse_col)

    ####### Find IQR and apply to dense and sparse columns separately ###################
    Q1  = df.quantile(0.25)
    Q3  = df.quantile(0.75)
    IQR = Q3 - Q1

    for col_ in num_col:
        if col_ in sparse_col:
            df_nosparse = pd.DataFrame(df[df[col_] != df[col_].mode()[0]][col_])

            if lower_bound_sparse is not None:
                pass
            elif df_nosparse[col_].quantile(0.25) < df[col_].mode()[0]:   # Unexpected case
                lower_bound_sparse = df_nosparse[col_].quantile(0.25)
            else:
                lower_bound_sparse = df[col_].mode()[0]

            if upper_bound_sparse is not None:
                pass
            elif df_nosparse[col_].quantile(0.75) < df[col_].mode()[0]:   # Unexpected case
                upper_bound_sparse = df[col_].mode()[0]
            else:
                upper_bound_sparse = df_nosparse[col_].quantile(0.75)

            n_outliers = len(df[(df[col_] < lower_bound_sparse) |
                                (df[col_] > upper_bound_sparse)][col_])
            if n_outliers > 0:
                df.loc[df[col_] < lower_bound_sparse, col_] = lower_bound_sparse * 0.75   # --> MAIN DF CHANGED
                df.loc[df[col_] > upper_bound_sparse, col_] = upper_bound_sparse * 1.25   # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col_].quantile(0.25) - 1.5 * IQR[col_]
                upper_bound = df[col_].quantile(0.75) + 1.5 * IQR[col_]
            df[col_] = np.where(df[col_] > upper_bound, 1.25 * upper_bound, df[col_])
            df[col_] = np.where(df[col_] < lower_bound, 0.75 * lower_bound, df[col_])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new   = {'lower_bound'        : lower_bound,
                  'upper_bound'        : upper_bound,
                  'lower_bound_sparse' : lower_bound_sparse,
                  'upper_bound_sparse' : upper_bound_sparse}
    dfnew  = df
    model  = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew,   pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model,    pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew   ### list
    }
    return dfnew, col_pars
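########################################################################################
### Illustrative sketch only (hypothetical helper): the core IQR clipping rule used
### above, applied to a single column.
def example_iqr_clip():
    import numpy as np
    import pandas as pd
    s = pd.Series([1.0, 2.0, 2.5, 3.0, 100.0])          # 100.0 is an outlier
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr    = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return np.where(s > upper, 1.25 * upper, np.where(s < lower, 0.75 * lower, s))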
def pd_colcross(df: pd.DataFrame, col: list = None, pars: dict = None):
    """  cross_feature_new = feat1 X feat2  (pair feature)
    """
    log("##### Cross Features From OneHot Features   ######################################")
    prefix = 'colcross_onehot'

    # params_check(pars, [('dfcat_hot', pd.DataFrame), 'colid', ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid     = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        dfnum_hot = dfnum_hot.drop_duplicates()   ### Creates a bug if ids are not unique
        df_onehot = dfcat_hot.reset_index().join(dfnum_hot, on=[colid], how='left')
        # df_onehot = pd.merge(dfcat_hot.reset_index(), dfnum_hot.reset_index(), on=[colid], how='left')
        # log4_pd('df_onehot', df_onehot)
        # log4(df_onehot.head(4).T)
        assert set(dfcat_hot.index) == set(dfnum_hot.index), "Not equal index between dfcat_hot, dfnum_hot"
        log4('index', colid, dfcat_hot.index)
        log4(dfnum_hot.index)
        # df_onehot = df_onehot.set_index(colid)
        log4('colid', colid)
        log4_pd('dfnum_hot', dfnum_hot)
        log4_pd('dfcat_hot', dfcat_hot)
    except Exception as e:
        log4('error', e)
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model      = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:   #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model    = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')
    log4('colcross_single', colcross_single, len(colcross_single))

    colcross_single_onehot_select = []   ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)
    colcross_single_onehot_select = sorted(list(set(colcross_single_onehot_select)))
    log4('colcross_single_select', colcross_single_onehot_select,
         len(colcross_single_onehot_select))

    df_onehot = df_onehot[colcross_single_onehot_select]
    log4_pd('df_onehot', df_onehot)
    dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot,
                                                           colcross_single_onehot_select,
                                                           **pars_model)
    log4_pd("dfcross_hot", dfcross_hot)
    colcross_pair_onehot = list(dfcross_hot.columns)
    model = None

    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot', pars['path_features_store'])
        save(colcross_single_onehot_select, pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,        pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot, pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model,                pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single' : col,                ### list
        'colcross_pair' : colcross_pair_onehot    ### list
    }
    return dfcross_hot, col_pars