def pd_colnum_bin(df, col, pars):
    """Bin the numeric columns ``col`` of ``df`` into categorical ``*_bin`` columns.

    :param df: dataframe holding the numeric columns.
    :param col: list of numeric column names, passed to ``pd_colnum_tocat``
        as ``colname``.
    :param pars: dict of options read by this function:
        ``path_pipeline`` (optional) - when truthy, an existing bin map is
        loaded from ``{path_pipeline}/colnum_binmap.pkl`` and reused;
        ``path_features_store`` (optional) - when the key is present, the
        binned dataframe, the bin map and the new column names are saved;
        ``path_pipeline_export`` - read only when ``path_features_store`` is
        present; target folder of the two exported pickle files.
    :return: ``(dfnum_bin, col_pars)`` where ``col_pars`` holds the bin map
        under ``'colnumbin_map'`` and the input / generated column names
        under ``'cols_new'``.
    """
    from util_feature import pd_colnum_tocat

    # Reuse an already fitted bin map when a pipeline folder is supplied.
    path_pipeline = pars.get('path_pipeline', False)
    if path_pipeline:
        binmap_prev = load(f'{path_pipeline}/colnum_binmap.pkl')
    else:
        binmap_prev = None
    log(binmap_prev)

    log("### colnum Map numerics to Category bin ###########################################")
    dfnum_bin, binmap = pd_colnum_tocat(
        df,
        colname=col,
        colexclude=None,
        colbinmap=binmap_prev,
        bins=10,
        suffix="_bin",
        method="uniform",
        return_val="dataframe,param",
    )
    log(binmap)

    # Names of the generated columns: one "<source>_bin" per key of the bin map.
    cols_bin = [name + "_bin" for name in binmap.keys()]
    log(cols_bin)

    if 'path_features_store' in pars:
        # Only the first five column names are used to tag the feature file.
        tag = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + tag, pars['path_features_store'])
        save(binmap, pars['path_pipeline_export'] + "/colnum_binmap.pkl")
        save(cols_bin, pars['path_pipeline_export'] + "/colnum_bin.pkl")

    col_pars = {
        'colnumbin_map': binmap,
        'cols_new': {
            'colnum': col,            # list
            'colnum_bin': cols_bin,   # list
        },
    }
    return dfnum_bin, col_pars
""" # ## Colnum # In[124]: ## Map numerics to Category bin dfnum, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None, bins=5, suffix="_bin", method="uniform", return_val="dataframe,param") print(colnum_binmap) # In[125]: colnum_bin = [ x + "_bin" for x in list( colnum_map.keys() ) ] print( colnum_bin ) # In[169]:
def preprocess(df, path_pipeline="data/pipeline/pipe_01/", preprocess_pars=None):
    """Apply an already fitted preprocessing pipeline to ``df``.

    A FUNCTIONAL approach is used for pre-processing so that the code can be
    easily extended to PySpark (which has good support for UDF / lambda
    functions).

    The column groups and encoders are loaded from ``{path_pipeline}/*.pkl``;
    each stage named in ``pipe_list`` then produces one dataframe, and all of
    them are concatenated column-wise to ``df[colnum + colcat]``.

    NOTE(review): another function named ``preprocess`` is defined later in
    this file; at import time the later definition replaces this one.

    :param df: input dataframe containing the ``colnum`` and ``colcat`` columns.
    :param path_pipeline: folder holding the pickled pipeline artefacts.
    :param preprocess_pars: optional dict; only the key ``'pipe_list'`` (list
        of stage names) is read here. ``None`` means "use the default stages".
    :return: ``(dfX, cols_family)`` - the merged feature dataframe and a dict
        of the column groups that are available (``None`` values are skipped).
    :raises ValueError: when cross features are requested but the
        ``'dfcat_hot'`` stage did not run.
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_toint,
                              pd_feature_generate_cross)

    preprocess_pars = {} if preprocess_pars is None else preprocess_pars

    log("########### Load column by column type ##################################")
    colid = load(f'{path_pipeline}/colid.pkl')
    coly = load(f'{path_pipeline}/coly.pkl')
    colcat = load(f'{path_pipeline}/colcat.pkl')
    colcat_onehot = load(f'{path_pipeline}/colcat_onehot.pkl')
    colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl')

    colnum = load(f'{path_pipeline}/colnum.pkl')
    colnum_binmap = load(f'{path_pipeline}/colnum_binmap.pkl')
    colnum_onehot = load(f'{path_pipeline}/colnum_onehot.pkl')

    ### OneHot column selected for cross features
    colcross_single_onehot_select = load(f'{path_pipeline}/colcross_single_onehot_select.pkl')

    pipe_default = ['filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot',
                    'dfcross_hot', ]
    pipe_list = preprocess_pars.get('pipe_list', pipe_default)

    # Explicit registries instead of locals() look-ups: a stage that did not
    # run is simply absent, and a misspelled name cannot be silently ignored.
    frames = {}      # stage name  -> generated dataframe
    cols_new = {}    # group name  -> column names created by a stage

    if "dfcat_bin" in pipe_list:
        log("###### Colcat as integer encoded ####################################")
        dfcat_bin, _ = pd_colcat_toint(df[colcat], colname=colcat,
                                       colcat_map=colcat_bin_map, suffix="_int")
        frames['dfcat_bin'] = dfcat_bin
        cols_new['colcat_bin'] = list(dfcat_bin.columns)

    if "dfcat_hot" in pipe_list:
        log("###### Colcat to onehot ###############################################")
        dfcat_hot, _ = pd_col_to_onehot(df[colcat], colname=colcat,
                                        colonehot=colcat_onehot, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        frames['dfcat_hot'] = dfcat_hot

    if "dfnum_bin" in pipe_list:
        log("###### Colnum Preprocess ###########################################")
        dfnum_bin, _ = pd_colnum_tocat(df, colname=colnum, colexclude=None,
                                       colbinmap=colnum_binmap, bins=-1, suffix="_bin",
                                       method="", return_val="dataframe,param")
        log(colnum_binmap)
        colnum_bin = [x + "_bin" for x in colnum_binmap.keys()]
        log(dfnum_bin[colnum_bin].head(5))
        frames['dfnum_bin'] = dfnum_bin
        cols_new['colnum_bin'] = colnum_bin

    # One-hot of the numeric bins needs the bins themselves; skip the stage
    # when 'dfnum_bin' did not run (same guard as the training-side function)
    # instead of failing with a NameError.
    if "dfnum_hot" in pipe_list and 'dfnum_bin' in frames:
        ###### Map numerics bin to One Hot
        colnum_bin = cols_new['colnum_bin']
        dfnum_hot, _ = pd_col_to_onehot(frames['dfnum_bin'][colnum_bin], colname=colnum_bin,
                                        colonehot=colnum_onehot, return_val="dataframe,param")
        log(dfnum_hot[colnum_onehot].head(5))
        frames['dfnum_hot'] = dfnum_hot

    # Debug output; .get() keeps it from crashing when a stage was skipped.
    print('------------dfcat_hot---------------------', frames.get('dfcat_hot'))
    print('------------dfnum_hot---------------------', frames.get('dfnum_hot'))
    print('------------colcross_single_onehot_select---------------------',
          colcross_single_onehot_select)

    if "dfcross_hot" in pipe_list:
        log("####### colcross cross features ###################################################")
        dfcross_hot = pd.DataFrame()
        if colcross_single_onehot_select is not None:
            if 'dfcat_hot' not in frames:
                raise ValueError("'dfcross_hot' requires the 'dfcat_hot' stage in pipe_list")
            if 'dfnum_hot' in frames:
                df_onehot = frames['dfcat_hot'].join(frames['dfnum_hot'], on=colid, how='left')
            else:
                # Same fallback as the training side: categorical one-hot only.
                df_onehot = frames['dfcat_hot']

            df_onehot = df_onehot[colcross_single_onehot_select]
            dfcross_hot, colcross_pair = pd_feature_generate_cross(
                df_onehot, colcross_single_onehot_select, pct_threshold=0.02, m_combination=2)
            log(dfcross_hot.head(2).T)
            cols_new['colcross_pair'] = colcross_pair
            # Exported under the name the export list below actually asks for.
            cols_new['colcross_pair_onehot'] = list(dfcross_hot.columns)
            del df_onehot
            gc.collect()
        frames['dfcross_hot'] = dfcross_hot

    log("##### Merge data type together : #######################3############################ ")
    dfX = df[colnum + colcat]
    for t in ('dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot'):
        if t in frames:
            dfX = pd.concat((dfX, frames[t]), axis=1)

    colsX = list(dfX.columns)
    del df
    gc.collect()

    log("###### Export columns group ##########################################################")
    available = {
        'colid': colid,
        'coly': coly,
        'colnum': colnum,
        'colnum_onehot': colnum_onehot,
        'colnum_binmap': colnum_binmap,
        'colcat': colcat,
        'colcat_onehot': colcat_onehot,
        'colcat_bin_map': colcat_bin_map,
        'colcross_single_onehot_select': colcross_single_onehot_select,
        'colsX': colsX,
    }
    available.update(cols_new)

    cols_family = {}
    for t in ['colid', 'coly',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",      #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",     #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",
              'colcross_pair',                                               #### colcross columns
              'colsX']:
        t_val = available.get(t)
        if t_val is not None:
            cols_family[t] = t_val

    return dfX, cols_family
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
               preprocess_pars=None, filter_pars=None, path_features_store=None):
    """Fit the preprocessing pipeline on the training data and export its artefacts.

    The dataset is loaded with ``load_dataset``, optionally filtered on the
    target, and each stage named in ``pipe_list`` generates one feature
    dataframe. Column groups / encoders are pickled into
    ``path_pipeline_export`` (one ``{name}.pkl`` per group, plus
    ``colsX.pkl`` and ``cols_family.pkl``), and generated features are passed
    to ``save_features`` with ``path_features_store``.

    NOTE(review): an earlier function in this file is also named
    ``preprocess``; this later definition is the one that remains bound.

    :param path_train_X: location of the feature data, passed to ``load_dataset``.
    :param path_train_y: location of the target data, passed to ``load_dataset``.
    :param path_pipeline_export: folder where pipeline artefacts are pickled
        (created if missing).
    :param cols_group: required dict with keys ``coly``, ``colid``, ``colcat``,
        ``colnum`` and optional ``colcross``, ``coltext``, ``coldate``.
    :param n_sample: passed to ``load_dataset``.
    :param preprocess_pars: optional dict; keys read: ``pipe_list`` (stage
        names, never modified in place) and ``y_norm_fun`` (callable applied
        to every target value).
    :param filter_pars: optional dict; keys read: ``ymin`` / ``ymax``
        (exclusive bounds on the target).
    :param path_features_store: passed through to ``save_features``.
    :return: ``(dfXy, cols_family)`` - merged dataframe (features + target)
        and the dict of exported column groups; ``cols_family['colX']`` lists
        every column of ``dfXy`` except the target.
    :raises ValueError: when cross features are requested but the
        ``'dfcat_hot'`` stage did not run.
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping,
                              pd_colcat_toint, pd_feature_generate_cross)

    preprocess_pars = {} if preprocess_pars is None else preprocess_pars
    filter_pars = {} if filter_pars is None else filter_pars

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly = cols_group['coly']      # 'salary'
    colid = cols_group['colid']    # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext = cols_group.get('coltext', [])
    coldate = cols_group.get('coldate', [])
    colall = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default = ['filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot',
                    'dfcross_hot', ]
    # Work on a copy: the date stage is always enabled, and appending to the
    # caller's own list would grow it by one entry on every call.
    pipe_list = list(preprocess_pars.get('pipe_list', pipe_default))
    if 'dfdate' not in pipe_list:
        pipe_list.append('dfdate')

    # Create the export folder before the first artefact (y_norm.pkl) is written.
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)

    # Explicit registries instead of locals() look-ups.
    frames = {}      # stage name -> generated dataframe (merged at the end)
    cols_new = {}    # group name -> column names / mapping produced by a stage

    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Filtering / cleaning rows :   #########################################################
    if "filter" in pipe_list:
        def isfloat(x):
            try:
                float(x)
                return 1
            except (TypeError, ValueError, OverflowError):
                return 0

        ymin = filter_pars.get('ymin', -9999999999.0)
        ymax = filter_pars.get('ymax', 999999999.0)
        print(coly)
        df['_isfloat'] = df[coly].apply(isfloat)
        print(df['_isfloat'])
        df = df[df['_isfloat'] > 0]
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]

    ##### Label processing   ####################################################################
    y_norm_fun = None
    if "label" in pipe_list:
        # Target coly processing, Normalization process , customize by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl')
            save_features(df[coly], 'dfy', path_features_store)

    ########### colnum procesing   #############################################################
    for x in colnum:
        print('bam', x)
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)

    if "dfnum_norm" in pipe_list:
        log("### colnum normalize ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = {'pipe_list': [{'name': 'fillna', 'naval': 0.0}, {'name': 'minmax'}]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum, pars=pars, suffix="_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        # Stored as features only; not merged into the returned dataframe.
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)

    if "dfnum_bin" in pipe_list:
        log("### colnum Map numerics to Category bin ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None,
                                                   bins=10, suffix="_bin", method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Renaming colunm_bin with suffix
        colnum_bin = [x + "_bin" for x in colnum_binmap.keys()]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_binmap', path_features_store)
        frames['dfnum_bin'] = dfnum_bin
        cols_new['colnum_binmap'] = colnum_binmap
        cols_new['colnum_bin'] = colnum_bin

    if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list:
        log("### colnum bin to One Hot")
        dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                    colonehot=None, return_val="dataframe,param")
        log(colnum_onehot)
        save_features(dfnum_hot, 'dfnum_onehot', path_features_store)
        frames['dfnum_hot'] = dfnum_hot
        cols_new['colnum_onehot'] = colnum_onehot

    ##### Colcat processing   ################################################################
    colcat_map = pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if "dfcat_hot" in pipe_list:
        log("#### colcat to onehot")
        dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat,
                                                    colonehot=None, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        save_features(dfcat_hot, 'dfcat_onehot', path_features_store)
        frames['dfcat_hot'] = dfcat_hot
        cols_new['colcat_onehot'] = colcat_onehot

    if "dfcat_bin" in pipe_list:
        log("#### Colcat to integer encoding ")
        dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat,
                                                    colcat_map=None, suffix="_int")
        save_features(dfcat_bin, 'dfcat_bin', path_features_store)
        frames['dfcat_bin'] = dfcat_bin
        cols_new['colcat_bin_map'] = colcat_bin_map
        cols_new['colcat_bin'] = list(dfcat_bin.columns)

    if "dfcross_hot" in pipe_list:
        log("##### Cross Features From OneHot Features ######################################")
        if 'dfcat_hot' not in frames:
            raise ValueError("'dfcross_hot' requires the 'dfcat_hot' stage in pipe_list")
        try:
            df_onehot = frames['dfcat_hot'].join(frames['dfnum_hot'], on=colid, how='left')
        except Exception as e:
            # Deliberate best-effort: when the numeric one-hot frame is missing
            # or cannot be joined, cross features use the categorical one only.
            log("dfcross_hot: using dfcat_hot only,", repr(e))
            df_onehot = frames['dfcat_hot'].copy()

        # Keep each one-hot column at most once, even when its name contains
        # several of the requested single-column names.
        colcross_single_onehot_select = [
            t for t in list(df_onehot) if any(c1 in t for c1 in colcross_single)
        ]

        df_onehot = df_onehot[colcross_single_onehot_select]
        dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                               pct_threshold=0.02, m_combination=2)
        log(dfcross_hot.head(2).T)
        save_features(dfcross_hot, 'dfcross_onehot', path_features_store)
        frames['dfcross_hot'] = dfcross_hot
        cols_new['colcross_single_onehot_select'] = colcross_single_onehot_select
        cols_new['colcross_pair'] = colcross_pair
        # Kept (not deleted) so that the export loop below can actually save it.
        cols_new['colcross_pair_onehot'] = list(dfcross_hot.columns)
        del df_onehot

    if "dftext" in pipe_list:
        log("##### Coltext processing ###############################################################")
        stopwords = nlp_get_stopwords()
        pars = {'n_token': 100, 'stopwords': stopwords}
        dftext = None
        for coltext_i in coltext:
            ##### Run the text processor on each column text
            dftext_i = pipe_text(df[[coltext_i]], coltext_i, pars)
            dftext = pd.concat((dftext, dftext_i), axis=1) if dftext is not None else dftext_i
            save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)
        if dftext is not None:    # nothing to log / save when coltext is empty
            log(dftext.head(6))
            save_features(dftext, 'dftext', path_features_store)
            frames['dftext'] = dftext

    if "dfdate" in pipe_list:
        log("##### Coldate processing #############################################################")
        from utils import util_date
        dfdate = None
        for coldate_i in coldate:
            dfdate_i = util_date.pd_datestring_split(df[[coldate_i]], coldate_i, fmt="auto",
                                                     return_val="split")
            dfdate = pd.concat((dfdate, dfdate_i), axis=1) if dfdate is not None else dfdate_i
            save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)
        if dfdate is not None:    # nothing to save / merge when coldate is empty
            save_features(dfdate, 'dfdate', path_features_store)
            frames['dfdate'] = dfdate
        print('spoo', dfdate)

    ##### Save pre-processor meta-parameters
    available = {
        'colid': colid,
        'colnum': colnum,
        'colcat': colcat,
        'coldate': coldate,
        'coltext': coltext,
        'coly': coly,
        'y_norm_fun': y_norm_fun,
    }
    available.update(cols_new)

    cols_family = {}
    for t in ['colid',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",      #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",     #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",
              'colcross_pair',                                               #### colcross columns
              'coldate', 'coltext',
              "coly", "y_norm_fun"]:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = available.get(t)
        if t_val is not None:
            save(t_val, tfile)
            cols_family[t] = t_val

    ###### Merge AlL  #############################################################################
    dfXy = df[colnum + colcat + [coly]]
    print('localTT', dfXy)
    for t in ('dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', 'dfdate', 'dftext'):
        frame = frames.get(t)
        if frame is not None:
            print('localT', t, frame)
            dfXy = pd.concat((dfXy, frame), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy, f'{path_pipeline_export}/colsX.pkl')
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values  #########################################################################
    return dfXy, cols_family