def pd_col_myfun(df=None, col=None, pars=None):
    """Example of a custom processor: select columns and suffix them with ``_myfun``.

    NOTE(review): a second ``pd_col_myfun`` definition follows this one in the
    file; Python binds the name to the later definition.

    Args:
        df: input DataFrame.
        col: column name, or list of column names, to process. ``None`` selects
            every column of ``df``.
        pars: optional dict of settings. Keys read here:
            ``'path_pipeline'`` - inference time, previously saved state is loaded
            from this folder;
            ``'path_features_store'`` and ``'path_pipeline_export'`` - when both
            are present, the state of this run is saved under
            ``'path_pipeline_export'``.

    Returns:
        ``(df_new, col_pars)`` where ``df_new`` holds the selected columns renamed
        with the ``_myfun`` suffix and ``col_pars`` is a dict with the keys
        ``'prefix'``, ``'path'`` and ``'cols_new'``.
    """
    # NOTE(review): the trailing backtick looks like a typo, but the prefix is part
    # of the artifact file names, so it is kept byte-for-byte.
    prefix = 'col_myfun`'
    pars = {} if pars is None else pars

    if col is None:
        cols = list(df.columns)
    elif isinstance(col, str):
        cols = [col]
    else:
        cols = list(col)

    if 'path_pipeline' in pars:
        #### Inference time: LOAD previous state.
        # The loaded values go into their own names: rebinding `pars` here would
        # drop 'path_pipeline' and the export keys that are read further down.
        from source.util_feature import load
        prepro = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars_saved = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        pars_saved = {} if pars_saved is None else pars_saved
        # Template placeholders: a real processor would apply them below.

    #### Do something #################################################################
    df_new = df[cols]  # list selection returns a new frame, `df` keeps its names
    # Rename from the SELECTED columns; using df.columns breaks for any subset.
    df_new.columns = [f"{name}_myfun" for name in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None     # nothing is fitted in this example
    pars_new = None

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save
        save(prepro, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(cols_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None)),
    }
    col_pars['cols_new'] = {
        'col_myfun': cols_new  ### list
    }
    return df_new, col_pars
def pd_col_myfun(df=None, col=None, pars=None):
    """Example of a custom processor: select columns and append ``_myfun`` to their names.

    NOTE(review): an earlier definition with the same name precedes this one in
    the file; this later definition is the one Python keeps.

    Args:
        df: input DataFrame.
        col: column name, or list of column names, to process. ``None`` selects
            every column of ``df``.
        pars: optional dict of settings. Keys read here:
            ``"path_pipeline"`` - inference time, previously saved state is loaded
            from this folder;
            ``"path_features_store"`` and ``"path_pipeline_export"`` - when both
            are present, the state of this run is saved under
            ``"path_pipeline_export"``.

    Returns:
        ``(df_new, col_pars)``: the renamed selection, and a dict with the keys
        ``"prefix"``, ``"path"`` and ``"cols_new"``.
    """
    # NOTE(review): the trailing backtick looks accidental; it is preserved
    # because it is embedded in the names of the saved artifacts.
    prefix = "col_myfun`"
    pars = {} if pars is None else pars

    if col is None:
        selected = list(df.columns)
    elif isinstance(col, str):
        selected = [col]
    else:
        selected = list(col)

    if "path_pipeline" in pars:
        #### Inference time: LOAD previous state.
        # Kept apart from `pars` so that "path_pipeline" and the export keys
        # remain readable for the rest of the function.
        from source.util_feature import load
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        saved_pars = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        saved_pars = {} if saved_pars is None else saved_pars
        # Template placeholders: a real processor would use them in the step below.

    #### Do something #################################################################
    df_new = df[selected]  # list selection yields a new frame; `df` is not renamed
    # The new names must come from the selection itself, not from df.columns,
    # otherwise the lengths differ whenever `col` is a subset.
    df_new.columns = [f"{name}_myfun" for name in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None     # this example fits nothing
    pars_new = None

    ###################################################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        from source.util_feature import save
        export_dir = pars["path_pipeline_export"]
        save(prepro, export_dir + f"/{prefix}_model.pkl")
        save(cols_new, export_dir + f"/{prefix}.pkl")
        save(pars_new, export_dir + f"/{prefix}_pars.pkl")

    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export", pars.get("path_pipeline", None)),
        "cols_new": {
            "col_myfun": cols_new  ### list
        },
    }
    return df_new, col_pars
def pd_colnum_quantile_norm(df, col, pars=None):
    """Cap numeric outliers column by column using quantile-based bounds.

    Dense columns use ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``: values above the upper
    bound become ``1.25 * upper`` and values below the lower bound become
    ``0.75 * lower``. Columns listed as sparse are dominated by one value (their
    mode), so their bounds come from the quartiles of the non-mode values and
    the mode itself.

    Bounds are computed and stored PER COLUMN. Parameter files that hold a
    single scalar per key (the older layout) are still accepted; the scalar is
    then applied to every column of that kind.

    Args:
        df: input DataFrame; it is not modified.
        col: column name, or list of column names, to normalize.
        pars: optional dict of settings. Keys read here:
            ``'path_pipeline'`` - inference time, saved bounds are loaded from
            ``<path_pipeline>/colnum_quantile_norm_pars.pkl``;
            ``'colsparse'`` - names of the sparse columns (defaults to
            ``['capital-gain', 'capital-loss']``; a list stored in the loaded
            parameters takes precedence);
            ``'path_features_store'`` and ``'path_pipeline_export'`` - when both
            are present, column names, bounds and a ``None`` model are saved
            under ``'path_pipeline_export'``.

    Returns:
        ``(dfnew, col_pars)``: the processed columns renamed with the ``_qt_norm``
        suffix, and a dict with the keys ``'prefix'``, ``'path'`` and ``'cols_new'``.
    """
    import numpy as np

    prefix = "colnum_quantile_norm"
    pars = {} if pars is None else pars
    num_col = [col] if isinstance(col, str) else list(col)

    # Private copy: the columns are overwritten below, and writing into a bare
    # selection of the caller's frame triggers pandas' chained-assignment warning.
    df = df[num_col].copy()

    ##### Grab previous computed params ################################################
    pars2 = {}
    if 'path_pipeline' in pars:
        from source.util_feature import load
        # Only the parameter file is used here; the model saved by this
        # processor is always None and the column list is rebuilt below.
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')
        pars2 = {} if pars2 is None else pars2

    sparse_col = pars2.get('colsparse',
                           pars.get('colsparse', ['capital-gain', 'capital-loss']))

    def saved_bound(key, name):
        """Bound stored for column `name`; a non-dict value is the older scalar layout."""
        value = pars2.get(key)
        return value.get(name) if isinstance(value, dict) else value

    bounds = {'lower_bound': {}, 'upper_bound': {},
              'lower_bound_sparse': {}, 'upper_bound_sparse': {}}

    ########### Compute #################################################################
    for name in num_col:
        series = df[name]

        if name in sparse_col:
            low = saved_bound('lower_bound_sparse', name)
            high = saved_bound('upper_bound_sparse', name)
            if low is None or high is None:
                modes = series.mode()
                if modes.empty:      # all-NaN / empty column: nothing to cap
                    continue
                mode = modes.iloc[0]
                non_mode = series[series != mode]
                if low is None:
                    q25 = non_mode.quantile(0.25)
                    low = q25 if q25 < mode else mode      # first case is unexpected
                if high is None:
                    q75 = non_mode.quantile(0.75)
                    high = mode if q75 < mode else q75     # first case is unexpected
            bounds['lower_bound_sparse'][name] = low
            bounds['upper_bound_sparse'][name] = high

            # Leave the column (and its dtype) untouched when nothing is out of range.
            if ((series < low) | (series > high)).any():
                values = np.where(series < low, low * 0.75, series)
                values = np.where(values > high, high * 1.25, values)
                df[name] = values

        else:
            low = saved_bound('lower_bound', name)
            high = saved_bound('upper_bound', name)
            if low is None or high is None:
                q1 = series.quantile(0.25)
                q3 = series.quantile(0.75)
                iqr = q3 - q1
                low = q1 - 1.5 * iqr
                high = q3 + 1.5 * iqr
            bounds['lower_bound'][name] = low
            bounds['upper_bound'][name] = high

            values = np.where(series > high, 1.25 * high, series)
            values = np.where(values < low, 0.75 * low, values)
            df[name] = values

    df.columns = [f"{t}_qt_norm" for t in df.columns]
    pars_new = dict(bounds)
    pars_new['colsparse'] = list(sparse_col)   # so inference uses the same split
    dfnew = df
    model = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save
        # save_features(df, prefix, pars['path_features_store'])
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get("path_pipeline", None))}
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars