def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """Load the model dict from the python config file.

    When a dict is passed directly, the callable members lost while
    pickling the model on disk are re-resolved dynamically by name from
    the config module recorded in ``global_pars``.

    :param model_dict:  pre-built model dict, or None to load it dynamically.
    :param config_path: path of the python config file.
    :param config_name: name of the config function returning the dict.
    :param verbose:     unused in this variant; kept for interface parity.
    :return: the model parameter dict.
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading ###############################################" )
        loader = load_function_uri(uri_name=config_path + "::" + config_name)
        return loader()  ### params

    ### A dict was passed : function definitions are LOST when the model is
    ### saved on disk, so reload each function dynamically by its name.
    path_config = model_dict['global_pars']['config_path']
    mpars = model_dict['model_pars']

    uri_post = path_config + "::" + mpars['post_process_fun'].__name__
    mpars['post_process_fun'] = load_function_uri(uri_post)

    prepro = mpars['pre_process_pars']
    uri_norm = path_config + "::" + prepro['y_norm_fun'].__name__
    prepro['y_norm_fun'] = load_function_uri(uri_norm)
    return model_dict
def pd_ts_generic( df, col=None, pars=None, ):
    """Apply a generic time-series transform, resolved by URI, to columns of df.

    pars example::

        { 'name': 'deltapy.transform::robust_scaler',
          'pars': {'drop':["Close_1"]} }

    :param df:   input dataframe (time-indexed).
    :param col:  list of column names to feed the transform.
    :param pars: dict with 'name' (function URI "module::func") and optional
                 'pars' keyword arguments for that function.
    :return: dataframe of generated features, re-indexed on ``df.index``;
             columns are renamed "<col>_<name>" (or "0_<name>" for extract models).
    """
    ###### Custom code ################################################################
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    dfin = df[col]
    dfin = dfin.fillna(method='ffill')
    if 'a_chi' in model_name:
        # Normalize the input for the chi transform (expects values in [0, 1]).
        dfin = (dfin - dfin.min()) / (dfin.max() - dfin.min())

    ##### Transform Data ############################################################
    model = load_function_uri(model_name)
    df_out = model(dfin, **model_pars)

    if 'extract' in model_name:
        # Extract only returns one value, so no columns to loop over.
        # BUGFIX: wrap in a list — assigning a bare string to DataFrame.columns
        # iterates it character by character and raises a length mismatch.
        col_out = ["0_" + model_name]
    else:
        model_name2 = model_name.replace("::", "-")
        col_out = [coli + "_" + model_name2 for coli in df_out.columns]

    df_out.columns = col_out
    df_out.index = df.index
    return df_out
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """Return the model dict, building it from the python config file when None.

    :param model_dict:  already-built model dict, or None to trigger dynamic loading.
    :param config_path: path of the python config file.
    :param config_name: name of the no-argument config function inside that file.
    :param verbose:     when True, log the resulting dict.
    :return: the model parameter dict.
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading ###############################################")
        # Resolve and invoke the config function in one expression.
        model_dict = load_function_uri(uri_name=config_path + "::" + config_name)()  ### params
    if verbose:
        log(model_dict)
    return model_dict
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """Load the model dict from the python config file.

    :param model_dict:  pre-built dict, or None to load it dynamically.
    :param config_path: path of the python config file.
    :param config_name: name of the config function inside that file.
    :param verbose:     unused in this variant; kept for interface parity.
    :return: the model parameter dict (always traced through log3).
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading ###############################################" )
        builder = load_function_uri(uri_name=config_path + "::" + config_name)
        model_dict = builder()  ### params
    log3(model_dict)
    return model_dict
def pd_colts_generate(df=None, col=None, pars=None):
    """Generate time-series features via a dynamically loaded transform.

    pars : { 'model_name' : "transform.robust_scaler",
             'model_pars' : {} }

    At prediction time (``'path_pipeline' in pars``) the fitted model and its
    params are reloaded from disk; at training time the function is resolved
    dynamically from ``model_name``.

    :param df:   input dataframe.
    :param col:  list of column names to transform.
    :param pars: parameter dict (see example above); may also carry
                 'path_pipeline', 'path_features_store', 'path_pipeline_export'.
    :return: (df_out, col_pars) — generated features and column metadata.
    """
    prefix = 'colts_generate'
    # BUGFIX: mutable default argument replaced by None sentinel.
    pars = {} if pars is None else pars

    ###### Custom code ################################################################
    dfin = df[col].fillna(method='ffill')
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:
        #### Prediction time : reload fitted model and params from disk
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:
        ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)
        model_name = model_name.replace(".", "_")

    ##### Transform Data ############################################################
    df_out = model(dfin, col, **model_pars)
    col_out = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    # BUGFIX: was `train_X.index` — undefined name (NameError). Align the
    # generated features on the input dataframe, as pd_ts_generic does.
    df_out.index = df.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
def run_preprocess(model_name, path_data, path_output,
                   path_config_model="source/config_model.py",
                   n_sample=5000,
                   mode='run_preprocess',):
    #prefix "pre" added, in order to make if loop possible
    """Run (or reload) the preprocessing pipeline for one model configuration.

    Configuration of the model is in config_model.py file.

    :param model_name:        name of the config function inside path_config_model.
    :param path_data:         input data folder, relative to global ``root``.
    :param path_output:       output folder, relative to global ``root``.
    :param path_config_model: python file holding the model config functions.
    :param n_sample:          number of training rows to use.
    :param mode:              'run_preprocess' to compute features,
                              'load_preprocess' to reload them from disk.
    :raises ValueError: if ``mode`` is not one of the two supported values.
    """
    path_output = root + path_output
    path_data = root + path_data
    path_features_store = path_output + "/features_store/"
    path_pipeline_out = path_output + "/pipeline/"
    path_model_out = path_output + "/model/"
    path_check_out = path_output + "/check/"
    path_train_X = path_data + "/features*"  ### Can be a list of zip or parquet files
    path_train_y = path_data + "/target*"  ### Can be a list of zip or parquet files
    log(path_output)

    log("#### load input column family ###################################################")
    # BUGFIX: use a context manager so the file handle is always closed.
    with open(path_data + "/cols_group.json", mode='r') as fjson:
        cols_group = json.load(fjson)
    log(cols_group)

    log("#### Model parameters Dynamic loading ############################################")
    model_dict_fun = load_function_uri(uri_name= path_config_model + "::" + model_name)
    model_dict = model_dict_fun(path_model_out)  ### params

    log("#### Preprocess #################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    filter_pars = model_dict['data_pars']['filter_pars']

    # NOTE(review): filter_pars is forwarded positionally here — confirm the
    # preprocess/preprocess_load signatures accept it in this position.
    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y, path_pipeline_out, cols_group,
                                n_sample, preprocess_pars, filter_pars, path_features_store)
    elif mode == "load_preprocess":
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline_out, cols_group,
                                     n_sample, preprocess_pars, filter_pars, path_features_store)
    else:
        # BUGFIX: an unknown mode previously fell through and raised a
        # confusing NameError on dfXy below; fail fast with a clear message.
        raise ValueError(f"Unknown mode: {mode!r} (expected 'run_preprocess' or 'load_preprocess')")

    model_dict['data_pars']['coly'] = cols['coly']

    ### Generate actual column names from colum groups : colnum , colcat
    model_dict['data_pars']['cols_model'] = sum([ cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group'] ] , [])
    log( model_dict['data_pars']['cols_model'] , model_dict['data_pars']['coly'])

    log("######### finish #################################", )
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """model_dict_load

    Load the model dict, resolving it dynamically from the python config
    file when it is not already provided.

    Args:
        model_dict (dict | None): pre-built model dict; when None it is
            built by calling ``config_name`` inside ``config_path``.
        config_path (str): path of the python config file.
        config_name (str): name of the no-argument function in that file
            which returns the model dict.
        verbose (bool, optional): log the resulting dict. Defaults to True.

    Returns:
        dict: the model parameter dict.
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading ###############################################" )
        model_dict_fun = load_function_uri(uri_name=config_path + "::" + config_name)
        model_dict = model_dict_fun()  ### params
    if verbose:
        log(model_dict)
    return model_dict
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000, preprocess_pars={}, path_features_store=None):
    """ Used for training only: run the feature pipeline and save params on disk.

    Loads the training data, runs the configured (or default) chain of
    pd_* processors, merges all generated feature frames into one dfXy,
    and stores column families + processor params under path_pipeline_export.

    NOTE(review): a caller in this file passes an extra ``filter_pars``
    positional argument between preprocess_pars and path_features_store —
    confirm which signature version is current.

    :param path_train_X:         glob/path of feature files.
    :param path_train_y:         glob/path of target files.
    :param path_pipeline_export: folder where pipeline params are saved.
    :param cols_group:           dict of column families (coly, colid, colcat, colnum, ...).
    :param n_sample:             number of rows to load.
    :param preprocess_pars:      dict; key 'pipe_list' overrides the default pipeline.
    :param path_features_store:  folder where intermediate features are saved.
    :return: (dfXy, cols_family_all) — merged features and dict of column families.
    """
    ##### column names for feature generation #####################################################
    log(cols_group)
    coly = cols_group['coly']  # 'salary'
    colid = cols_group['colid']  # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution ##########################################
    # Default processor chain; each entry names a function URI, its params,
    # the input column family, and a type tag ('coly'/'filter'/'cross'/'').
    pipe_default = [{
        'uri': 'source/prepro.py::pd_coly',
        'pars': {},
        'cols_family': 'coly',
        'type': 'coly'
    }, {
        'uri': 'source/prepro.py::pd_colnum_bin',
        'pars': {},
        'cols_family': 'colnum',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colnum_binto_onehot',
        'pars': {},
        'cols_family': 'colnum_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_bin',
        'pars': {},
        'cols_family': 'colcat',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_to_onehot',
        'pars': {},
        'cols_family': 'colcat_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcross',
        'pars': {},
        'cols_family': 'colcross',
        'type': 'cross'
    }]
    pipe_list = preprocess_pars.get('pipe_list', pipe_default)
    # Split the pipeline by type: X processors, target processor, row filters.
    pipe_list_X = [
        task for task in pipe_list
        if task.get('type', '') not in ['coly', 'filter']
    ]
    pipe_list_y = [
        task for task in pipe_list if task.get('type', '') in ['coly']
    ]
    pipe_filter = [
        task for task in pipe_list if task.get('type', '') in ['filter']
    ]

    ##### Load data #################################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features ##########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("##### Filter #########################################################################" )
        # Only the first filter task is applied; it may drop rows/columns.
        pipe_i = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("##### coly ###########################################################################" )
        pipe_i = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars = pipe_i.get('pars', {})
        pars['path_features_store'] = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'], pars=pars)  ### coly can remove rows
        logs("----------df----------\n", df)
        dfi_all['coly'] = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly", path_features_store)  ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    ##### Processors ###############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    #for colg, colg_list in cols_group.items() :
    #    if colg not in ['colid']:
    #        dfi_all[colg] = df[colg_list]  ## colnum colcat, coly
    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "##########################################################")
        pipe_fun = load_function_uri(pipe_i['uri'])  ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type = pipe_i['type']

        pars = pipe_i.get('pars', {})
        pars['path_features_store'] = path_features_store  ### intermediate dataframe
        pars['path_pipeline_export'] = path_pipeline_export  ### Store pipeline

        if col_type == 'cross':
            log("################### Adding Cross ###################################################" )
            # assumes earlier onehot steps registered these families — TODO confirm
            pars['dfnum_hot'] = dfi_all['colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot'] = dfi_all['colcat_onehot']
            pars['colid'] = colid
            pars['colcross_single'] = cols_group.get('colcross', [])
        elif col_type == 'add_coly':
            log('add_coly genetic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy'] = dfi_all['coly']  ### Transformed dfy

        ### Input columns or previously Computed Columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(dfi_all[cols_name].columns)
        df_ = df[cols_list] if cols_name in cols_group else dfi_all[cols_name]
        #cols_list = list(dfi_all[cols_name].columns)
        #df_ = dfi_all[cols_name]
        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all , dfi_all ###########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ###### Merge AlL int dfXy ##################################################################
    dfXy = df[[coly] + colnum + colcat]
    #dfXy = df[ [coly] ]
    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)  ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    #### Cols group for model input ###########################################################
    save(colXy, f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values #######################################################################
    return dfXy, cols_family_all
def preprocess_inference(df, path_pipeline="data/pipeline/pipe_01/", preprocess_pars={}, cols_group=None):
    """ At Inference time, load model, params and preprocess data.
    Not saving the data, only output final dataframe

    :param df: input dataframe
    :param path_pipeline: path where processors are stored
    :param preprocess_pars: dict of params specific to preprocessing
    :param cols_group: dict of column family
    :return: dfXy Final dataframe, cols_family_full : dict of column family
    """
    from util_feature import load, load_function_uri, load_dataset

    #### Pipeline Execution ####################################################
    # Default processor chain (no 'coly' step at inference time).
    pipe_default = [{
        'uri': 'source/prepro.py::pd_colnum_bin',
        'pars': {},
        'cols_family': 'colnum',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colnum_binto_onehot',
        'pars': {},
        'cols_family': 'colnum_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_bin',
        'pars': {},
        'cols_family': 'colcat',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_to_onehot',
        'pars': {},
        'cols_family': 'colcat_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcross',
        'pars': {},
        'cols_family': 'colcross',
        'type': 'cross'
    }]
    pipe_list = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [
        task for task in pipe_list
        if task.get('type', '') not in ['coly', 'filter']
    ]
    pipe_filter = [
        task for task in pipe_list if task.get('type', '') in ['filter']
    ]

    log("########### Load column by column type ##################################" )
    # NOTE(review): the cols_group parameter is overwritten here — the value
    # actually used comes from preprocess_pars['cols_group'].
    cols_group = preprocess_pars['cols_group']
    log(cols_group)  ### list of model columns family
    colid = cols_group['colid']  # "jobId"
    coly = cols_group['coly']
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    ##### Generate features ########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_full = {'coly': coly}

    if len(pipe_filter) > 0:
        log("##### Filter #######################################################################" )
        # Only the first filter task is applied; it may drop rows/columns.
        pipe_i = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    ##### Processors #############################################################################
    #for colg, colg_list in cols_group.items() :
    #    if colg not in ['colid', 'coly' ]:
    #        dfi_all[colg] = df[colg_list]  ## colnum colcat, coly
    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "#######################################################")
        pipe_fun = load_function_uri(pipe_i['uri'])  ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type = pipe_i['type']
        pars = pipe_i.get('pars', {})

        ### Load data from disk : inference time
        pars['path_pipeline'] = path_pipeline

        # Input columns come from cols_group, or from a previously computed family.
        cols_list = cols_group[cols_name] if cols_name in cols_group else cols_family_full[cols_name]
        df_ = df[cols_group[cols_name]] if cols_name in cols_group else dfi_all[cols_name]
        # cols_list = list(dfi_all[cols_name].columns)
        # df_ = dfi_all[cols_name]
        logs(df_, cols_list)

        if col_type == 'cross':
            # assumes earlier onehot steps registered these families — TODO confirm
            pars['dfnum_hot'] = dfi_all['colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot'] = dfi_all['colcat_onehot']
            pars['colid'] = colid
            pars['colcross_single'] = cols_group.get('colcross', [])
        elif col_type == 'add_coly':
            pass

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_full[colj] = cols_family_full.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi

    log("###### Merge AlL int dfXy #############################################################" )
    dfXy = df[colnum + colcat]
    for t in dfi_all.keys():
        if t not in ['colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    colXy = list(dfXy.columns)
    if len(colid) > 0:
        cols_family_full['colid'] = colid
    cols_family_full['colX'] = colXy
    return dfXy, cols_family_full