def salary_lightgbm(path_model_out=""):
    """ Huber loss includes L1 regularization.
        We test different feature combinations; the default params are optimal.
    """
    data_name   = "salary"
    model_class = 'LGBMRegressor'
    n_sample    = 10**5

    def post_process_fun(y):
        return y_norm(y, inverse=True, mode='boxcox')

    def pre_process_fun(y):
        return y_norm(y, inverse=False, mode='boxcox')

    model_dict = {'model_pars': {'model_class': model_class
        ,'model_path'       : path_model_out
        ,'model_pars'       : {'objective': 'huber', }   # default
        ,'post_process_fun' : copy.deepcopy(post_process_fun)
        ,'pre_process_pars' : {'y_norm_fun' : copy.deepcopy(pre_process_fun),

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',                 'type': 'coly' },
            {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',           'type': ''     },
            {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot',        'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',           'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot',        'type': ''     },
            {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair_onehot', 'type': 'cross'}
        ],
        }
        },

    'compute_pars': { 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    },

    'data_pars': { 'cols_input_type' : cols_input_type_1
                  # cols['cols_model'] = cols["colnum"] + cols["colcat_bin"]  # + cols["colcross_onehot"]
                  ,'cols_model_group': [ 'colnum', 'colcat_bin']
                  ,'filter_pars': { 'ymax' : 100000.0 ,'ymin' : 0.0 }   ### Filter data
                 }}

    ################################################################################################
    ##### Filling Global parameters    #############################################################
    model_dict = global_pars_update(model_dict, data_name, os_get_function_name())
    return model_dict
def house_price_lightgbm(path_model_out=""):
    """ Huber loss includes L1 regularization.
        We test different feature combinations; the default params are optimal.
    """
    data_name  = 'house_price'
    model_name = 'LGBMRegressor'
    n_sample   = 20000

    def post_process_fun(y):
        return y_norm(y, inverse=True, mode='norm')

    def pre_process_fun(y):
        return y_norm(y, inverse=False, mode='norm')

    model_dict = {'model_pars': {
          'model_path'        : path_model_out
        , 'model_class'       : model_name        ### Actual Class Name
        , 'model_pars'        : {}                # default ones of the model name
        , 'post_process_fun'  : post_process_fun
        , 'pre_process_pars'  : {'y_norm_fun' : copy.deepcopy(pre_process_fun),

        ### Pipeline for data processing.
        # 'pipe_list' : [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
        'pipe_list' : [ 'filter', 'label', 'dfcat_bin' ]
        }
        },

    'compute_pars': { 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    },

    'data_pars': { 'cols_input_type' : cols_input_type_1,
                  # 'cols_model_group': [ 'colnum_onehot', 'colcat_onehot', 'colcross_onehot' ]
                  'cols_model_group': [ 'colnum', 'colcat_bin' ]
                  ,'filter_pars': { 'ymax' : 1000000.0 ,'ymin' : 0.0 }   ### Filter data
                 }}

    ################################################################################################
    ##### Filling Global parameters    #############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
def salary_bayesian_pyro(path_model_out=""):
    global model_name
    model_name = 'model_bayesian_pyro'
    data_name  = "salary"     ### in data/input/ (required by global_pars_update below)

    def post_process_fun(y):
        return y_norm(y, inverse=True, mode='boxcox')

    def pre_process_fun(y):
        return y_norm(y, inverse=False, mode='boxcox')

    model_dict = {'model_pars': {'model_class': 'model_bayesian_pyro'
        , 'model_path'       : path_model_out
        , 'model_pars'       : {'input_width': 112, }   # default
        , 'post_process_fun' : post_process_fun
        , 'pre_process_pars' : {'y_norm_fun' : copy.deepcopy(pre_process_fun),

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',                 'type': 'coly' },
            {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',           'type': ''     },
            {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot',        'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',           'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot',        'type': ''     },
            {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair_onehot', 'type': 'cross'}
        ],
        }
        },

    'compute_pars': {'compute_pars': {'n_iter': 1200, 'learning_rate': 0.01}
                    , 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    , 'max_size'    : 1000000
                    , 'num_samples' : 300
                    },

    'data_pars': { 'cols_input_type'  : cols_input_type_1
                  ,'cols_model_group' : [ 'colnum_onehot', 'colcat_onehot' ]
                  ,'filter_pars': { 'ymax' : 100000.0 ,'ymin' : 0.0 }   ### Filter data
                 }}

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, os_get_function_name())
    return model_dict
def house_price_elasticnetcv(path_model_out=""):
    model_name  = 'ElasticNetCV'
    config_name = 'house_price_elasticnetcv'
    data_name   = 'house_price'     ### in data/input/ (required by global_pars_update below)
    n_sample    = 1000

    def post_process_fun(y):
        return y_norm(y, inverse=True, mode='norm')

    def pre_process_fun(y):
        return y_norm(y, inverse=False, mode='norm')

    model_dict = {'model_pars': {'model_class': 'ElasticNetCV'
        , 'model_path'       : path_model_out
        , 'model_pars'       : {}                # default ones
        , 'post_process_fun' : post_process_fun
        , 'pre_process_pars' : {'y_norm_fun' : pre_process_fun,

        ### Pipeline for data processing.
        # 'pipe_list' : [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
        'pipe_list' : [ 'filter', 'label', 'dfcat_hot' ]
        }
        },

    'compute_pars': { 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    },

    'data_pars': { 'cols_input_type' : cols_input_type_1,
                  # 'cols_model_group': [ 'colnum_onehot', 'colcat_onehot', 'colcross_onehot' ]
                  'cols_model_group': [ 'colnum', 'colcat_onehot' ]
                  ,'filter_pars': { 'ymax' : 100000.0 ,'ymin' : 0.0 }   ### Filter data
                 }}

    ################################################################################################
    ##### Filling Global parameters    #############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
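####################################################################################
# The regression configs above call y_norm() to normalize / de-normalize the target,
# but it is not defined in this section.  The sketch below is a hedged placeholder
# (suffixed _sketch so it cannot shadow the real function): it assumes 'boxcox'
# means a fixed-exponent Box-Cox style transform and 'norm' a simple affine
# rescaling; the constants are illustrative, not the project's actual values.
import numpy as np   # assumed available

def y_norm_sketch(y, inverse=True, mode='boxcox'):
    """ Hypothetical sketch of the target normalizer used by the configs above. """
    if mode == 'boxcox':
        width0, k1 = 53.0, 0.6              # assumed scale and Box-Cox exponent
        if inverse:
            y2 = y * width0
            y2 = (y2 * k1) + 1.0
            return np.power(y2, 1.0 / k1)   # undo the forward transform
        y1 = (np.power(y, k1) - 1.0) / k1
        return y1 / width0
    if mode == 'norm':
        m0, width0 = 0.0, 350.0             # assumed mean and scale
        return y * width0 + m0 if inverse else (y - m0) / width0
    return y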
        ### family of columns used for model input  #####################################################
        "cols_model_group": [ "colnum",        ### numerical continuous
                              "colcat_bin",    ### category
                            ]

        ### Filter data rows   ##################################################################
        ,"filter_pars": { "ymax" : 2 ,"ymin" : -1 }
    }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict


###################################################################################
########## Preprocess #############################################################
### def preprocess(config="", nsample=1000):
from core_run import preprocess

"""
def preprocess(config=None, nsample=None):
    config_name = config if config is not None else config_default
    mdict       = globals()[config_name]()
    m           = mdict["global_pars"]
def titanic1(path_model_out="") : """ Contains all needed informations for Light GBM Classifier model, used for titanic classification task """ config_name = os_get_function_name() data_name = "titanic" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): return int(y) def pre_process_fun(y): return int(y) model_dict = {'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class ,'model_pars' : {'objective': 'binary', 'n_estimators':10, } , 'post_process_fun' : post_process_fun , 'pre_process_pars' : {'y_norm_fun' : pre_process_fun , ### Pipeline for data processing ############################## 'pipe_list': [ {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' }, # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' }, {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' }, # {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' }, # {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross'}, {'uri': 'source/prepro.py::pd_colcat_minhash', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_minhash', 'type': '' }, # {'uri': 'source/prepro.py::pd_coltext_universal_google', 'pars': {}, 'cols_family': 'coltext', 'cols_out': 'coltext_universal_google', 'type': '' }, {'uri': 'source/prepro.py::pd_col_genetic_transform', 'pars': { ## 'pars_genetic' : {} }, 'cols_family': 'colgen', 'cols_out': 'col_genetic', 'type': 'add_coly' }, {'uri': 'source/prepro.py::pd_colnum_quantile_norm', 'pars': {'colsparse' : [] }, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': '' }, ], } }, 'compute_pars': { 'metric_list': ['accuracy_score','average_precision_score'] }, 'data_pars': { 'n_sample' : n_sample, 'cols_input_type' : cols_input_type_2, ### family of columns for MODEL ######################################################### # "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns # 'coldate', # 'coltext', 'cols_model_group': [ 'colnum', ### should be optional 'colcat' 'colcat_bin', # 'colcat_bin', # 'colnum_onehot', #'colcat_minhash', # 'colcat_onehot', # 'coltext_universal_google' 'colcat_minhash', 'col_genetic', 'colnum_quantile_norm' ] ### Filter data rows ################################################################## ,'filter_pars': { 'ymax' : 2 ,'ymin' : -1 } } } ##### Filling Global parameters ############################################################ model_dict = global_pars_update(model_dict, data_name, config_name ) return model_dict
def config3(path_model_out=""): """ Contains all needed informations """ config_name = os_get_function_name() data_name = "titanic" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): return int(y) def pre_process_fun(y): return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'objective': 'binary', 'n_estimators': 5, }, 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Pipeline for data processing ############################## 'pipe_list': [ ### coly encoding { 'uri': 'source/prepro.py::pd_coly', 'pars': { 'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None }, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': { 'path_pipeline': False }, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' } #### Data Over/Under sampling, New data #,{'uri': 'source/prepro_sampler.py::pd_sample_imblearn' , # 'pars': {"model_name": 'SMOTEENN', # 'pars_resample': {'sampling_strategy' : 'auto', 'random_state':0}, # "coly": "Survived"} , # 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': 'add_coly' } # ,{'uri': 'source/prepro_sampler.py::pd_filter_rows' , 'pars': {'ymin': -9999999999.0, 'ymax': 999999999.0} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_sampler.py::pd_augmentation_sdv' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } ], } }, 'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'] }, 'data_pars': { 'n_sample': n_sample, #### columns as raw data input 'cols_input_type': cols_input_type_2, ### columns for model input ############################################################ 'cols_model_group': [ # 'colnum', 'colcat_bin', ], #### Separate Category Sparse from Continuous (DLearning input) 'cols_model_type': { 'continuous': [ 'colnum', ], 'discreate': [ 'colcat_bin', ] } ### Filter data rows ################################################################### , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ######################################################### model_dict = global_pars_update(model_dict, data_name, config_name) return model_dict
def config4(path_model_out=""): """ """ config_name = os_get_function_name() data_name = "titanic" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): return int(y) def pre_process_fun(y): return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'objective': 'binary', 'n_estimators': 5, }, 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Pipeline for data processing ############################## 'pipe_list': [ ### coly encoding { 'uri': 'source/prepro.py::pd_coly', 'pars': { 'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None }, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': { 'path_pipeline': False }, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' } #### Time Series #,{'uri': 'source/prepro_tseries.py::pd_ts_autoregressive' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_basic' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_date' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_detrend' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_generic' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_groupby' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_identity' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_lag' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_onehot' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_rolling' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #,{'uri': 'source/prepro_tseries.py::pd_ts_template' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } ], } }, 'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'] }, 'data_pars': { 'n_sample': n_sample, #### columns as raw data input 'cols_input_type': cols_input_type_2, ### columns for model input ######################################################### 'cols_model_group': [ # 'colnum', 'colcat_bin', ], #### Separate Category Sparse from Continuous (DLearning input) 'cols_model_type': { 'continuous': [ 'colnum', ], 'discreate': ['colcat_bin'] } ### Filter data rows ################################################################### , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ######################################################### model_dict = global_pars_update(model_dict, data_name, config_name) return model_dict
def config1(path_model_out=""): """ Contains all needed informations """ config_name = os_get_function_name() data_name = "titanic" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): return int(y) def pre_process_fun(y): return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'objective': 'binary', 'n_estimators': 3, }, 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Pipeline for data processing ############################## 'pipe_list': [ ### Filter rows #,{'uri': 'source/prepro.py::pd_filter_rows' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } ### coly processing { 'uri': 'source/prepro.py::pd_coly', 'pars': { 'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None }, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_coly_clean', 'pars': { 'y_norm_fun': None }, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' } ### colnum : continuous , { 'uri': 'source/prepro.py::pd_colnum_quantile_norm', 'pars': { 'colsparse': [] }, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': { 'path_pipeline': False }, 'cols_family': 'colnum', 'cols_out': 'colnum_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_bin', 'pars': { 'path_pipeline': False }, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' } ### colcat :Category , { 'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_minhash', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_minhash', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': { 'path_pipeline': False }, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' } #### Bug in NA values , { 'uri': 'source/prepro.py::pd_colcat_encoder_generic', 'pars': { 'model_name': 'HashingEncoder', 'model_pars': { 'verbose': 1, 'return_df': True } }, 'cols_family': 'colcat', 'cols_out': 'colcat_encoder2', 'type': '' } ### colcat, colnum cross-features , { 'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross' } ### New Features , { 'uri': 'source/prepro.py::pd_col_genetic_transform', ### Issue with Binary 1 or 0 : need to pass with Logistic 'pars': { 'pars_generic': { 'metric': 'spearman', 'generations': 2, 'population_size': 10, ### Higher than nb_features 'tournament_size': 10, 'stopping_criteria': 1.0, 'const_range': (-1., 1.), 'p_crossover': 0.9, 'p_subtree_mutation': 0.01, 'p_hoist_mutation': 0.01, 'p_point_mutation': 0.01, 'p_point_replace': 0.05, 'parsimony_coefficient': 0.0005, #### 0.00005 Control Complexity 'max_samples': 0.9, 'verbose': 1, 'random_state': 0, 'n_jobs': 4, #'n_components' ### 'metric': 'spearman', Control number of outtput features : n_components } }, 'cols_family': 'colgen', 'cols_out': 'col_genetic', 'type': 'add_coly' #### Need to add target coly } #### Date #,{'uri': 'source/prepro.py::pd_coldate' , 'pars': {} , 'cols_family': 'colnum' , 'cols_out': 'colnum_out' , 'type': '' } #### Example of Custom processor , { "uri": THIS_FILEPATH + "::pd_col_amyfun", "pars": {}, "cols_family": "colnum", "cols_out": "col_myfun", "type": "" }, ], } }, 
'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'] }, 'data_pars': { 'n_sample': n_sample, #### columns as raw data input 'cols_input_type': cols_input_type_2, ### columns for model input ######################################################### # "colnum", "colnum_bin", "colnum_onehot", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross', "colcross_pair_onehot" #### colcross columns 'cols_model_group': [ # 'colnum', 'colnum_bin', 'colnum_onehot', 'colnum_quantile_norm', 'colcat_bin', 'colcat_onehot', 'colcat_minhash', ], #### Separate Category Sparse from Continuous (DLearning input) 'cols_model_type': { 'continuous': [ 'colnum', ], 'discreate': [ 'colcat_bin', 'colnum_bin', ] } ### Filter data rows ################################################################### , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ######################################################### model_dict = global_pars_update(model_dict, data_name, config_name) return model_dict
def airbnb_lightgbm(path_model_out=""): """ """ data_name = "airbnb" ###in data/ model_name = 'LGBMRegressor' def post_process_fun(y): return y_norm(y, inverse=True, mode='norm') def pre_process_fun(y): return y_norm(y, inverse=False, mode='norm') ############################################################################# model_dict = { 'model_pars': { 'model_class': model_name, 'model_path': path_model_out, 'model_pars': { 'objective': 'huber', } # lightgbm one , 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': copy.deepcopy(pre_process_fun), ### Pipeline for data processing ######################## 'pipe_list': [{ 'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_coltext', 'pars': {}, 'cols_family': 'coltext', 'cols_out': 'coltext_svd', 'type': '' }, { 'uri': 'source/prepro.py::pd_coldate', 'pars': {}, 'cols_family': 'coldate', 'cols_out': 'coldate', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross' }], } }, 'compute_pars': { 'metric_list': [ 'root_mean_squared_error', 'mean_absolute_error', #### sklearm names 'explained_variance_score', 'r2_score', 'median_absolute_error' ] }, 'data_pars': { 'cols_input_type': cols_input_type_1 # "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns # 'coldate', #'coltext', 'coltext_svd' , 'cols_model_group': ['colnum', 'colcat_bin', 'coltext_svd'], 'filter_pars': { 'ymax': 100000.0, 'ymin': 0.0 } ### Filter data } } ##### Filling Global parameters ############################################################ model_dict = global_pars_update(model_dict, data_name, os_get_function_name()) return model_dict
def multi_lightgbm(): """ multiclass """ data_name = f"multiclass" ### in data/input/ model_name = 'LGBMClassifier' n_sample = 6000 def post_process_fun(y): ### After prediction is done return int(y) def pre_process_fun_multi(y): ### Before the prediction is done return int(y) model_dict = { 'model_pars': { #'model_path' : path_model_out ### LightGBM API model ######################################## 'model_class': model_name ## ACTUAL Class name for model_sklearn.py , 'model_pars': { 'objective': 'multiclass', 'num_class': 4, 'metric': 'multi_logloss', 'learning_rate': 0.03, 'boosting_type': 'gbdt', "n_estimators": 3, } ### After prediction ########################################## , 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun_multi, ### Pipeline for data processing. 'pipe_list': [{ 'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross' }], }, }, 'compute_pars': { 'metric_list': ['roc_auc_score', 'accuracy_score'], 'probability': True, ### output probability for classifier }, 'data_pars': { 'n_sample': n_sample, ### columns from raw file, based on data type, ############# 'cols_input_type': cols_input_type_1, ### family of columns for MODEL ######################################################## # "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns # 'coldate', # 'coltext', 'cols_model_group': ['colnum_bin', 'colcat_bin'], 'cols_model_type': {} ### Filter data rows ##################################### , 'filter_pars': { 'ymax': 5, 'ymin': -1 } } } ##### Filling Global parameters ############################################################# model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name()) return model_dict
def config1():
    """ ONE SINGLE DICT
        Contains all the information needed for the AutoML model.
    """
    data_name    = "titanic"     ### in data/input/
    model_class  = 'AutoML'      ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {'model_pars': {
         'model_class': model_class
        ,'model_pars' : {
            'total_time_limit' : 20,
            'algorithms'       : 'auto',
            'results_path'     : root_repo + f'/data/output/{data_name}/{os_get_function_name()}/automl_1',
            'eval_metric'      : 'auto'

            # mode='Explain',
            # ml_task='auto', model_time_limit=None, algorithms='auto', train_ensemble=True,
            # stack_models='auto', eval_metric='auto', validation_strategy='auto', explain_level='auto',
            # golden_features='auto', features_selection='auto', start_random_models='auto',
            # hill_climbing_steps='auto', top_models_to_improve='auto', verbose=1, random_state=1234)
        }

        , 'post_process_fun' : post_process_fun   ### After prediction  ##########################
        , 'pre_process_pars' : {'y_norm_fun' : pre_process_fun,   ### Before training  ############

        ### Pipeline for data processing ##############################
        'pipe_list': [
            #### coly target processing
            {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },

            {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''     },
            {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''     },

            #### catcol INTO integer, colcat into OneHot
            {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcat_to_onehot',  'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''     },

            ### Cross_feat = feat1 X feat2
            # {'uri': 'source/prepro.py::pd_colcross',          'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair', 'type': 'cross'},
        ],
        }
        },

    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score']
                     ,'mlflow_pars' : None   # {}  ### Not empty --> use mlflow
                    },

    'data_pars': { 'n_sample' : n_sample,
        'download_pars'   : None,
        'cols_input_type' : cols_input_type_1,

        ### family of columns for MODEL  #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns
        #  'coldate', 'coltext',
        'cols_model_group': [ 'colnum_bin',
                              'colcat_bin',
                              # 'coltext',
                              # 'coldate',
                              # 'colcross_pair',
                            ],

        'cols_model_type' : {
            'cols_cross_input': [ "colcat", ],
            'cols_deep_input' : [ 'colnum', ],
        }

        ### Filter data rows   ##################################################################
        ,'filter_pars': { 'ymax' : 2 ,'ymin' : -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
def titanic1(path_model_out=""):
    """ One big dict """
    config_name  = os_get_function_name()
    data_name    = "titanic"         ### in data/input/
    model_class  = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 500

    ### Referenced by model_dict below (same pattern as the other titanic configs)
    def post_process_fun(y):
        return int(y)

    def pre_process_fun(y):
        return int(y)

    model_dict = {'model_pars': {
         'model_class': model_class,
         'model_pars' : {'objective': 'binary', 'n_estimators': 10, },
         'post_process_fun': post_process_fun,
         'pre_process_pars': {'y_norm_fun': pre_process_fun,

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                  'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },
            {'uri': 'source/prepro.py::pd_colnum_bin',            'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_bin',            'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair_onehot', 'type': 'cross'},
        ],
        }
        },

    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score']
                    },

    'data_pars': { 'n_sample': n_sample,
        'cols_input_type' : cols_input_type_2,
        'cols_model_group': [ 'colnum',        ### should be optional 'colcat'
                              'colcat_bin',
                            ],

        'cols_model_type' : {
            'continuous' : [ 'colnum', ],
            'sparse'     : [ 'colcat_bin', 'colnum_bin', ],
        }

        ### Filter data rows   ##################################################################
        ,'filter_pars': { 'ymax': 2, 'ymin': -1 }
        }
    }

    ##### Filling Global parameters   ###########################################################
    model_dict = global_pars_update(model_dict, data_name, config_name)
    return model_dict
def adfraud_lightgbm(path_model_out=""):
    """ """
    config_name  = os_get_function_name()
    data_name    = "adfraud"         ### in data/input/
    model_class  = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 5000000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {'model_pars': {
        ### LightGBM API model   #######################################
        'info': """
            Use large max_bin (may be slower)
            Use small learning_rate with large num_iterations
            Use large num_leaves (may cause over-fitting)
            Use bigger training data
            Try dart

            Deal with Over-fitting:
            Use small max_bin
            Use small num_leaves
            Use min_data_in_leaf and min_sum_hessian_in_leaf
            Use bagging by set bagging_fraction and bagging_freq
            Use feature sub-sampling by set feature_fraction
            Use bigger training data
            Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
            Try max_depth to avoid growing deep tree
            Try extra_trees
            Try increasing path_smooth
        """,
        'model_class': model_class,
        'model_pars': {
            'objective'        : 'binary',
            'boosting_type'    : 'gbdt',    # "seed": 1,  'boosting_type': 'dart',
            'metric'           : 'auc,average_precision',
            # 'scale_pos_weight' : 99,
            'is_unbalance'     : True,
            'learning_rate'    : 0.001,
            'num_leaves'       : 31,     # should be smaller than 2^(max_depth)
            'max_depth'        : -1,     # -1 means no limit
            'min_child_samples': 20,     # Minimum number of rows needed in a child (min_data_in_leaf)
            'max_bin'          : 255,    # Number of bucketed bins for feature values
            'subsample'        : 0.6,    # Subsample ratio of the training instances
            'subsample_freq'   : 0,      # frequency of subsample, <=0 means disabled
            'colsample_bytree' : 0.3,    # Subsample ratio of columns when constructing each tree
            'min_child_weight' : 5,      # Minimum sum of instance weight (hessian) needed in a child (leaf)
            'subsample_for_bin': 2000,   # Number of samples for constructing bins
            'min_split_gain'   : 0,      # lambda_l1, lambda_l2 and min_gain_to_split for regularization
            'reg_alpha'        : 0,      # L1 regularization term on weights
            'reg_lambda'       : 0,      # L2 regularization term on weights
            # 'nthread'        : -1,
            'verbose'          : 0,
        },

        'post_process_fun': post_process_fun   ### After prediction  ##############################
        , 'pre_process_pars': {'y_norm_fun': pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                  'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },
            # {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_bin',            'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair', 'type': 'cross'},

            #### Example of Custom processor
            # {'uri': 'titanic_classifier.py::pd_colnum_quantile_norm', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': '' },
        ],
        }
        },

    #### Sklearn
    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score',
                                      'f1_score', 'recall_score']
                    },

    'data_pars': { 'n_sample': n_sample,
        'cols_input_type': cols_input_type_1,

        ### family of columns for MODEL  #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns
        #  'coldate',
        'cols_model_group': [ # 'colnum_bin',
                              'colnum',
                              'colcat_bin',
                              # 'coltext',
                              # 'coldate',
                              # 'colcross_pair',

                              ### example of custom
                              # 'colnum_quantile_norm'
                            ]

        ### Filter data rows   ##################################################################
        , 'filter_pars': { 'ymax': 2, 'ymin': -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name)
    return model_dict
def online_lightgbm():
    """ Contains all the information needed for the LightGBM Classifier model,
        used for the online_shopping classification task.
    """
    data_name    = "online_shopping"   ### in data/input/
    model_class  = 'LGBMClassifier'    ### ACTUAL Class name for model_sklearn.py
    n_sample     = 3816

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {'model_pars': {
        ### LightGBM API model   #######################################
        'model_class': model_class
        , 'model_pars': {'objective': 'binary',
                         'n_estimators': 10,
                         'learning_rate': 0.001,
                         'boosting_type': 'gbdt',      ### Model hyperparameters
                         'early_stopping_rounds': 5
                        }

        , 'post_process_fun': post_process_fun   ### After prediction  ###########################
        , 'pre_process_pars': {'y_norm_fun': pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        'pipe_list': [
            #### coly target processing
            {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly'},

            {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''},
            {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},

            #### catcol INTO integer, colcat into OneHot
            {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''},
            {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},

            ### Cross_feat = feat1 X feat2
            {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair', 'type': 'cross'},
        ],
        }
        },

    'compute_pars': {'metric_list': ['accuracy_score', 'average_precision_score']
                    },

    'data_pars': {'n_sample': n_sample,
        'cols_input_type': cols_input_type_1,

        ### family of columns for MODEL  #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns
        #  'coldate', 'coltext',
        'cols_model_group': ['colnum_bin',
                             'colcat_bin',
                             # 'coltext',
                             # 'coldate',
                             'colcross_pair',

                             ### example of custom
                             # 'col_myfun'
                            ]

        ### Filter data rows   ##################################################################
        , 'filter_pars': {'ymax': 2, 'ymin': -1}
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
def config1():
    """ ONE SINGLE DICT
        Contains all the information needed for the tseries_demand regression task.
    """
    data_name    = "tseries_demand"   ### in data/input/
    model_class  = "LGBMRegressor"    ### ACTUAL Class name for model_sklearn.py
    n_sample     = 100000

    def post_process_fun(y):   ### After prediction is done
        # ynew = np.exp(y) - 1.0
        ynew = float(y)
        return ynew

    def pre_process_fun(y):    ### Before the prediction is done
        # ynew = np.log(y+1)
        ynew = float(y)
        return ynew

    model_dict = {"model_pars": {
        ### LightGBM API model   #######################################
        "model_class": model_class,
        "model_pars": {"objective": "huber",        ### Regression Type Loss
                       "n_estimators": 100,
                       "learning_rate": 0.001,
                       "boosting_type": "gbdt",     ### Model hyperparameters
                       "early_stopping_rounds": 5
                      },

        "post_process_fun": post_process_fun    ### After prediction  ############################
        , "pre_process_pars": {"y_norm_fun": pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        "pipe_list": [
            #### Example of Custom processor
            {"uri": THIS_FILEPATH + "::pd_dsa2_custom",
             "pars"        : {'coldate': 'date'},
             "cols_family" : "col_tseries",
             "cols_out"    : "tseries_feat",
             "type"        : ""
            },
        ],
        }
        },

    "compute_pars": { "metric_list": ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    },

    "data_pars": { "n_sample": n_sample,
        "download_pars": None,

        ### Raw data:  column input  ##############################################################
        "cols_input_type": cols_input_type_1,

        ### Model Input :  Merge family of columns  ###############################################
        "cols_model_group": [ "tseries_feat"    ### cols_out of pd_dsa2_custom
                            ]

        #### Model Input : Separate Category Sparse from Continuous : Arbitrary name is OK (!)
        , 'cols_model_type': {
            'My123_continuous': [ 'tseries_feat', ],
            'my_sparse'       : [ 'colcat', ],
        }

        ### Filter data rows   ###################################################################
        , "filter_pars": { "ymax": 999999999, "ymin": -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
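####################################################################################
# The config above wires THIS_FILEPATH::pd_dsa2_custom into pipe_list, but the
# processor itself is not shown in this section.  The sketch below is a hedged
# placeholder (suffixed _sketch so it cannot shadow the real one): it assumes the
# usual dsa2 processor contract (df, col, pars) -> (df_new, col_pars) and only
# derives a couple of date features; the real pd_dsa2_custom may build very
# different time-series features.
import pandas as pd   # assumed available

def pd_dsa2_custom_sketch(df=None, col=None, pars=None):
    """ Hypothetical sketch of the custom time-series processor referenced above. """
    pars    = pars or {}
    coldate = pars.get('coldate', 'date')
    dt      = pd.to_datetime(df[coldate])

    dfnew                 = pd.DataFrame(index=df.index)
    dfnew['ts_month']     = dt.dt.month
    dfnew['ts_dayofweek'] = dt.dt.dayofweek

    ### Register the generated columns under the 'tseries_feat' family (assumed key name)
    col_pars = {'cols_new': {'tseries_feat': list(dfnew.columns)}}
    return dfnew, col_pars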
def income_status_lightgbm(path_model_out=""): """ """ data_name = "income_status" ### in data/input/ model_class = 'LGBMClassifier' # 'LGBMClassifier_optuna' ACTUAL Class name for model_sklearn.py n_sample = 32500 # 32560 def post_process_fun(y): ### After prediction is done return int(y) def pre_process_fun(y): ### Before the prediction is done return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.001, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0, 'n_estimators': 5000, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0 }, 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Pipeline for data processing ############################## 'pipe_list': [ #{'uri': 'data/input/income/manual_preprocessing.py::pd_income_processor', 'pars': {}, 'cols_family': 'colall', 'cols_out': 'colall', # 'type': 'filter' }, { 'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' }, ### Cross Features { 'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair', 'type': 'cross' }, ### Quantile normalization { 'uri': 'source/prepro.py::pd_colnum_quantile_norm', 'pars': { 'colsparse': [] }, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': '' }, ], } }, 'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'], 'optuna_params': { "early_stopping_rounds": 5, 'verbose_eval': 100, # folds=KFold(n_splits=3) }, 'optuna_engine': 'LightGBMTuner' ### LightGBMTuner', LightGBMTunerCV }, 'data_pars': { 'n_sample': n_sample, 'cols_input_type': cols_input_type_1, ### family of columns for MODEL ######################################################## # "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns # 'coldate', 'coltext', 'cols_model_group': [ # 'colnum_bin', 'colcat_bin', 'colnum_quantile_norm', # 'coltext', # 'coldate', 'colcross_pair', ] ### Filter data rows ################################################################## , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ############################################################ model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name()) return model_dict
def titanic_lightoptuna():
    """ Contains all the information needed for the LightGBM Classifier model (Optuna-tuned),
        used for the titanic classification task.
    """
    config_name  = os_get_function_name()
    data_name    = "titanic"           ### in data/input/
    model_class  = 'LGBMModel_optuna'  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {'model_pars': {
        ### LightGBM API model   #######################################
        'model_file' : 'optuna_lightgbm.py',   ### Optional one
        'model_class': model_class,
        'model_pars' : {'objective': 'binary',
                        'n_estimators': 50,
                        'learning_rate': 0.001,
                        'boosting_type': 'gbdt',      ### Model hyperparameters
                        'early_stopping_rounds': 5
                       }

        ### After prediction  ##########################################
        , 'post_process_fun': post_process_fun

        ### Before training  ###########################################
        , 'pre_process_pars': {'y_norm_fun': pre_process_fun,

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                  'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',          'type': 'coly' },
            {'uri': 'source/prepro.py::pd_colnum_bin',            'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''     },
            {'uri': 'source/prepro.py::pd_colcat_bin',            'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',    'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''     },
            # {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair_onehot', 'type': 'cross'},
        ],
        }
        },

    # Reference: class optuna.integration.lightgbm.LightGBMTuner(
    #     params: Dict[str, Any], train_set: lgb.Dataset, num_boost_round: int = 1000,
    #     valid_sets: Optional[VALID_SET_TYPE] = None, valid_names: Optional[Any] = None,
    #     fobj: Optional[Callable[[...], Any]] = None, feval: Optional[Callable[[...], Any]] = None,
    #     feature_name: str = 'auto', categorical_feature: str = 'auto',
    #     early_stopping_rounds: Optional[int] = None, evals_result: Optional[Dict[Any, Any]] = None,
    #     verbose_eval: Union[bool, int, None] = True, learning_rates: Optional[List[float]] = None,
    #     keep_training_booster: bool = False, callbacks: Optional[List[Callable[[...], Any]]] = None,
    #     time_budget: Optional[int] = None, sample_size: Optional[int] = None,
    #     study: Optional[optuna.study.Study] = None,
    #     optuna_callbacks: Optional[List[Callable[[optuna.study.Study, optuna.trial._frozen.FrozenTrial], None]]] = None,
    #     model_dir: Optional[str] = None, verbosity: Optional[int] = None, show_progress_bar: bool = True)

    'compute_pars': { 'metric_list'  : ['accuracy_score', 'average_precision_score'],
                      'optuna_params': { "early_stopping_rounds": 5,
                                         'verbose_eval': 100,
                                         # folds=KFold(n_splits=3)
                                       },
                      'optuna_engine': 'LightGBMTuner'   ### 'LightGBMTuner', 'LightGBMTunerCV'
                    },

    'data_pars': { 'n_sample': n_sample,
        'cols_input_type': cols_input_type_1,

        ### family of columns for MODEL  #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns
        #  'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns
        #  'coldate',
        #  'coltext',
        'cols_model_group': [ 'colnum_bin',
                              'colcat_bin',
                              # 'coltext',
                              # 'coldate',
                              # 'colcross_pair'
                            ]

        ### Filter data rows   ##################################################################
        , 'filter_pars': { 'ymax': 2, 'ymin': -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name)
    return model_dict
def config2(path_model_out=""):
    """ Contains all the information needed. """
    config_name  = os_get_function_name()
    data_name    = "titanic"         ### in data/input/
    model_class  = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def post_process_fun(y):
        return int(y)

    def pre_process_fun(y):
        return int(y)

    model_dict = {'model_pars': {
        ### LightGBM API model   #######################################
        'model_class': model_class,
        'model_pars' : {'objective': 'binary', 'n_estimators': 5, },
        'post_process_fun': post_process_fun,
        'pre_process_pars': {'y_norm_fun': pre_process_fun,

        ### Pipeline for data processing ##############################
        'pipe_list': [
            ### coly encoding
            {'uri': 'source/prepro.py::pd_coly',       'pars': {'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None},
             'cols_family': 'coly',   'cols_out': 'coly',       'type': 'coly' },

            {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {'path_pipeline': False},
             'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''     }

            #### Text
            , {"uri": "source/prepro_text.py::pd_coltext",                  "pars": {'dimpca': 1, "word_minfreq": 2},
               "cols_family": "coltext", "cols_out": "col_text", "type": "" }

            , {"uri": "source/prepro_text.py::pd_coltext_universal_google", "pars": {'model_uri': "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"},
               "cols_family": "coltext", "cols_out": "col_text", "type": "" }
        ],
        }
        },

    'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score']
                    },

    'data_pars': { 'n_sample': n_sample,
        #### columns as raw data input
        'cols_input_type': cols_input_type_2,

        ### columns for model input  #########################################################
        'cols_model_group': [ # 'colnum',
                              'colcat_bin',
                            ],

        #### Separate Category Sparse from Continuous (DLearning input)
        'cols_model_type': {
            'continuous': [ 'colnum', ],
            'discreate' : [ 'colcat_bin', ]
        }

        ### Filter data rows   ###################################################################
        , 'filter_pars': { 'ymax': 2, 'ymin': -1 }
        }
    }

    ##### Filling Global parameters   #########################################################
    model_dict = global_pars_update(model_dict, data_name, config_name)
    return model_dict
def adfraud_lightgbm(path_model_out=""): """ """ config_name = os_get_function_name() data_name = "adfraud" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): ### After prediction is done return int(y) def pre_process_fun(y): ### Before the prediction is done return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'objective': 'binary', 'n_estimators': 10, 'learning_rate': 0.001, 'boosting_type': 'gbdt', ### Model hyperparameters 'early_stopping_rounds': 5 }, 'post_process_fun': post_process_fun ### After prediction ########################################## , 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Before training ########################## ### Pipeline for data processing ############################## 'pipe_list': [ { 'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' }, { 'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': '' }, # {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair', 'type': 'cross'}, #### Example of Custom processor # {'uri': 'titanic_classifier.py::pd_colnum_quantile_norm', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': '' }, ], } }, 'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'] }, 'data_pars': { 'n_sample': n_sample, 'cols_input_type': cols_input_type_1, ### family of columns for MODEL ######################################################### # "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns # "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns # 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns # 'coldate', 'cols_model_group': [ 'colnum_bin', 'colcat_bin', # 'coltext', # 'coldate', # 'colcross_pair', ### example of custom # 'colnum_quantile_norm' ] ### Filter data rows ################################################################## , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ############################################################ model_dict = global_pars_update(model_dict, data_name, config_name) return model_dict
def config9(path_model_out=""): """ python example/test_features.py train --nsample 500 --config config1 """ config_name = os_get_function_name() data_name = "titanic" ### in data/input/ model_class = 'LGBMClassifier' ### ACTUAL Class name for model_sklearn.py n_sample = 1000 def post_process_fun(y): return int(y) def pre_process_fun(y): return int(y) model_dict = { 'model_pars': { ### LightGBM API model ####################################### 'model_class': model_class, 'model_pars': { 'objective': 'binary', 'n_estimators': 3, }, 'post_process_fun': post_process_fun, 'pre_process_pars': { 'y_norm_fun': pre_process_fun, ### Pipeline for data processing ############################## 'pipe_list': [ ### coly processing { 'uri': 'source/prepro.py::pd_coly', 'pars': { 'y_norm_fun': None }, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly' }, { 'uri': 'source/prepro.py::pd_colcat_bin', 'pars': { 'path_pipeline': False }, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': '' } #### Bug in NA values , { 'uri': 'source/prepro.py::pd_colcat_encoder_generic', 'pars': { 'model_name': 'HashingEncoder', 'model_pars': { 'verbose': 1, 'return_df': True } }, 'cols_family': 'colcat', 'cols_out': 'colcat_encoder2', 'type': '' } #### Example of Custom processor , { "uri": THIS_FILEPATH + "::pd_col_amyfun", "pars": {}, "cols_family": "colnum", "cols_out": "col_myfun", "type": "" }, ], } }, 'compute_pars': { 'metric_list': ['accuracy_score', 'average_precision_score'] }, 'data_pars': { 'n_sample': n_sample, 'cols_input_type': cols_input_type_2, 'cols_model_group': ['colnum', 'colcat_bin', 'col_myfun'], #### Separate Category Sparse from Continuous (DLearning input) 'cols_model_type': { 'continuous': [ 'colnum', ], 'discreate': [ 'colcat_bin', ] } ### Filter data rows ################################################################### , 'filter_pars': { 'ymax': 2, 'ymin': -1 } } } ##### Filling Global parameters ######################################################### model_dict = global_pars_update(model_dict, data_name, config_name) return model_dict
def config1():
    """ ONE SINGLE DICT
        Contains all the information needed for the titanic classification task.
    """
    data_name    = "titanic"         ### in data/input/
    model_class  = "LGBMClassifier"  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {"model_pars": {
        ### LightGBM API model   #######################################
        "model_class": model_class,
        "model_pars" : {"objective": "binary",
                        "n_estimators": 10,
                        "learning_rate": 0.001,
                        "boosting_type": "gbdt",     ### Model hyperparameters
                        "early_stopping_rounds": 5
                       },

        "post_process_fun": post_process_fun    ### After prediction  ############################
        , "pre_process_pars": {"y_norm_fun": pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        "pipe_list": [
            #### coly target processing
            {"uri": "source/prepro.py::pd_coly",                "pars": {}, "cols_family": "coly",       "cols_out": "coly",          "type": "coly" },

            {"uri": "source/prepro.py::pd_colnum_bin",          "pars": {}, "cols_family": "colnum",     "cols_out": "colnum_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""     },

            #### catcol INTO integer, colcat into OneHot
            {"uri": "source/prepro.py::pd_colcat_bin",          "pars": {}, "cols_family": "colcat",     "cols_out": "colcat_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colcat_to_onehot",    "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""     },

            ### Cross_feat = feat1 X feat2
            {"uri": "source/prepro.py::pd_colcross",            "pars": {}, "cols_family": "colcross",   "cols_out": "colcross_pair", "type": "cross"},

            #### Example of Custom processor
            {"uri": THIS_FILEPATH + "::pd_col_myfun",           "pars": {}, "cols_family": "colnum",     "cols_out": "col_myfun",     "type": ""     },
        ],
        }
        },

    "compute_pars": { "metric_list": ["accuracy_score", "average_precision_score"]
                      # ,"mlflow_pars" : {}   ### Not empty --> use mlflow
                    },

    "data_pars": { "n_sample": n_sample,
        "download_pars"  : None,
        "cols_input_type": cols_input_type_1,

        ### family of columns for MODEL  #########################################################
        #  "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
        #  "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns
        #  "colcross_single_onehot_select", "colcross_pair_onehot", "colcross_pair",  #### colcross columns
        #  "coldate", "coltext",
        "cols_model_group": [ "colnum_bin",
                              "colcat_bin",
                              # "coltext",
                              # "coldate",
                              "colcross_pair",

                              ### example of custom
                              "col_myfun"
                            ]

        ### Filter data rows   ##################################################################
        , "filter_pars": { "ymax": 2, "ymin": -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
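####################################################################################
# The config above wires THIS_FILEPATH::pd_col_myfun into pipe_list; the processor
# itself is not shown in this section.  The sketch below is a hedged placeholder
# (suffixed _sketch so it cannot shadow the real one): it assumes the usual dsa2
# processor contract (df, col, pars) -> (df_new, col_pars) and only emits a trivial
# derived copy of the numeric columns.  The same shape applies to the pd_col_amyfun
# entries used by other configs in this file.
import pandas as pd   # assumed available

def pd_col_myfun_sketch(df=None, col=None, pars=None):
    """ Hypothetical sketch of a custom column processor. """
    dfnew         = df[col].fillna(0.0) * 1.0                 # placeholder transform
    dfnew.columns = [c + '_myfun' for c in dfnew.columns]     # rename generated columns

    ### Register the generated columns under the 'col_myfun' family (assumed key name)
    col_pars = {'cols_new': {'col_myfun': list(dfnew.columns)}}
    return dfnew, col_pars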
def config_sampler():
    """ ONE SINGLE DICT
        Contains all the information needed for the titanic classification task.
    """
    data_name    = "titanic"   ### in data/input/
    model_class  = "CTGAN"     ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {"model_pars": {
         "model_class": model_class
        ,"model_pars" : { }

        , "post_process_fun": post_process_fun   ### After prediction  ###########################
        , "pre_process_pars": {"y_norm_fun": pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        "pipe_list": [
            #### coly target processing
            {"uri": "source/prepro.py::pd_coly",                "pars": {}, "cols_family": "coly",       "cols_out": "coly",          "type": "coly" },

            {"uri": "source/prepro.py::pd_colnum_bin",          "pars": {}, "cols_family": "colnum",     "cols_out": "colnum_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""     },

            #### catcol INTO integer, colcat into OneHot
            {"uri": "source/prepro.py::pd_colcat_bin",          "pars": {}, "cols_family": "colcat",     "cols_out": "colcat_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colcat_to_onehot",    "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""     },
        ],
        }
        },

    "compute_pars": { "metric_list": ["accuracy_score", "average_precision_score"]
                      # ,"mlflow_pars" : {}   ### Not empty --> use mlflow
                    },

    "data_pars": { "n_sample": n_sample,
        "download_pars": None,

        ### Filter data rows   ##################################################################
        "filter_pars": { "ymax": 2, "ymin": -1 },

        ### Raw data:  column input  ##############################################################
        "cols_input_type": cols_input_type_1,

        ### Model Input :  Merge family of columns  ###############################################
        "cols_model_group": [ "colnum_bin", "colcat_bin", ]

        #### Model Input : Separate Category Sparse from Continuous : Arbitrary name is OK (!)
        ,'cols_model_type': {
            'continuous'  : [ 'colnum', ],
            'sparse'      : [ 'colcat_bin', 'colnum_bin', ],
            'my_split_23' : [ 'colnum_bin', ],
        }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
def config1():
    """ ONE SINGLE DICT
        Contains all the information needed for the titanic classification task.
    """
    data_name    = "titanic"                                  ### in data/input/
    model_class  = "source/models/keras_widedeep_dense.py"   ### ACTUAL Class name for
    n_sample     = 1000

    def post_process_fun(y):   ### After prediction is done
        return int(y)

    def pre_process_fun(y):    ### Before the prediction is done
        return int(y)

    model_dict = {"model_pars": {
        ### LightGBM API model   #######################################
         "model_class": model_class
        ,"model_pars" : { }

        , "post_process_fun": post_process_fun   ### After prediction  ###########################
        , "pre_process_pars": {"y_norm_fun": pre_process_fun,   ### Before training  ##############

        ### Pipeline for data processing ##############################
        "pipe_list": [
            {"uri": "source/prepro.py::pd_coly",                "pars": {}, "cols_family": "coly",       "cols_out": "coly",          "type": "coly" },
            {"uri": "source/prepro.py::pd_colnum_bin",          "pars": {}, "cols_family": "colnum",     "cols_out": "colnum_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""     },
            {"uri": "source/prepro.py::pd_colcat_bin",          "pars": {}, "cols_family": "colcat",     "cols_out": "colcat_bin",    "type": ""     },
            {"uri": "source/prepro.py::pd_colcat_to_onehot",    "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""     },

            #### need to 0-1 normalize the input
            # {"uri": "source/prepro.py::pd_colcat_bin",        "pars": {}, "cols_family": "colcat",     "cols_out": "colcat_bin",    "type": ""     },
        ],
        }},

    "compute_pars": { "metric_list"     : ["accuracy_score", "average_precision_score"],
                      'compute_pars'    : {'epochs': 1 },
                      'path_checkpoint' : "ztmp_checkpoint/"
                    },

    "data_pars": { "n_sample": n_sample,
        "download_pars": None,

        ### family of columns for raw input data  ################################################
        "cols_input_type": cols_input_type_1,

        ### family of columns used for model input  ##############################################
        "cols_model_group": [ "colnum_onehot", "colcat_onehot", ]

        ,'cols_model_type': {
            'cols_cross_input': [ "colcat_onehot", ],
            'cols_deep_input' : [ 'colnum_onehot', ],
        }

        ### Filter data rows   ##################################################################
        ,"filter_pars": { "ymax": 2, "ymin": -1 }
        }
    }

    ##### Filling Global parameters   ############################################################
    model_dict = global_pars_update(model_dict, data_name, config_name=os_get_function_name())
    return model_dict
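###################################################################################
########## Run ####################################################################
# Hedged sketch of how these config functions are typically invoked: the CLI form
# "python example/test_features.py train --nsample 500 --config config1" comes from
# the config9 docstring above; the fire-based entry point below is an assumption,
# not confirmed by this section.
if __name__ == "__main__":
    import fire
    fire.Fire()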