Example #1
0
def salary_lightgbm(path_model_out="") :
    """
        Config for an 'LGBMRegressor' (objective 'huber') on the "salary" data.

        Original author's notes : Huber loss includes L1 regularization ;
        different feature combinations were tested, default params were optimal.

        path_model_out : stored as-is under model_pars['model_path'].
        Returns        : whatever global_pars_update(...) returns for this config.
    """
    dataset       = "salary"
    estimator     = 'LGBMRegressor'
    n_sample      = 10**5     ### NOTE(review): assigned but not referenced below

    def pre_process_fun(y):
        ### Target transform : y_norm with mode='boxcox', inverse=False
        return y_norm(y, inverse=False, mode='boxcox')

    def post_process_fun(y):
        ### Same y_norm call with inverse=True
        return y_norm(y, inverse=True, mode='boxcox')

    ### Pipeline for data processing : (uri, cols_family, cols_out, type) ##########
    pipe_steps = (
        ('source/prepro.py::pd_coly',                'coly',       'coly',                 'coly' ),
        ('source/prepro.py::pd_colnum_bin',          'colnum',     'colnum_bin',           ''     ),
        ('source/prepro.py::pd_colnum_binto_onehot', 'colnum_bin', 'colnum_onehot',        ''     ),
        ('source/prepro.py::pd_colcat_bin',          'colcat',     'colcat_bin',           ''     ),
        ('source/prepro.py::pd_colcat_to_onehot',    'colcat_bin', 'colcat_onehot',        ''     ),
        ('source/prepro.py::pd_colcross',            'colcross',   'colcross_pair_onehot', 'cross'),
    )
    ### Each entry gets its own empty 'pars' dict
    pipe_list = [ {'uri': uri, 'pars': {}, 'cols_family': family, 'cols_out': out, 'type': step_type}
                  for uri, family, out, step_type in pipe_steps ]

    model_section = {
        'model_class'      : estimator,
        'model_path'       : path_model_out,
        'model_pars'       : {'objective': 'huber'},   # everything else left at defaults
        'post_process_fun' : copy.deepcopy(post_process_fun),
        'pre_process_pars' : {'y_norm_fun': copy.deepcopy(pre_process_fun),
                              'pipe_list' : pipe_list},
    }

    compute_section = {
        'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                        'explained_variance_score', 'r2_score', 'median_absolute_error'],
    }

    data_section = {
        'cols_input_type'  : cols_input_type_1,
        ### Column families fed to the model ( colcross_onehot left out by the author )
        'cols_model_group' : ['colnum', 'colcat_bin'],
        ### Filter data
        'filter_pars'      : {'ymax': 100000.0, 'ymin': 0.0},
    }

    model_dict = {'model_pars'   : model_section,
                  'compute_pars' : compute_section,
                  'data_pars'    : data_section}

    ##### Filling Global parameters    #############################################################
    return global_pars_update(model_dict, dataset, os_get_function_name())
Example #2
0
def house_price_lightgbm(path_model_out="") :
    """
        Config for an 'LGBMRegressor' with default model params on 'house_price'.

        Original author's notes : Huber loss includes L1 regularization ;
        different feature combinations were tested, default params were optimal.

        path_model_out : stored as-is under model_pars['model_path'].
        Returns        : whatever global_pars_update(...) returns for this config.
    """
    dataset           = 'house_price'
    regressor_name    = 'LGBMRegressor'
    n_sample          = 20000     ### NOTE(review): assigned but not referenced below

    def pre_process_fun(y):
        ### Target transform : y_norm with mode='norm', inverse=False
        return y_norm(y, inverse=False, mode='norm')

    def post_process_fun(y):
        ### Same y_norm call with inverse=True
        return y_norm(y, inverse=True, mode='norm')

    ##### Model section ( keys inserted in the original order ) ####################################
    model_section = {}
    model_section['model_path']       = path_model_out
    model_section['model_class']      = regressor_name   ### Actual Class Name
    model_section['model_pars']       = {}                ### default ones of the model name
    model_section['post_process_fun'] = post_process_fun
    model_section['pre_process_pars'] = {
        'y_norm_fun' : copy.deepcopy(pre_process_fun),
        ### Pipeline for data processing.
        ### Full list kept by the author for reference :
        ###   'filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot'
        'pipe_list'  : ['filter', 'label', 'dfcat_bin'],
    }

    ##### Compute section ##########################################################################
    compute_section = {}
    compute_section['metric_list'] = ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']

    ##### Data section #############################################################################
    data_section = {}
    data_section['cols_input_type']  = cols_input_type_1
    ### Alternative kept by the author : [ 'colnum_onehot', 'colcat_onehot', 'colcross_onehot' ]
    data_section['cols_model_group'] = ['colnum', 'colcat_bin']
    data_section['filter_pars']      = {'ymax': 1000000.0, 'ymin': 0.0}   ### Filter data

    model_dict = {'model_pars'   : model_section,
                  'compute_pars' : compute_section,
                  'data_pars'    : data_section}

    ##### Filling Global parameters    #############################################################
    return global_pars_update(model_dict, dataset, config_name=os_get_function_name())
Example #3
0
def salary_bayesian_pyro(path_model_out="") :
    """
        Config for the 'model_bayesian_pyro' model on the "salary" data.

        path_model_out : stored as-is under model_pars['model_path'].
        Returns        : whatever global_pars_update(...) returns for this config.

        Side effect    : rebinds the module-level name `model_name`
                         ( kept from the original for backward compatibility ).
    """
    global model_name
    model_name        = 'model_bayesian_pyro'

    ### Bound locally so the function no longer depends on a module-level
    ### `data_name` being defined ( same value salary_lightgbm uses ).
    data_name         = "salary"

    def post_process_fun(y):
        ### y_norm with mode='boxcox', inverse=True
        return y_norm(y, inverse=True, mode='boxcox')

    def pre_process_fun(y):
        ### y_norm with mode='boxcox', inverse=False
        return y_norm(y, inverse=False, mode='boxcox')

    model_dict = {'model_pars': {'model_class': model_name
        , 'model_path': path_model_out
        , 'model_pars': {'input_width': 112, }  # default
        , 'post_process_fun': post_process_fun

        , 'pre_process_pars': {'y_norm_fun' :  copy.deepcopy(pre_process_fun) ,

        ### Pipeline for data processing ##############################
        'pipe_list': [
            {'uri': 'source/prepro.py::pd_coly',                 'pars': {}, 'cols_family': 'coly',       'cols_out': 'coly',           'type': 'coly'         },
            {'uri': 'source/prepro.py::pd_colnum_bin',           'pars': {}, 'cols_family': 'colnum',     'cols_out': 'colnum_bin',     'type': ''             },
            {'uri': 'source/prepro.py::pd_colnum_binto_onehot',  'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot',  'type': ''             },
            {'uri': 'source/prepro.py::pd_colcat_bin',           'pars': {}, 'cols_family': 'colcat',     'cols_out': 'colcat_bin',     'type': ''             },
            {'uri': 'source/prepro.py::pd_colcat_to_onehot',     'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot',  'type': ''             },
            {'uri': 'source/prepro.py::pd_colcross',             'pars': {}, 'cols_family': 'colcross',   'cols_out': 'colcross_pair_onehot',  'type': 'cross'}
        ],
               }
        },

    'compute_pars': {'compute_pars': {'n_iter': 1200, 'learning_rate': 0.01}
                                 , 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                                    'explained_variance_score', 'r2_score', 'median_absolute_error']
                                 , 'max_size': 1000000
                                 , 'num_samples': 300
     },
    'data_pars':  {
            'cols_input_type' : cols_input_type_1

            ### Only the one-hot families are fed to the model
            ,'cols_model_group': [ 'colnum_onehot', 'colcat_onehot' ]

           ,'filter_pars': { 'ymax' : 100000.0 ,'ymin' : 0.0 }   ### Filter data
                            }}

    ##### Filling Global parameters    ############################################################
    model_dict        = global_pars_update(model_dict, data_name, os_get_function_name() )
    return model_dict
Example #4
0
def house_price_elasticnetcv(path_model_out=""):
    """
        Config for an 'ElasticNetCV' model with default params on 'house_price'.

        path_model_out : stored as-is under model_pars['model_path'].
        Returns        : whatever global_pars_update(...) returns for this config.
    """
    ### Bound locally so the function no longer depends on a module-level
    ### `data_name` being defined ( same value house_price_lightgbm uses ).
    data_name    = 'house_price'
    model_name   = 'ElasticNetCV'
    n_sample     = 1000     ### NOTE(review): assigned but not wired into data_pars


    def post_process_fun(y):
        ### y_norm with mode='norm', inverse=True
        return y_norm(y, inverse=True, mode='norm')

    def pre_process_fun(y):
        ### y_norm with mode='norm', inverse=False
        return y_norm(y, inverse=False, mode='norm')


    model_dict = {'model_pars': {'model_class': model_name
        , 'model_path': path_model_out

        , 'model_pars': {}  # default ones
        , 'post_process_fun': post_process_fun
        , 'pre_process_pars': {'y_norm_fun' : pre_process_fun,

                        ### Pipeline for data processing.
                       # 'pipe_list'  : [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot',  'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
                       'pipe_list' : [ 'filter', 'label',   'dfcat_hot' ]
                                                     }
                                                         },
    'compute_pars': { 'metric_list': ['root_mean_squared_error', 'mean_absolute_error',
                                      'explained_variance_score', 'r2_score', 'median_absolute_error']
                    },

    'data_pars': {
        'cols_input_type' : cols_input_type_1,

        # 'cols_model_group': [ 'colnum_onehot', 'colcat_onehot', 'colcross_onehot' ]
        'cols_model_group': [ 'colnum', 'colcat_onehot' ]

         ### NOTE(review): house_price_lightgbm filters with ymax 1000000.0 -- confirm 100000.0 is intended
         ,'filter_pars': { 'ymax' : 100000.0 ,'ymin' : 0.0 }   ### Filter data
    }}


    ################################################################################################
    ##### Filling Global parameters    #############################################################
    model_dict        = global_pars_update(model_dict, data_name, config_name=os_get_function_name() )
    return model_dict
Example #5
0
          ### family of columns used for model input  #########################################################
          "cols_model_group": [ "colnum",       ### numerical continuous   
                                "colcat_bin",   ###  category


                              ]

          ### Filter data rows   ##################################################################
         ,"filter_pars": { "ymax" : 2 ,"ymin" : -1 }

         }
      }

    ##### Filling Global parameters    ############################################################
    model_dict        = global_pars_update(model_dict, data_name, config_name=os_get_function_name() )
    return model_dict




###################################################################################
########## Preprocess #############################################################
### def preprocess(config="", nsample=1000):
from core_run import preprocess

"""
def preprocess(config=None, nsample=None):
    config_name  = config  if config is not None else config_default
    mdict        = globals()[config_name]()
    m            = mdict["global_pars"]
Example #6
0
def titanic1(path_model_out="") :
    """
       Config for a LightGBM classifier ('LGBMClassifier', objective 'binary')
       on the "titanic" data.

       path_model_out : accepted but not used in this function body.
       Returns        : whatever global_pars_update(...) returns for this config.
    """
    config_name  = os_get_function_name()
    dataset      = "titanic"         ### in data/input/
    classifier   = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample     = 1000

    def pre_process_fun(y):
        return  int(y)

    def post_process_fun(y):
        return  int(y)

    ### Pipeline for data processing ##############################
    ### Steps the author left disabled ( all under source/prepro.py ) :
    ###   pd_colnum_binto_onehot, pd_colcat_to_onehot, pd_colcross, pd_coltext_universal_google
    step_coly = {'uri': 'source/prepro.py::pd_coly', 'pars': {},
                 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'}

    step_colnum_bin = {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {},
                       'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''}

    step_colcat_bin = {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {},
                       'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''}

    step_colcat_minhash = {'uri': 'source/prepro.py::pd_colcat_minhash', 'pars': {},
                           'cols_family': 'colcat', 'cols_out': 'colcat_minhash', 'type': ''}

    ### 'pars' left empty here ( the author had a commented 'pars_genetic' placeholder )
    step_genetic = {'uri': 'source/prepro.py::pd_col_genetic_transform', 'pars': {},
                    'cols_family': 'colgen', 'cols_out': 'col_genetic', 'type': 'add_coly'}

    step_quantile = {'uri': 'source/prepro.py::pd_colnum_quantile_norm', 'pars': {'colsparse': []},
                     'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': ''}

    pipe_list = [step_coly, step_colnum_bin, step_colcat_bin,
                 step_colcat_minhash, step_genetic, step_quantile]

    ### LightGBM API model   #######################################
    model_section = {
        'model_class'      : classifier,
        'model_pars'       : {'objective': 'binary', 'n_estimators': 10},
        'post_process_fun' : post_process_fun,
        'pre_process_pars' : {'y_norm_fun': pre_process_fun, 'pipe_list': pipe_list},
    }

    compute_section = {'metric_list': ['accuracy_score', 'average_precision_score']}

    ### family of columns for MODEL  ###############################
    ### Families listed by the author as available :
    ###   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    ###   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    ###   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    ###   'coldate', 'coltext'
    model_families = ['colnum',     ### should be optional 'colcat'
                      'colcat_bin',
                      'colcat_minhash',
                      'col_genetic',
                      'colnum_quantile_norm']

    data_section = {
        'n_sample'         : n_sample,
        'cols_input_type'  : cols_input_type_2,
        'cols_model_group' : model_families,
        ### Filter data rows
        'filter_pars'      : {'ymax': 2, 'ymin': -1},
    }

    model_dict = {'model_pars'   : model_section,
                  'compute_pars' : compute_section,
                  'data_pars'    : data_section}

    ##### Filling Global parameters    ############################################################
    return global_pars_update(model_dict, dataset, config_name)
Example #7
0
def config3(path_model_out=""):
    """
    Config for an 'LGBMClassifier' (objective 'binary', 5 estimators) on "titanic",
    using only the coly and colcat_bin pipeline steps.

    path_model_out : accepted but not used in this function body.
    Returns        : whatever global_pars_update(...) returns for this config.
    """
    config_name = os_get_function_name()
    dataset = "titanic"  ### in data/input/
    classifier = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample = 1000

    def pre_process_fun(y):
        return int(y)

    def post_process_fun(y):
        return int(y)

    ### Pipeline for data processing ##############################
    ###  coly encoding
    coly_step = {
        'uri': 'source/prepro.py::pd_coly',
        'pars': {'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None},
        'cols_family': 'coly',
        'cols_out': 'coly',
        'type': 'coly',
    }
    colcat_bin_step = {
        'uri': 'source/prepro.py::pd_colcat_bin',
        'pars': {'path_pipeline': False},
        'cols_family': 'colcat',
        'cols_out': 'colcat_bin',
        'type': '',
    }
    #### Data Over/Under sampling steps the author left disabled
    #### ( all under source/prepro_sampler.py ) :
    ####   pd_sample_imblearn ( SMOTEENN, coly "Survived" ), pd_filter_rows, pd_augmentation_sdv
    pipe_list = [coly_step, colcat_bin_step]

    ### LightGBM API model   #######################################
    model_section = {
        'model_class': classifier,
        'model_pars': {'objective': 'binary', 'n_estimators': 5},
        'post_process_fun': post_process_fun,
        'pre_process_pars': {'y_norm_fun': pre_process_fun, 'pipe_list': pipe_list},
    }

    compute_section = {'metric_list': ['accuracy_score', 'average_precision_score']}

    data_section = {
        'n_sample': n_sample,
        #### columns as raw data input
        'cols_input_type': cols_input_type_2,
        ### columns for model input ( 'colnum' left disabled by the author )
        'cols_model_group': ['colcat_bin'],
        #### Separate Category Sparse from Continuous (DLearning input)
        #### ( the 'discreate' key spelling is a runtime key and is kept as-is )
        'cols_model_type': {'continuous': ['colnum'], 'discreate': ['colcat_bin']},
        ### Filter data rows
        'filter_pars': {'ymax': 2, 'ymin': -1},
    }

    model_dict = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    #########################################################
    return global_pars_update(model_dict, dataset, config_name)
Example #8
0
def config4(path_model_out=""):
    """
    Config for an 'LGBMClassifier' (objective 'binary', 5 estimators) on "titanic".

    Only the coly and colcat_bin steps are active; the time-series steps the
    author listed are left disabled.

    path_model_out : accepted but not used in this function body.
    Returns        : whatever global_pars_update(...) returns for this config.
    """
    config_name = os_get_function_name()
    titanic_data = "titanic"  ### in data/input/
    lgbm_class = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample = 1000

    def post_process_fun(y):
        return int(y)

    def pre_process_fun(y):
        return int(y)

    model_dict = {}

    ### LightGBM API model   #######################################
    model_dict['model_pars'] = {
        'model_class': lgbm_class,
        'model_pars': {'objective': 'binary', 'n_estimators': 5},
        'post_process_fun': post_process_fun,
        'pre_process_pars': {
            'y_norm_fun': pre_process_fun,
            ### Pipeline for data processing ##############################
            'pipe_list': [
                ###  coly encoding
                {'uri': 'source/prepro.py::pd_coly',
                 'pars': {'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None},
                 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},

                {'uri': 'source/prepro.py::pd_colcat_bin',
                 'pars': {'path_pipeline': False},
                 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},

                #### Time Series steps left disabled by the author
                #### ( all under source/prepro_tseries.py ) :
                ####   pd_ts_autoregressive, pd_ts_basic, pd_ts_date, pd_ts_detrend,
                ####   pd_ts_generic, pd_ts_groupby, pd_ts_identity, pd_ts_lag,
                ####   pd_ts_onehot, pd_ts_rolling, pd_ts_template
            ],
        },
    }

    model_dict['compute_pars'] = {
        'metric_list': ['accuracy_score', 'average_precision_score'],
    }

    model_dict['data_pars'] = {
        'n_sample': n_sample,

        #### columns as raw data input
        'cols_input_type': cols_input_type_2,

        ### columns for model input ( 'colnum' left disabled by the author )
        'cols_model_group': ['colcat_bin'],

        #### Separate Category Sparse from Continuous (DLearning input)
        #### ( the 'discreate' key spelling is a runtime key and is kept as-is )
        'cols_model_type': {
            'continuous': ['colnum'],
            'discreate': ['colcat_bin'],
        },

        ### Filter data rows
        'filter_pars': {'ymax': 2, 'ymin': -1},
    }

    ##### Filling Global parameters    #########################################################
    return global_pars_update(model_dict, titanic_data, config_name)
Example #9
0
def config1(path_model_out=""):
    """
    Config for an 'LGBMClassifier' (objective 'binary', 3 estimators) on "titanic",
    declaring the full list of preprocessing steps below (coly, colnum, colcat,
    cross features, genetic features and one custom processor).

    path_model_out : accepted but not used in this function body.
    Returns        : whatever global_pars_update(...) returns for this config.
    """
    config_name = os_get_function_name()
    dataset = "titanic"  ### in data/input/
    classifier = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    n_sample = 1000

    def pre_process_fun(y):
        return int(y)

    def post_process_fun(y):
        return int(y)

    def _step(uri, pars, cols_family, cols_out, step_type=''):
        ### One pipe_list entry, keys in the order used throughout this config
        return {
            'uri': uri,
            'pars': pars,
            'cols_family': cols_family,
            'cols_out': cols_out,
            'type': step_type,
        }

    ### Settings passed under 'pars_generic' to pd_col_genetic_transform
    ### Author's note : Issue with Binary 1 or 0  : need to pass with Logistic
    ### NOTE(review): titanic1 mentions 'pars_genetic' in a comment -- confirm this key name
    genetic_settings = {
        'metric': 'spearman',
        'generations': 2,
        'population_size': 10,  ### Higher than nb_features
        'tournament_size': 10,
        'stopping_criteria': 1.0,
        'const_range': (-1., 1.),
        'p_crossover': 0.9,
        'p_subtree_mutation': 0.01,
        'p_hoist_mutation': 0.01,
        'p_point_mutation': 0.01,
        'p_point_replace': 0.05,
        'parsimony_coefficient': 0.0005,  ####   0.00005 Control Complexity
        'max_samples': 0.9,
        'verbose': 1,
        'random_state': 0,
        'n_jobs': 4,
        ### 'n_components' ( left unset by the author ) : Control number of output features
    }

    ### Pipeline for data processing ##############################
    ### Disabled by the author : source/prepro.py::pd_filter_rows , source/prepro.py::pd_coldate
    pipe_list = [
        ###  coly processing
        _step('source/prepro.py::pd_coly',
              {'ymin': -9999999999.0, 'ymax': 999999999.0, 'y_norm_fun': None},
              'coly', 'coly', 'coly'),
        _step('source/prepro.py::pd_coly_clean',
              {'y_norm_fun': None},
              'coly', 'coly', 'coly'),

        ### colnum : continuous
        _step('source/prepro.py::pd_colnum_quantile_norm',
              {'colsparse': []},
              'colnum', 'colnum_quantile_norm'),
        _step('source/prepro.py::pd_colnum_binto_onehot',
              {'path_pipeline': False},
              'colnum', 'colnum_onehot'),
        _step('source/prepro.py::pd_colnum_bin',
              {'path_pipeline': False},
              'colnum', 'colnum_bin'),

        ### colcat :Category
        _step('source/prepro.py::pd_colcat_to_onehot', {}, 'colcat', 'colcat_onehot'),
        _step('source/prepro.py::pd_colcat_minhash', {}, 'colcat', 'colcat_minhash'),
        _step('source/prepro.py::pd_colcat_bin',
              {'path_pipeline': False},
              'colcat', 'colcat_bin'),

        #### Bug in NA values
        _step('source/prepro.py::pd_colcat_encoder_generic',
              {'model_name': 'HashingEncoder',
               'model_pars': {'verbose': 1, 'return_df': True}},
              'colcat', 'colcat_encoder2'),

        ### colcat, colnum cross-features
        _step('source/prepro.py::pd_colcross', {}, 'colcross', 'colcross_pair_onehot', 'cross'),

        ### New Features  ( type 'add_coly' : Need to add target coly )
        _step('source/prepro.py::pd_col_genetic_transform',
              {'pars_generic': genetic_settings},
              'colgen', 'col_genetic', 'add_coly'),

        #### Example of Custom processor
        _step(THIS_FILEPATH + "::pd_col_amyfun", {}, "colnum", "col_myfun"),
    ]

    ### LightGBM API model   #######################################
    model_section = {
        'model_class': classifier,
        'model_pars': {'objective': 'binary', 'n_estimators': 3},
        'post_process_fun': post_process_fun,
        'pre_process_pars': {'y_norm_fun': pre_process_fun, 'pipe_list': pipe_list},
    }

    compute_section = {'metric_list': ['accuracy_score', 'average_precision_score']}

    ### columns for model input    #########################################################
    ### Families listed by the author as available :
    ###   "colnum", "colnum_bin", "colnum_onehot",
    ###   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    ###   'colcross', "colcross_pair_onehot"
    model_families = [  # 'colnum' left disabled by the author
        'colnum_bin',
        'colnum_onehot',
        'colnum_quantile_norm',
        'colcat_bin',
        'colcat_onehot',
        'colcat_minhash',
    ]

    data_section = {
        'n_sample': n_sample,
        #### columns as raw data input
        'cols_input_type': cols_input_type_2,
        'cols_model_group': model_families,
        #### Separate Category Sparse from Continuous (DLearning input)
        #### ( the 'discreate' key spelling is a runtime key and is kept as-is )
        'cols_model_type': {
            'continuous': ['colnum'],
            'discreate': ['colcat_bin', 'colnum_bin'],
        },
        ### Filter data rows
        'filter_pars': {'ymax': 2, 'ymin': -1},
    }

    model_dict = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    #########################################################
    return global_pars_update(model_dict, dataset, config_name)
Example #10
0
def airbnb_lightgbm(path_model_out=""):
    """Assemble the LightGBM regression configuration for the "airbnb" data.

    The configuration is built section by section ('model_pars',
    'compute_pars', 'data_pars') and then handed to ``global_pars_update``
    together with the dataset name and the value of
    ``os_get_function_name()``.

    Args:
        path_model_out: value stored under ``model_pars['model_path']``.

    Returns:
        The object returned by ``global_pars_update`` for the assembled dict.
    """
    dataset = "airbnb"  ### folder name in data/
    regressor = 'LGBMRegressor'

    def post_process_fun(y):
        # Delegates to y_norm with inverse=True (mode 'norm').
        return y_norm(y, inverse=True, mode='norm')

    def pre_process_fun(y):
        # Delegates to y_norm with inverse=False (mode 'norm').
        return y_norm(y, inverse=False, mode='norm')

    ### Pipeline for data processing: one entry per preprocessing step ######
    pipeline = [
        {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_coltext', 'pars': {}, 'cols_family': 'coltext', 'cols_out': 'coltext_svd', 'type': ''},
        {'uri': 'source/prepro.py::pd_coldate', 'pars': {}, 'cols_family': 'coldate', 'cols_out': 'coldate', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross'},
    ]

    ### Model definition #####################################################
    model_section = {
        'model_class': regressor,
        'model_path': path_model_out,
        'model_pars': {'objective': 'huber'},  # lightgbm hyper-parameters
        'post_process_fun': post_process_fun,
        'pre_process_pars': {
            'y_norm_fun': copy.deepcopy(pre_process_fun),
            'pipe_list': pipeline,
        },
    }

    ### Evaluation metrics (sklearn names) ##################################
    compute_section = {
        'metric_list': [
            'root_mean_squared_error',
            'mean_absolute_error',
            'explained_variance_score',
            'r2_score',
            'median_absolute_error',
        ],
    }

    ### Data description #####################################################
    # Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext', 'coltext_svd'
    data_section = {
        'cols_input_type': cols_input_type_1,
        'cols_model_group': ['colnum', 'colcat_bin', 'coltext_svd'],
        'filter_pars': {'ymax': 100000.0, 'ymin': 0.0},  ### Filter data
    }

    config = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, os_get_function_name())
Example #11
0
def multi_lightgbm():
    """Assemble the multiclass LightGBM configuration ("multiclass" data).

    Builds the 'model_pars', 'compute_pars' and 'data_pars' sections, then
    passes the resulting dict to ``global_pars_update`` with the dataset name
    and ``config_name=os_get_function_name()``.

    Returns:
        The object returned by ``global_pars_update``.
    """
    dataset = "multiclass"  ### in data/input/
    classifier = 'LGBMClassifier'  ## ACTUAL Class name for model_sklearn.py
    sample_size = 6000

    def post_process_fun(y):  ### After prediction is done
        return int(y)

    def pre_process_fun_multi(y):  ### Before the prediction is done
        return int(y)

    ### LightGBM API hyper-parameters ########################################
    lgbm_params = {
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_logloss',
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        "n_estimators": 3,
    }

    ### Pipeline for data processing #########################################
    pipeline = [
        {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross'},
    ]

    # NOTE: no 'model_path' entry is set in this configuration.
    model_section = {
        'model_class': classifier,
        'model_pars': lgbm_params,
        'post_process_fun': post_process_fun,
        'pre_process_pars': {
            'y_norm_fun': pre_process_fun_multi,
            'pipe_list': pipeline,
        },
    }

    compute_section = {
        'metric_list': ['roc_auc_score', 'accuracy_score'],
        'probability': True,  ### output probability for classifier
    }

    ### Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext'
    data_section = {
        'n_sample': sample_size,
        'cols_input_type': cols_input_type_1,  ### columns from raw file, by data type
        'cols_model_group': ['colnum_bin', 'colcat_bin'],
        'cols_model_type': {},
        'filter_pars': {'ymax': 5, 'ymin': -1},  ### Filter data rows
    }

    config = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name=os_get_function_name())
Example #12
0
def config1():
    """Assemble the 'AutoML' configuration for the "titanic" data.

    One single dict holds everything: the model class and its parameters,
    the preprocessing pipeline, the metrics and the data description. The
    dict is passed to ``global_pars_update`` with the dataset name and
    ``config_name=os_get_function_name()``.

    Returns:
        The object returned by ``global_pars_update``.
    """
    data_name = "titanic"  ### in data/input/
    automl_class = 'AutoML'  ### ACTUAL Class name for model_sklearn.py
    sample_size = 1000

    def post_process_fun(y):  ### After prediction is done
        return int(y)

    def pre_process_fun(y):  ### Before the prediction is done
        return int(y)

    ### AutoML parameters ####################################################
    # Other keyword options noted by the original author (not set here):
    #   mode='Explain', ml_task='auto', model_time_limit=None, train_ensemble=True,
    #   stack_models='auto', validation_strategy='auto', explain_level='auto',
    #   golden_features='auto', features_selection='auto', start_random_models='auto',
    #   hill_climbing_steps='auto', top_models_to_improve='auto', verbose=1, random_state=1234
    automl_params = dict(
        total_time_limit=20,
        algorithms='auto',
        results_path=root_repo + f'/data/output/{data_name}/{os_get_function_name()}/automl_1',
        eval_metric='auto',
    )

    ### Pipeline for data processing #########################################
    pipeline = [
        #### coly target processing
        dict(uri='source/prepro.py::pd_coly', pars={}, cols_family='coly', cols_out='coly', type='coly'),

        #### colnum into bins, then bins into OneHot
        dict(uri='source/prepro.py::pd_colnum_bin', pars={}, cols_family='colnum', cols_out='colnum_bin', type=''),
        dict(uri='source/prepro.py::pd_colnum_binto_onehot', pars={}, cols_family='colnum_bin', cols_out='colnum_onehot', type=''),

        #### colcat into integer
        dict(uri='source/prepro.py::pd_colcat_bin', pars={}, cols_family='colcat', cols_out='colcat_bin', type=''),

        #### Disabled steps:
        # dict(uri='source/prepro.py::pd_colcat_to_onehot', pars={}, cols_family='colcat_bin', cols_out='colcat_onehot', type=''),
        # dict(uri='source/prepro.py::pd_colcross', pars={}, cols_family='colcross', cols_out='colcross_pair', type='cross'),
    ]

    model_section = dict(
        model_class=automl_class,
        model_pars=automl_params,
        post_process_fun=post_process_fun,  ### After prediction
        pre_process_pars=dict(
            y_norm_fun=pre_process_fun,  ### Before training
            pipe_list=pipeline,
        ),
    )

    compute_section = dict(
        metric_list=['accuracy_score', 'average_precision_score'],
        mlflow_pars=None,  # {}   ### Not empty --> use mlflow
    )

    ### Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext'
    data_section = dict(
        n_sample=sample_size,
        download_pars=None,
        cols_input_type=cols_input_type_1,
        cols_model_group=['colnum_bin', 'colcat_bin'],
        cols_model_type=dict(
            cols_cross_input=["colcat"],
            cols_deep_input=['colnum'],
        ),
        filter_pars=dict(ymax=2, ymin=-1),  ### Filter data rows
    )

    config = dict(
        model_pars=model_section,
        compute_pars=compute_section,
        data_pars=data_section,
    )

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, data_name, config_name=os_get_function_name())
Example #13
0
def titanic1(path_model_out=""):
    """Assemble the LGBMClassifier configuration for the "titanic" data.

    ``post_process_fun`` and ``pre_process_fun`` are not defined inside this
    function; they are looked up from the enclosing scope when the dict is
    built.

    Args:
        path_model_out: accepted for signature compatibility; not used in
            the body.

    Returns:
        The object returned by ``global_pars_update`` for the assembled dict.
    """
    config_name = os_get_function_name()
    dataset = "titanic"  ### in data/input/
    classifier = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    sample_size = 500

    ### Model definition #####################################################
    model_section = {
        'model_class': classifier,
        'model_pars': {'objective': 'binary', 'n_estimators': 10},
        'post_process_fun': post_process_fun,
        'pre_process_pars': {
            'y_norm_fun': pre_process_fun,

            ### Pipeline for data processing
            'pipe_list': [
                {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
                {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
                {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},

                #### Disabled steps:
                # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},
                # {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},
                # {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair_onehot', 'type': 'cross'},
            ],
        },
    }

    compute_section = {
        'metric_list': ['accuracy_score', 'average_precision_score'],
    }

    ### Data description #####################################################
    data_section = {
        'n_sample': sample_size,
        'cols_input_type': cols_input_type_2,
        'cols_model_group': ['colnum', 'colcat_bin'],  ### should be optional 'colcat'
        'cols_model_type': {
            'continuous': ['colnum'],
            'sparse': ['colcat_bin', 'colnum_bin'],
        },
        'filter_pars': {'ymax': 2, 'ymin': -1},  ### Filter data rows
    }

    config = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name)
Example #14
0
def adfraud_lightgbm(path_model_out=""):
    """Assemble the LGBMClassifier configuration for the "adfraud" data.

    Besides the usual sections, 'model_pars' carries a free-text 'info'
    entry with tuning notes.

    Args:
        path_model_out: accepted for signature compatibility; not used in
            the body.

    Returns:
        The object returned by ``global_pars_update`` for the assembled dict.
    """
    config_name = os_get_function_name()
    dataset = "adfraud"  ### in data/input/
    classifier = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    sample_size = 5000000

    def post_process_fun(y):  ### After prediction is done
        return int(y)

    def pre_process_fun(y):  ### Before the prediction is done
        return int(y)

    # Free-text tuning notes, stored verbatim under model_pars['info'].
    tuning_notes = """
              Use large max_bin (may be slower)
              Use small learning_rate with large num_iterations
              Use large num_leaves (may cause over-fitting)
              Use bigger training data

            Try dart
              Deal with Over-fitting
              Use small max_bin
              Use small num_leaves
              Use min_data_in_leaf and min_sum_hessian_in_leaf
              Use bagging by set bagging_fraction and bagging_freq
              Use feature sub-sampling by set feature_fraction
              Use bigger training data
              Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
              Try max_depth to avoid growing deep tree
              Try extra_trees
              Try increasing path_smooth

         """

    ### LightGBM API hyper-parameters ########################################
    lgbm_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',  # alternatives noted: "seed": 1, 'boosting_type': 'dart'
        'metric': 'auc,average_precision',
        # 'scale_pos_weight': 99,
        'is_unbalance': True,
        'learning_rate': 0.001,
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of df need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequency of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 2000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        # 'nthread': -1,
        'verbose': 0,
    }

    ### Pipeline for data processing #########################################
    pipeline = [
        {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},

        #### Disabled steps:
        # {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
        # {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},
        # {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},
        # {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair', 'type': 'cross'},

        #### Example of Custom processor (disabled):
        # {'uri': 'titanic_classifier.py::pd_colnum_quantile_norm', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': ''},
    ]

    model_section = {
        'info': tuning_notes,
        'model_class': classifier,
        'model_pars': lgbm_params,
        'post_process_fun': post_process_fun,  ### After prediction
        'pre_process_pars': {
            'y_norm_fun': pre_process_fun,  ### Before training
            'pipe_list': pipeline,
        },
    }

    #### Sklearn metric names
    compute_section = {
        'metric_list': ['accuracy_score', 'average_precision_score', 'f1_score', 'recall_score'],
    }

    ### Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext', and custom ones such as 'colnum_quantile_norm'
    data_section = {
        'n_sample': sample_size,
        'cols_input_type': cols_input_type_1,
        'cols_model_group': ['colnum', 'colcat_bin'],
        'filter_pars': {'ymax': 2, 'ymin': -1},  ### Filter data rows
    }

    config = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name)
Example #15
0
def online_lightgbm():
    """Assemble the LGBMClassifier configuration for "online_shopping" data.

    Builds the 'model_pars', 'compute_pars' and 'data_pars' sections and
    passes the resulting dict to ``global_pars_update`` with the dataset name
    and ``config_name=os_get_function_name()``.

    Returns:
        The object returned by ``global_pars_update``.
    """
    dataset = "online_shopping"  ### in data/input/
    classifier = 'LGBMClassifier'  ### ACTUAL Class name for model_sklearn.py
    sample_size = 3816

    def post_process_fun(y):  ### After prediction is done
        return int(y)

    def pre_process_fun(y):  ### Before the prediction is done
        return int(y)

    ### LightGBM API hyper-parameters ########################################
    lgbm_params = dict(
        objective='binary',
        n_estimators=10,
        learning_rate=0.001,
        boosting_type='gbdt',
        early_stopping_rounds=5,
    )

    ### Pipeline for data processing #########################################
    pipeline = [
        #### coly target processing
        dict(uri='source/prepro.py::pd_coly', pars={}, cols_family='coly', cols_out='coly', type='coly'),

        #### colnum into bins, then bins into OneHot
        dict(uri='source/prepro.py::pd_colnum_bin', pars={}, cols_family='colnum', cols_out='colnum_bin', type=''),
        dict(uri='source/prepro.py::pd_colnum_binto_onehot', pars={}, cols_family='colnum_bin', cols_out='colnum_onehot', type=''),

        #### colcat into integer, then into OneHot
        dict(uri='source/prepro.py::pd_colcat_bin', pars={}, cols_family='colcat', cols_out='colcat_bin', type=''),
        dict(uri='source/prepro.py::pd_colcat_to_onehot', pars={}, cols_family='colcat_bin', cols_out='colcat_onehot', type=''),

        ### Cross_feat = feat1 X feat2
        dict(uri='source/prepro.py::pd_colcross', pars={}, cols_family='colcross', cols_out='colcross_pair', type='cross'),
    ]

    model_section = dict(
        model_class=classifier,
        model_pars=lgbm_params,
        post_process_fun=post_process_fun,  ### After prediction
        pre_process_pars=dict(
            y_norm_fun=pre_process_fun,  ### Before training
            pipe_list=pipeline,
        ),
    )

    compute_section = dict(
        metric_list=['accuracy_score', 'average_precision_score'],
    )

    ### Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext', and custom ones such as 'col_myfun'
    data_section = dict(
        n_sample=sample_size,
        cols_input_type=cols_input_type_1,
        cols_model_group=['colnum_bin', 'colcat_bin', 'colcross_pair'],
        filter_pars=dict(ymax=2, ymin=-1),  ### Filter data rows
    )

    config = dict(
        model_pars=model_section,
        compute_pars=compute_section,
        data_pars=data_section,
    )

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name=os_get_function_name())
Example #16
0
def config1():
    """Assemble the LGBMRegressor configuration for "tseries_demand" data.

    ONE SINGLE DICT contains everything: model class and parameters, a
    one-step preprocessing pipeline that points at ``pd_dsa2_custom`` in
    this file (via ``THIS_FILEPATH``), the metrics and the data description.
    The dict is passed to ``global_pars_update`` with the dataset name and
    ``config_name=os_get_function_name()``.

    Returns:
        The object returned by ``global_pars_update``.
    """
    dataset = "tseries_demand"  ### in data/input/
    regressor = "LGBMRegressor"  ### ACTUAL Class name for model_sklearn.py
    sample_size = 100000

    def post_process_fun(y):  ### After prediction is done
        # Alternative noted by the author: np.exp(y) - 1.0
        return float(y)

    def pre_process_fun(y):  ### Before the prediction is done
        # Alternative noted by the author: np.log(y+1)
        return float(y)

    ### LightGBM API hyper-parameters ########################################
    lgbm_params = {
        "objective": "huber",  ### Regression Type Loss
        "n_estimators": 100,
        "learning_rate": 0.001,
        "boosting_type": "gbdt",
        "early_stopping_rounds": 5,
    }

    ### Pipeline for data processing: a single custom processor ##############
    custom_step = {
        "uri": THIS_FILEPATH + "::pd_dsa2_custom",
        "pars": {'coldate': 'date'},
        "cols_family": "col_tseries",
        "cols_out": "tseries_feat",
        "type": "",
    }

    model_section = {
        "model_class": regressor,
        "model_pars": lgbm_params,
        "post_process_fun": post_process_fun,  ### After prediction
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,  ### Before training
            "pipe_list": [custom_step],
        },
    }

    compute_section = {
        "metric_list": [
            'root_mean_squared_error',
            'mean_absolute_error',
            'explained_variance_score',
            'r2_score',
            'median_absolute_error',
        ],
    }

    data_section = {
        "n_sample": sample_size,
        "download_pars": None,

        ### Raw data: column input
        "cols_input_type": cols_input_type_1,

        ### Model input: merge families of columns (cols_out of pd_dsa2_custom)
        "cols_model_group": ["tseries_feat"],

        #### Model input: separate category sparse from continuous;
        #### per the original note, arbitrary group names are OK
        'cols_model_type': {
            'My123_continuous': ['tseries_feat'],
            'my_sparse': ['colcat'],
        },

        ### Filter data rows
        "filter_pars": {"ymax": 999999999, "ymin": -1},
    }

    config = {
        "model_pars": model_section,
        "compute_pars": compute_section,
        "data_pars": data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name=os_get_function_name())
Example #17
0
def income_status_lightgbm(path_model_out=""):
    """Assemble the LGBMClassifier configuration for "income_status" data.

    In addition to the metric list, 'compute_pars' carries 'optuna_params'
    and 'optuna_engine' entries.

    Args:
        path_model_out: accepted for signature compatibility; not used in
            the body.

    Returns:
        The object returned by ``global_pars_update`` for the assembled dict.
    """
    dataset = "income_status"  ### in data/input/
    classifier = 'LGBMClassifier'  # alternative noted: 'LGBMClassifier_optuna'
    sample_size = 32500  # 32560

    def post_process_fun(y):  ### After prediction is done
        return int(y)

    def pre_process_fun(y):  ### Before the prediction is done
        return int(y)

    ### LightGBM API hyper-parameters ########################################
    lgbm_params = {
        'boosting_type': 'gbdt',
        'class_weight': None,
        'colsample_bytree': 1.0,
        'importance_type': 'split',
        'learning_rate': 0.001,
        'max_depth': -1,
        'min_child_samples': 20,
        'min_child_weight': 0.001,
        'min_split_gain': 0,
        'n_estimators': 5000,
        'n_jobs': -1,
        'num_leaves': 31,
        'objective': None,
        'random_state': None,
        'reg_alpha': 0,
        'reg_lambda': 0.0,
        'silent': True,
        'subsample': 1.0,
        'subsample_for_bin': 200000,
        'subsample_freq': 0,
    }

    ### Pipeline for data processing #########################################
    pipeline = [
        #### Disabled step:
        # {'uri': 'data/input/income/manual_preprocessing.py::pd_income_processor', 'pars': {}, 'cols_family': 'colall', 'cols_out': 'colall', 'type': 'filter'},
        {'uri': 'source/prepro.py::pd_coly', 'pars': {}, 'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {}, 'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {}, 'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},

        ### Cross Features
        {'uri': 'source/prepro.py::pd_colcat_to_onehot', 'pars': {}, 'cols_family': 'colcat_bin', 'cols_out': 'colcat_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'cols_out': 'colnum_onehot', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross', 'pars': {}, 'cols_family': 'colcross', 'cols_out': 'colcross_pair', 'type': 'cross'},

        ### Quantile normalization
        {'uri': 'source/prepro.py::pd_colnum_quantile_norm', 'pars': {'colsparse': []}, 'cols_family': 'colnum', 'cols_out': 'colnum_quantile_norm', 'type': ''},
    ]

    model_section = {
        'model_class': classifier,
        'model_pars': lgbm_params,
        'post_process_fun': post_process_fun,
        'pre_process_pars': {
            'y_norm_fun': pre_process_fun,
            'pipe_list': pipeline,
        },
    }

    compute_section = {
        'metric_list': ['accuracy_score', 'average_precision_score'],
        'optuna_params': {
            "early_stopping_rounds": 5,
            'verbose_eval': 100,
            #  folds=KFold(n_splits=3)
        },
        'optuna_engine': 'LightGBMTuner',  ### 'LightGBMTuner', 'LightGBMTunerCV'
    }

    ### Column families that may be listed in 'cols_model_group':
    #   "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",
    #   "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",
    #   'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair',
    #   'coldate', 'coltext'
    data_section = {
        'n_sample': sample_size,
        'cols_input_type': cols_input_type_1,
        'cols_model_group': ['colcat_bin', 'colnum_quantile_norm', 'colcross_pair'],
        'filter_pars': {'ymax': 2, 'ymin': -1},  ### Filter data rows
    }

    config = {
        'model_pars': model_section,
        'compute_pars': compute_section,
        'data_pars': data_section,
    }

    ##### Filling Global parameters    ######################################
    return global_pars_update(config, dataset, config_name=os_get_function_name())
Example #18
0
def titanic_lightoptuna():
    """
    Build the model dictionary for the Optuna-tuned LightGBM model
    ('LGBMModel_optuna') on the titanic dataset.

    The dictionary has three sections ('model_pars', 'compute_pars',
    'data_pars'). It is handed to global_pars_update() together with the
    data name and the configuration name, and that result is returned.
    """
    ### Configuration name, as supplied by os_get_function_name()
    config_name = os_get_function_name()
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under 'post_process_fun' (after prediction): cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under 'y_norm_fun' (before training): cast to int
        return int(y)

    ### Hyper-parameters forwarded to the model class
    lgbm_pars = {
        "objective": "binary",
        "n_estimators": 50,
        "learning_rate": 0.001,
        "boosting_type": "gbdt",
        "early_stopping_rounds": 5,
    }

    ### Pipeline for data processing: target, binned numericals, binned categoricals.
    ### The one-hot and cross-feature steps are intentionally not part of this config.
    pipe_list = [
        {"uri": "source/prepro.py::pd_coly", "pars": {}, "cols_family": "coly", "cols_out": "coly", "type": "coly"},
        {"uri": "source/prepro.py::pd_colnum_bin", "pars": {}, "cols_family": "colnum", "cols_out": "colnum_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_bin", "pars": {}, "cols_family": "colcat", "cols_out": "colcat_bin", "type": ""},
    ]

    model_pars = {
        "model_file": "optuna_lightgbm.py",  ### optional
        "model_class": "LGBMModel_optuna",   ### class name used to resolve the model
        "model_pars": lgbm_pars,
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": pipe_list,
        },
    }

    ### Metrics plus the options stored for the Optuna LightGBM tuner
    compute_pars = {
        "metric_list": ["accuracy_score", "average_precision_score"],
        "optuna_params": {
            "early_stopping_rounds": 5,
            "verbose_eval": 100,
        },
        "optuna_engine": "LightGBMTuner",  ### other option: LightGBMTunerCV
    }

    data_pars = {
        "n_sample": 1000,
        "cols_input_type": cols_input_type_1,
        ### Column families merged into the model input
        "cols_model_group": ["colnum_bin", "colcat_bin"],
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name)
Example #19
0
def config2(path_model_out=""):
    """
    Build the model dictionary for an LGBMClassifier on the titanic dataset,
    with categorical binning and two text-processing pipeline steps.

    path_model_out is accepted for signature compatibility; it is not used
    inside this function.

    Returns the result of global_pars_update(model_dict, data_name, config_name).
    """
    ### Configuration name, as supplied by os_get_function_name()
    config_name = os_get_function_name()
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under 'post_process_fun': cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under 'y_norm_fun': cast to int
        return int(y)

    ### Target (coly) encoding step
    step_coly = {
        "uri": "source/prepro.py::pd_coly",
        "pars": {"ymin": -9999999999.0, "ymax": 999999999.0, "y_norm_fun": None},
        "cols_family": "coly",
        "cols_out": "coly",
        "type": "coly",
    }

    ### Categorical columns into integer bins
    step_colcat_bin = {
        "uri": "source/prepro.py::pd_colcat_bin",
        "pars": {"path_pipeline": False},
        "cols_family": "colcat",
        "cols_out": "colcat_bin",
        "type": "",
    }

    ### Text steps: both read 'coltext' and both declare 'col_text' as output
    step_coltext = {
        "uri": "source/prepro_text.py::pd_coltext",
        "pars": {"dimpca": 1, "word_minfreq": 2},
        "cols_family": "coltext",
        "cols_out": "col_text",
        "type": "",
    }
    step_coltext_google = {
        "uri": "source/prepro_text.py::pd_coltext_universal_google",
        "pars": {"model_uri": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"},
        "cols_family": "coltext",
        "cols_out": "col_text",
        "type": "",
    }

    model_pars = {
        "model_class": "LGBMClassifier",
        "model_pars": {"objective": "binary", "n_estimators": 5},
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": [step_coly, step_colcat_bin, step_coltext, step_coltext_google],
        },
    }

    compute_pars = {"metric_list": ["accuracy_score", "average_precision_score"]}

    data_pars = {
        "n_sample": 1000,
        ### Columns as raw data input
        "cols_input_type": cols_input_type_2,
        ### Column families used as model input
        "cols_model_group": ["colcat_bin"],
        ### Separate Category Sparse from Continuous (DLearning input);
        ### the key spelling 'discreate' is kept exactly as in the original config.
        "cols_model_type": {
            "continuous": ["colnum"],
            "discreate": ["colcat_bin"],
        },
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name)
Example #20
0
def adfraud_lightgbm(path_model_out=""):
    """
    Build the model dictionary for an LGBMClassifier on the adfraud dataset.

    path_model_out is accepted for signature compatibility; it is not used
    inside this function.

    Returns the result of global_pars_update(model_dict, data_name, config_name).
    """
    ### Configuration name, as supplied by os_get_function_name()
    config_name = os_get_function_name()
    data_name = "adfraud"  ### in data/input/

    def post_process_fun(y):
        ### Registered under 'post_process_fun' (after prediction): cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under 'y_norm_fun' (before training): cast to int
        return int(y)

    ### Hyper-parameters forwarded to the model class
    lgbm_pars = {
        "objective": "binary",
        "n_estimators": 10,
        "learning_rate": 0.001,
        "boosting_type": "gbdt",
        "early_stopping_rounds": 5,
    }

    ### Pipeline for data processing: target, then numerical and categorical
    ### columns, each binned and one-hot encoded. No cross-feature or custom step.
    pipe_list = [
        {"uri": "source/prepro.py::pd_coly", "pars": {}, "cols_family": "coly", "cols_out": "coly", "type": "coly"},
        {"uri": "source/prepro.py::pd_colnum_bin", "pars": {}, "cols_family": "colnum", "cols_out": "colnum_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_bin", "pars": {}, "cols_family": "colcat", "cols_out": "colcat_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_to_onehot", "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""},
    ]

    model_pars = {
        "model_class": "LGBMClassifier",
        "model_pars": lgbm_pars,
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": pipe_list,
        },
    }

    compute_pars = {"metric_list": ["accuracy_score", "average_precision_score"]}

    data_pars = {
        "n_sample": 1000,
        "cols_input_type": cols_input_type_1,
        ### Column families merged into the model input
        "cols_model_group": ["colnum_bin", "colcat_bin"],
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name)
Example #21
0
def config9(path_model_out=""):
    """
    Build the model dictionary for an LGBMClassifier on the titanic dataset,
    using a generic categorical encoder step and a custom processor step.

    Example command from the original docstring:
       python  example/test_features.py  train       --nsample 500 --config config1
    NOTE(review): that command names config1, not config9 -- confirm which
    config name should be passed.

    path_model_out is accepted for signature compatibility; it is not used
    inside this function.

    Returns the result of global_pars_update(model_dict, data_name, config_name).
    """
    ### Configuration name, as supplied by os_get_function_name()
    config_name = os_get_function_name()
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under 'post_process_fun': cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under 'y_norm_fun': cast to int
        return int(y)

    ### Target (coly) processing step
    step_coly = {
        "uri": "source/prepro.py::pd_coly",
        "pars": {"y_norm_fun": None},
        "cols_family": "coly",
        "cols_out": "coly",
        "type": "coly",
    }

    ### Categorical columns into integer bins
    step_colcat_bin = {
        "uri": "source/prepro.py::pd_colcat_bin",
        "pars": {"path_pipeline": False},
        "cols_family": "colcat",
        "cols_out": "colcat_bin",
        "type": "",
    }

    ### Generic categorical encoder (original note: "Bug in NA values")
    step_colcat_encoder = {
        "uri": "source/prepro.py::pd_colcat_encoder_generic",
        "pars": {
            "model_name": "HashingEncoder",
            "model_pars": {"verbose": 1, "return_df": True},
        },
        "cols_family": "colcat",
        "cols_out": "colcat_encoder2",
        "type": "",
    }

    ### Example of Custom processor, addressed relative to THIS_FILEPATH
    step_custom = {
        "uri": THIS_FILEPATH + "::pd_col_amyfun",
        "pars": {},
        "cols_family": "colnum",
        "cols_out": "col_myfun",
        "type": "",
    }

    model_pars = {
        "model_class": "LGBMClassifier",
        "model_pars": {"objective": "binary", "n_estimators": 3},
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": [step_coly, step_colcat_bin, step_colcat_encoder, step_custom],
        },
    }

    compute_pars = {"metric_list": ["accuracy_score", "average_precision_score"]}

    data_pars = {
        "n_sample": 1000,
        "cols_input_type": cols_input_type_2,
        "cols_model_group": ["colnum", "colcat_bin", "col_myfun"],
        ### Separate Category Sparse from Continuous (DLearning input);
        ### the key spelling 'discreate' is kept exactly as in the original config.
        "cols_model_type": {
            "continuous": ["colnum"],
            "discreate": ["colcat_bin"],
        },
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name)
Example #22
0
def config1():
    """
    Build the single model dictionary for an LGBMClassifier on the titanic
    classification task.

    The dictionary has three sections ("model_pars", "compute_pars",
    "data_pars"). It is handed to global_pars_update() together with the
    data name and the configuration name, and that result is returned.
    """
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under "post_process_fun" (after prediction): cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under "y_norm_fun" (before training): cast to int
        return int(y)

    ### Hyper-parameters forwarded to the model class
    lgbm_pars = {
        "objective": "binary",
        "n_estimators": 10,
        "learning_rate": 0.001,
        "boosting_type": "gbdt",
        "early_stopping_rounds": 5,
    }

    ### Pipeline for data processing
    pipe_list = [
        #### coly target processing
        {"uri": "source/prepro.py::pd_coly", "pars": {}, "cols_family": "coly", "cols_out": "coly", "type": "coly"},

        #### colnum into bins, bins into OneHot
        {"uri": "source/prepro.py::pd_colnum_bin", "pars": {}, "cols_family": "colnum", "cols_out": "colnum_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""},

        #### colcat into integer, colcat into OneHot
        {"uri": "source/prepro.py::pd_colcat_bin", "pars": {}, "cols_family": "colcat", "cols_out": "colcat_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_to_onehot", "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""},

        #### Cross_feat = feat1 X feat2
        {"uri": "source/prepro.py::pd_colcross", "pars": {}, "cols_family": "colcross", "cols_out": "colcross_pair", "type": "cross"},

        #### Example of Custom processor, addressed relative to THIS_FILEPATH
        {"uri": THIS_FILEPATH + "::pd_col_myfun", "pars": {}, "cols_family": "colnum", "cols_out": "col_myfun", "type": ""},
    ]

    model_pars = {
        "model_class": "LGBMClassifier",
        "model_pars": lgbm_pars,
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": pipe_list,
        },
    }

    ### An "mlflow_pars" entry could be added here; the original config leaves it out
    compute_pars = {"metric_list": ["accuracy_score", "average_precision_score"]}

    data_pars = {
        "n_sample": 1000,
        "download_pars": None,
        "cols_input_type": cols_input_type_1,
        ### Column families merged into the model input (last one is the custom family)
        "cols_model_group": ["colnum_bin", "colcat_bin", "colcross_pair", "col_myfun"],
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name=os_get_function_name())
Example #23
0
def config_sampler():
    """
    Build the single model dictionary for the "CTGAN" model class on the
    titanic dataset.

    Returns the result of global_pars_update(model_dict, data_name, config_name=...).
    """
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under "post_process_fun" (after prediction): cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under "y_norm_fun" (before training): cast to int
        return int(y)

    ### Pipeline for data processing
    pipe_list = [
        #### coly target processing
        {"uri": "source/prepro.py::pd_coly", "pars": {}, "cols_family": "coly", "cols_out": "coly", "type": "coly"},

        #### colnum into bins, bins into OneHot
        {"uri": "source/prepro.py::pd_colnum_bin", "pars": {}, "cols_family": "colnum", "cols_out": "colnum_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""},

        #### colcat into integer, colcat into OneHot
        {"uri": "source/prepro.py::pd_colcat_bin", "pars": {}, "cols_family": "colcat", "cols_out": "colcat_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_to_onehot", "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""},
    ]

    model_pars = {
        "model_class": "CTGAN",
        "model_pars": {},
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": pipe_list,
        },
    }

    ### An "mlflow_pars" entry could be added here; the original config leaves it out
    compute_pars = {"metric_list": ["accuracy_score", "average_precision_score"]}

    data_pars = {
        "n_sample": 1000,
        "download_pars": None,
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
        ### Raw data: column input
        "cols_input_type": cols_input_type_1,
        ### Model Input: merge family of columns
        "cols_model_group": ["colnum_bin", "colcat_bin"],
        ### Model Input: separate Category Sparse from Continuous; arbitrary names are OK
        "cols_model_type": {
            "continuous": ["colnum"],
            "sparse": ["colcat_bin", "colnum_bin"],
            "my_split_23": ["colnum_bin"],
        },
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name=os_get_function_name())
Example #24
0
def config1():
    """
    Build the single model dictionary for the model referenced by
    "source/models/keras_widedeep_dense.py" on the titanic classification task.

    Returns the result of global_pars_update(model_dict, data_name, config_name=...).
    """
    data_name = "titanic"  ### in data/input/

    def post_process_fun(y):
        ### Registered under "post_process_fun" (after prediction): cast to int
        return int(y)

    def pre_process_fun(y):
        ### Registered under "y_norm_fun" (before training): cast to int
        return int(y)

    ### Pipeline for data processing.
    ### Original note: the input still needs a 0-1 normalization step.
    pipe_list = [
        {"uri": "source/prepro.py::pd_coly", "pars": {}, "cols_family": "coly", "cols_out": "coly", "type": "coly"},

        {"uri": "source/prepro.py::pd_colnum_bin", "pars": {}, "cols_family": "colnum", "cols_out": "colnum_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colnum_binto_onehot", "pars": {}, "cols_family": "colnum_bin", "cols_out": "colnum_onehot", "type": ""},

        {"uri": "source/prepro.py::pd_colcat_bin", "pars": {}, "cols_family": "colcat", "cols_out": "colcat_bin", "type": ""},
        {"uri": "source/prepro.py::pd_colcat_to_onehot", "pars": {}, "cols_family": "colcat_bin", "cols_out": "colcat_onehot", "type": ""},
    ]

    model_pars = {
        ### Here "model_class" holds the path of the model file
        "model_class": "source/models/keras_widedeep_dense.py",
        "model_pars": {},
        "post_process_fun": post_process_fun,
        "pre_process_pars": {
            "y_norm_fun": pre_process_fun,
            "pipe_list": pipe_list,
        },
    }

    compute_pars = {
        "metric_list": ["accuracy_score", "average_precision_score"],
        "compute_pars": {"epochs": 1},
        "path_checkpoint": "ztmp_checkpoint/",
    }

    data_pars = {
        "n_sample": 1000,
        "download_pars": None,
        ### Family of columns for raw input data
        "cols_input_type": cols_input_type_1,
        ### Family of columns used for model input
        "cols_model_group": ["colnum_onehot", "colcat_onehot"],
        ### Column families assigned to the cross input and the deep input
        "cols_model_type": {
            "cols_cross_input": ["colcat_onehot"],
            "cols_deep_input": ["colnum_onehot"],
        },
        ### Filter data rows
        "filter_pars": {"ymax": 2, "ymin": -1},
    }

    model_dict = {
        "model_pars": model_pars,
        "compute_pars": compute_pars,
        "data_pars": data_pars,
    }

    ##### Filling Global parameters
    return global_pars_update(model_dict, data_name, config_name=os_get_function_name())