Beispiel #1
0
def pd_colcat_bin(df, col=None, pars=None):
    """Integer-encode the categorical column(s) ``col`` of ``df``.

    :param df:   dataframe containing the column(s) named by ``col``.
    :param col:  a single column name or a list of column names.
    :param pars: optional dict of settings. Keys read here:
                 'path_pipeline'        folder holding a previously saved
                                        ``colcat_bin_map.pkl`` to re-use;
                 'path_features_store'  when present, the encoded features are
                                        saved there and the mapping / column
                                        list are written under
                                        ``pars['path_pipeline_export']``.
    :return: (dfcat_bin, col_pars) where ``col_pars`` holds the category
             mapping ('colcat_bin_map') and the column names ('cols_new').
    """
    # The signature defaults pars to None, so normalise it before any lookup.
    pars = {} if pars is None else pars

    path_pipeline = pars.get('path_pipeline', False)
    # Re-use an existing mapping only when a pipeline folder was supplied.
    colcat_bin_map = None
    if path_pipeline:
        colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl')

    colcat = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(
        df[colcat], colname=colcat, colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)

    ##### Colcat processing   ################################################################
    # The mapping is computed for logging only; it is not part of the result.
    log(df[colcat].dtypes, util_feature.pd_colcat_mapping(df, colcat))

    if 'path_features_store' in pars:
        # NOTE(review): 'path_pipeline_export' is required whenever
        # 'path_features_store' is given; a missing key raises KeyError.
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map,
             pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin, pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {
        'colcat_bin_map': colcat_bin_map,
        'cols_new': {
            # NOTE(review): returned exactly as passed in, so this is a str
            # when the caller passed a single column name.
            'colcat': col,
            'colcat_bin': colcat_bin,  ### list
        },
    }

    return dfcat_bin, col_pars
Beispiel #2
0
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
               preprocess_pars=None, filter_pars=None, path_features_store=None):
    """Load the training data, run the feature pipeline and export its metadata.

    :param path_train_X: forwarded to ``load_dataset``.
    :param path_train_y: forwarded to ``load_dataset``.
    :param path_pipeline_export: folder (created if missing) receiving the
        pickled column lists / mappings.
    :param cols_group: dict of column names. Required keys: 'coly', 'colid',
        'colcat', 'colnum'. Optional keys: 'colcross', 'coltext', 'coldate'.
    :param n_sample: forwarded to ``load_dataset``.
    :param preprocess_pars: optional dict. Keys read: 'pipe_list' (names of the
        steps to run; 'dfdate' is always added) and 'y_norm_fun' (callable
        applied to every target value).
    :param filter_pars: optional dict. Keys read: 'ymin', 'ymax' (exclusive
        bounds applied to the target column).
    :param path_features_store: forwarded to every ``save_features`` call.
    :return: (dfXy, cols_family) - the merged dataframe (raw numeric and
        categorical columns, target, generated features) and a dict of the
        column groups that were produced.
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    # Fresh dicts per call instead of shared mutable defaults.
    preprocess_pars = {} if preprocess_pars is None else preprocess_pars
    filter_pars = {} if filter_pars is None else filter_pars

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly            = cols_group['coly']  # 'salary'
    colid           = cols_group['colid']  # "jobId"
    colcat          = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum          = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default    = [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot',  'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
    # Work on a copy so the caller's list is not extended on every call.
    pipe_list       = list(preprocess_pars.get('pipe_list', pipe_default))
    if 'dfdate' not in pipe_list:
        pipe_list.append('dfdate')

    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample= n_sample)

    ##### Filtering / cleaning rows :   #########################################################
    if "filter" in pipe_list :
        def isfloat(x):
            """Return 1 when ``x`` can be converted to float, else 0."""
            try :
                float(x)
            except (TypeError, ValueError, OverflowError):
                return 0
            return 1

        ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0)
        df['_isfloat'] = df[ coly ].apply(isfloat)
        df = df[ df['_isfloat'] > 0 ]
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]


    ##### Label processing   ####################################################################
    y_norm_fun = None
    if "label" in pipe_list :
        # Target coly processing, Normalization process  , customize by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )
            save_features(df[coly], 'dfy', path_features_store)


    ########### colnum processing   ############################################################
    for x in colnum:
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)


    if "dfnum" in pipe_list :
        pass   # raw numeric columns need no extra processing


    if "dfnum_norm" in pipe_list :
        log("### colnum normalize  ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = { 'pipe_list': [ {'name': 'fillna', 'naval' : 0.0 }, {'name': 'minmax'} ]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum,  pars=pars, suffix = "_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)


    if "dfnum_bin" in pipe_list :
        log("### colnum Map numerics to Category bin  ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None,
                                                   bins=10, suffix="_bin", method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Renaming colnum_bin with suffix
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_binmap', path_features_store)


    if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list  :
        log("### colnum bin to One Hot")
        dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                    colonehot=None, return_val="dataframe,param")
        log(colnum_onehot)
        save_features(dfnum_hot, 'dfnum_onehot', path_features_store)


    ##### Colcat processing   ################################################################
    colcat_map = pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if "dfcat_hot" in pipe_list :
        log("#### colcat to onehot")
        dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat,
                                                    colonehot=None, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        save_features(dfcat_hot, 'dfcat_onehot', path_features_store)


    if "dfcat_bin" in pipe_list :
        log("#### Colcat to integer encoding ")
        dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat,
                                                    colcat_map=None, suffix="_int")
        colcat_bin = list(dfcat_bin.columns)
        save_features(dfcat_bin, 'dfcat_bin', path_features_store)

    if "dfcross_hot" in pipe_list :
        log("#####  Cross Features From OneHot Features   ######################################")
        try :
            df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
        except Exception:
            # Best-effort fallback: the join fails (e.g. NameError) when the
            # numeric one-hot step did not run, so use the categorical one-hot only.
            df_onehot = copy.deepcopy(dfcat_hot)

        # One entry per (one-hot column, cross column) match, so a one-hot
        # column matching several cross columns is listed several times.
        colcross_single_onehot_select = []
        for t in list(df_onehot) :
            for c1 in colcross_single :
                if c1 in t :
                    colcross_single_onehot_select.append(t)

        df_onehot = df_onehot[colcross_single_onehot_select ]
        dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                               pct_threshold=0.02,  m_combination=2)
        log(dfcross_hot.head(2).T)
        colcross_pair_onehot = list(dfcross_hot.columns)
        save_features(dfcross_hot, 'dfcross_onehot', path_features_store)
        # NOTE(review): deleting colcross_pair_onehot means the metadata loop
        # below never saves it although it is listed there - confirm intent.
        del df_onehot ,colcross_pair_onehot


    if "dftext" in pipe_list :
        log("##### Coltext processing   ###############################################################")
        stopwords = nlp_get_stopwords()
        pars      = {'n_token' : 100 , 'stopwords': stopwords}
        dftext    = None

        for coltext_i in coltext :
            ##### Run the text processor on each column text  #############################
            dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars )
            dftext   = pd.concat((dftext, dftext_i), axis=1)  if dftext is not None else dftext_i
            save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)

        # dftext stays None when there is no text column; nothing to log or save.
        if dftext is not None :
            log(dftext.head(6))
            save_features(dftext, 'dftext', path_features_store)


    if "dfdate" in pipe_list :
        log("##### Coldate processing   #############################################################")
        from utils import util_date
        dfdate = None
        for coldate_i in coldate :
            dfdate_i =  util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" )
            dfdate  = pd.concat((dfdate, dfdate_i), axis=1)  if dfdate is not None else dfdate_i
            save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)

        # dfdate stays None when there is no date column; nothing to save.
        if dfdate is not None :
            save_features(dfdate, 'dfdate', path_features_store)


    ###################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    # Each name below is looked up among the local variables; steps that did
    # not run leave their names undefined and are simply skipped.
    for t in ['colid',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",  #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns

              'coldate',
              'coltext',

              "coly", "y_norm_fun"
              ]:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None :
            save(t_val, tfile)
            cols_family[t] = t_val


    ######  Merge All  #############################################################################
    dfXy = df[colnum + colcat + [coly] ]
    for t in [ 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot',
               'dfdate',  'dftext'  ] :
        # Frames of steps that did not run (undefined) or produced nothing (None) are skipped.
        t_df = locals().get(t, None)
        if t_df is not None :
            dfXy = pd.concat((dfXy, t_df ), axis=1)

    save_features(dfXy, 'dfX', path_features_store)
    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy, f'{path_pipeline_export}/colsX.pkl' )
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl' )


    ###### Return values  #########################################################################
    return dfXy, cols_family