Example #1
def pd_colcat_bin(df, col=None, pars=None):
    ### Reload a previously fitted category-to-integer map when a pipeline path is given
    path_pipeline = pars.get('path_pipeline', False)
    colcat_bin_map = load(
        f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(
        df[colcat], colname=colcat, colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)
    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map,
             pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin, pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat': col,  ###list
        'colcat_bin': colcat_bin  ### list
    }

    return dfcat_bin, col_pars
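
A minimal usage sketch for this function. The DataFrame, column names, and paths below are illustrative only; `load`, `save`, `save_features`, `log`, and `util_feature` are assumed to be available at module level, as in the snippet above.

import pandas as pd

df = pd.DataFrame({
    "jobType": ["CFO", "CEO", "CFO", "JANITOR"],
    "degree":  ["MASTERS", "PHD", "NONE", "NONE"],
})

# First run: no 'path_pipeline' given, so the category-to-integer map is
# fitted from the data and exported to 'path_pipeline_export'.
pars = {"path_features_store": "data/features/",
        "path_pipeline_export": "data/pipeline/pipe_01/"}
dfcat_bin, col_pars = pd_colcat_bin(df, col=["jobType", "degree"], pars=pars)

# Later run (e.g. inference): pass 'path_pipeline' so the saved
# colcat_bin_map.pkl is reloaded and re-applied instead of refitted.
dfcat_bin2, _ = pd_colcat_bin(df, col=["jobType", "degree"],
                              pars={"path_pipeline": "data/pipeline/pipe_01/"})
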
Example #2
def preprocess(df, path_pipeline="data/pipeline/pipe_01/", preprocess_pars={}):
    """
      A FUNCTIONAL approach is used for pre-processing, so the code can be easily ported to PYSPARK.
      PYSPARK has better support for UDFs and lambda functions.
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot,
                              pd_colcat_toint, pd_feature_generate_cross)

    log("########### Load column by column type ##################################"
        )
    colid = load(f'{path_pipeline}/colid.pkl')
    coly = load(f'{path_pipeline}/coly.pkl')
    colcat = load(f'{path_pipeline}/colcat.pkl')
    colcat_onehot = load(f'{path_pipeline}/colcat_onehot.pkl')
    colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl')

    colnum = load(f'{path_pipeline}/colnum.pkl')
    colnum_binmap = load(f'{path_pipeline}/colnum_binmap.pkl')
    colnum_onehot = load(f'{path_pipeline}/colnum_onehot.pkl')

    ### OneHot column selected for cross features
    colcross_single_onehot_select = load(
        f'{path_pipeline}/colcross_single_onehot_select.pkl')

    pipe_default = [
        'filter',
        'label',
        'dfnum_bin',
        'dfnum_hot',
        'dfcat_bin',
        'dfcat_hot',
        'dfcross_hot',
    ]
    pipe_list = preprocess_pars.get('pipe_list', pipe_default)

    if "dfcat_bin" in pipe_list:
        log("###### Colcat as integer encoded  ####################################"
            )
        dfcat_bin, _ = pd_colcat_toint(df[colcat],
                                       colname=colcat,
                                       colcat_map=colcat_bin_map,
                                       suffix="_int")
        colcat_bin = list(dfcat_bin.columns)

    if "dfcat_hot" in pipe_list:
        log("###### Colcat to onehot ###############################################"
            )
        dfcat_hot, _ = pd_col_to_onehot(df[colcat],
                                        colname=colcat,
                                        colonehot=colcat_onehot,
                                        return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))

    if "dfnum_bin" in pipe_list:
        log("###### Colnum Preprocess   ###########################################"
            )
        dfnum_bin, _ = pd_colnum_tocat(df,
                                       colname=colnum,
                                       colexclude=None,
                                       colbinmap=colnum_binmap,
                                       bins=-1,
                                       suffix="_bin",
                                       method="",
                                       return_val="dataframe,param")
        log(colnum_binmap)
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(dfnum_bin[colnum_bin].head(5))

    if "dfnum_hot" in pipe_list:
        ###### Map numerics bin to One Hot
        dfnum_hot, _ = pd_col_to_onehot(dfnum_bin[colnum_bin],
                                        colname=colnum_bin,
                                        colonehot=colnum_onehot,
                                        return_val="dataframe,param")
        log(dfnum_hot[colnum_onehot].head(5))

    if "dfcross_hot" in pipe_list:
        log("####### colcross cross features   ###################################################"
            )
        dfcross_hot = pd.DataFrame()
        if colcross_single_onehot_select is not None:
            df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')

            df_onehot = df_onehot[colcross_single_onehot_select]
            dfcross_hot, colcross_pair = pd_feature_generate_cross(
                df_onehot,
                colcross_single_onehot_select,
                pct_threshold=0.02,
                m_combination=2)
            log(dfcross_hot.head(2).T)
            colcross_pair_onehot = list(dfcross_hot.columns)
            del df_onehot
            gc.collect()

    log("##### Merge data type together  :   #######################3############################ "
        )
    dfX = df[colnum + colcat]
    for t in [
            'dfnum_bin',
            'dfnum_hot',
            'dfcat_bin',
            'dfcat_hot',
            'dfcross_hot',
    ]:
        if t in locals():
            dfX = pd.concat((dfX, locals()[t]), axis=1)
            # log(t, list(dfX.columns))

    colX = list(dfX.columns)
    #colX.remove(coly)
    del df
    gc.collect()

    log("###### Export columns group   ##########################################################"
        )
    cols_family = {}
    for t in [
            'colid',
            'coly',
            "colnum",
            "colnum_bin",
            "colnum_onehot",
            "colnum_binmap",  #### Colnum columns
            "colcat",
            "colcat_bin",
            "colcat_onehot",
            "colcat_bin_map",  #### colcat columns
            'colcross_single_onehot_select',
            "colcross_pair_onehot",
            'colcross_pair',  #### colcross columns
            'colX',
    ]:
        t_val = locals().get(t, None)
        if t_val is not None:
            cols_family[t] = t_val

    return dfX, cols_family
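
A sketch of how this inference-time entry point might be invoked, assuming the pipeline artifacts (colid.pkl, colcat_bin_map.pkl, ...) were already exported by a training run such as Example #3. The input path and pipe_list below are illustrative.

import pandas as pd

df_new = pd.read_csv("data/input/test_features.csv")   # hypothetical input file
dfX, cols_family = preprocess(
    df_new,
    path_pipeline="data/pipeline/pipe_01/",
    preprocess_pars={"pipe_list": ["dfnum_bin", "dfnum_hot",
                                   "dfcat_bin", "dfcat_hot", "dfcross_hot"]},
)
# dfX holds the concatenated feature blocks; cols_family maps each
# column-group name to its column list.
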
Example #3
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
               preprocess_pars={}, filter_pars={}, path_features_store=None):
    """
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly            = cols_group['coly']  # 'salary'
    colid           = cols_group['colid']  # "jobId"
    colcat          = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum          = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    
    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default    = [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot',  'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
    pipe_list       = preprocess_pars.get('pipe_list', pipe_default)
    if 'dfdate' not in pipe_list:
        pipe_list.append('dfdate')
    pipe_list_pars  = preprocess_pars.get('pipe_pars', [])



    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample= n_sample)

    ##### Filtering / cleaning rows :   #########################################################
    if "filter" in pipe_list :
        def isfloat(x):
            try :
                a= float(x)
                return 1
            except:
                return 0
        ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0)
        print(coly)
        df['_isfloat'] = df[ coly ].apply(lambda x : isfloat(x))
        print(df['_isfloat'])
        df = df[ df['_isfloat'] > 0 ]
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]


    ##### Label processing   ####################################################################
    y_norm_fun = None
    if "label" in pipe_list :
        # Target coly processing, Normalization process  , customize by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )
            save_features(df[coly], 'dfy', path_features_store)


    ########### colnum procesing   #############################################################
    for x in colnum:
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)


    if "dfnum" in pipe_list :
        pass


    if "dfnum_norm" in pipe_list :
        log("### colnum normalize  ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = { 'pipe_list': [ {'name': 'fillna', 'naval' : 0.0 }, {'name': 'minmax'} ]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum,  pars=pars, suffix = "_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)


    if "dfnum_bin" in pipe_list :
        log("### colnum Map numerics to Category bin  ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None,
                                                   bins=10, suffix="_bin", method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Renaming colnum_bin columns with the suffix
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_bin', path_features_store)


    if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list  :
        log("### colnum bin to One Hot")
        dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                    colonehot=None, return_val="dataframe,param")
        log(colnum_onehot)
        save_features(dfnum_hot, 'dfnum_onehot', path_features_store)


    ##### Colcat processing   ################################################################
    colcat_map = pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if "dfcat_hot" in pipe_list :
        log("#### colcat to onehot")
        dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat,
                                                    colonehot=None, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        save_features(dfcat_hot, 'dfcat_onehot', path_features_store)



    if "dfcat_bin" in pipe_list :
        log("#### Colcat to integer encoding ")
        dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat,
                                                    colcat_map=None, suffix="_int")
        colcat_bin = list(dfcat_bin.columns)
        save_features(dfcat_bin, 'dfcat_bin', path_features_store)

    if "dfcross_hot" in pipe_list :
        log("#####  Cross Features From OneHot Features   ######################################")
        try :
           df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
        except Exception :
           df_onehot = copy.deepcopy(dfcat_hot)

        colcross_single_onehot_select = []
        for t in list(df_onehot) :
            for c1 in colcross_single :
                if c1 in t :
                   colcross_single_onehot_select.append(t)
                   break

        df_onehot = df_onehot[colcross_single_onehot_select ]
        dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                               pct_threshold=0.02,  m_combination=2)
        log(dfcross_hot.head(2).T)
        colcross_pair_onehot = list(dfcross_hot.columns)
        save_features(dfcross_hot, 'dfcross_onehot', path_features_store)
        del df_onehot
        gc.collect()

    

    if "dftext" in pipe_list :
        log("##### Coltext processing   ###############################################################")
        stopwords = nlp_get_stopwords()
        pars      = {'n_token' : 100 , 'stopwords': stopwords}
        dftext    = None
        
        for coltext_i in coltext :
            
            ##### Run the text processor on each column text  #############################
            dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars )
            dftext   = pd.concat((dftext, dftext_i), axis=1)  if dftext is not None else dftext_i
            save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)

        log(dftext.head(6))
        save_features(dftext, 'dftext', path_features_store)



    if "dfdate" in pipe_list :
        log("##### Coldate processing   #############################################################")
        from utils import util_date
        dfdate = None
        for coldate_i in coldate :
            dfdate_i =  util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" )
            dfdate  = pd.concat((dfdate, dfdate_i), axis=1)  if dfdate is not None else dfdate_i
            save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)
        if dfdate is not None :
            save_features(dfdate, 'dfdate', path_features_store)


    ##############################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['colid',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",  #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns

              'coldate',
              'coltext',

              "coly", "y_norm_fun"
              ]:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None :
           save(t_val, tfile)
           cols_family[t] = t_val


    ######  Merge AlL  #############################################################################
    dfXy = df[colnum + colcat + [coly] ]
    for t in [ 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot',
               'dfdate',  'dftext'  ] :
        if t in locals() :
            dfXy = pd.concat((dfXy, locals()[t] ), axis=1)

    save_features(dfXy, 'dfX', path_features_store)
    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy, f'{path_pipeline_export}/colsX.pkl' )
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl' )


    ###### Return values  #########################################################################
    return dfXy, cols_family
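
A training-time usage sketch for this variant. The file paths and the colcross choice are hypothetical; the coly/colid/colcat/colnum values follow the example values given in the inline comments above.

cols_group = {
    "coly":     "salary",
    "colid":    "jobId",
    "colcat":   ["companyId", "jobType", "degree", "major", "industry"],
    "colnum":   ["yearsExperience", "milesFromMetropolis"],
    "colcross": ["jobType", "degree"],    # single columns used to build cross features
}
dfXy, cols_family = preprocess(
    path_train_X="data/input/train_features.csv",      # hypothetical paths
    path_train_y="data/input/train_salaries.csv",
    path_pipeline_export="data/pipeline/pipe_01/",
    cols_group=cols_group,
    n_sample=5000,
    preprocess_pars={"pipe_list": ["filter", "label", "dfnum_bin", "dfnum_hot",
                                   "dfcat_bin", "dfcat_hot", "dfcross_hot"]},
    filter_pars={"ymin": 0.0, "ymax": 10_000_000.0},
    path_features_store="data/features/",
)
# Artifacts land in path_pipeline_export and can then be reloaded by the
# inference-time preprocess() of Example #2.
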