Example #1
def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin = df[col]
    colnum_bin = col

    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(
        f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################"
        )
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin],
                                                colname=colnum_bin,
                                                colonehot=colnum_onehot,
                                                return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot,
             pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colnum_onehot': colnum_onehot  ### list
    }
    return dfnum_hot, col_pars
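
Below is a minimal standalone sketch of the same fit/inference pattern, using pandas.get_dummies as a stand-in for util_feature.pd_col_to_onehot (the toy data and column names are hypothetical):

import pandas as pd

df = pd.DataFrame({'age_bin': [1, 2, 1, 3]})

# Training: derive the one-hot columns once, then persist the column list
# (the real pipeline saves it as colnum_onehot.pkl).
dfnum_hot = pd.get_dummies(df['age_bin'].astype(str), prefix='age_bin')
colnum_onehot = list(dfnum_hot.columns)

# Inference: reindex against the saved column list so unseen bins are
# dropped and missing bins come back as zero columns.
df_new = pd.DataFrame({'age_bin': [2, 4]})
dfnew_hot = pd.get_dummies(df_new['age_bin'].astype(str), prefix='age_bin')
dfnew_hot = dfnew_hot.reindex(columns=colnum_onehot, fill_value=0)
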
Example #2
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    log("### colnum normalize  ###############################################################"
        )
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    pars = {} if pars is None else pars
    # Keep the pipe steps in a local dict so the caller's pars
    # (path_features_store, ...) is not overwritten.
    pipe_pars = {
        'pipe_list': [{
            'name': 'fillna',
            'naval': 0.0
        }, {
            'name': 'minmax'
        }]
    }
    dfnum_norm, colnum_norm = pd_normalize_fun(df,
                                               colname=colnum,
                                               pars=pipe_pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log(colnum_norm)

    # update: save col and colnum_norm in dictionary
    col_pars = {}
    col_pars['cols_new'] = {
        'colnum': col,  # list
        'colnum_norm': colnum_norm  # list
    }
    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)

    # old: return dfnum_norm, colnum_norm
    # update: return dfnum_norm, col_pars ==> return col_pars as dictionary for the next step in run_preprocess/preprocess
    return dfnum_norm, col_pars
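
util_feature.pd_colnum_normalize is not shown on this page; a rough standalone equivalent of the 'fillna' + 'minmax' pipe (names and behavior are assumptions, not the library's code) could look like:

import pandas as pd

def colnum_normalize(df, cols, pipe_list):
    # Apply each pipe step in order; only 'fillna' and 'minmax' are sketched.
    out = df[cols].copy()
    for step in pipe_list:
        if step['name'] == 'fillna':
            out = out.fillna(step.get('naval', 0.0))
        elif step['name'] == 'minmax':
            out = (out - out.min()) / (out.max() - out.min() + 1e-9)
    out.columns = [c + '_norm' for c in cols]
    return out, list(out.columns)

df = pd.DataFrame({'x': [1.0, None, 3.0, 5.0]})
dfnum_norm, colnum_norm = colnum_normalize(
    df, ['x'], [{'name': 'fillna', 'naval': 0.0}, {'name': 'minmax'}])
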
Example #3
def pd_coly(df, col, pars):
    ##### Filtering / cleaning rows :   ################
    coly = col

    def isfloat(x):
        try:
            float(x)
            return 1
        except Exception:
            return 0

    df['_isfloat'] = df[coly].apply(isfloat)
    df = df[df['_isfloat'] > 0]
    df[coly] = df[coly].astype('float64')
    del df['_isfloat']
    log("----------df[coly]------------", df[coly])
    ymin, ymax = pars.get('ymin', -9999999999.0), pars.get('ymax', 999999999.0)
    df = df[df[coly] > ymin]
    df = df[df[coly] < ymax]

    ##### Label processing   ####################################################################
    y_norm_fun = None
    # Target coly processing, Normalization process  , customize by model
    log("y_norm_fun preprocess_pars")
    y_norm_fun = pars.get('y_norm_fun', None)
    if y_norm_fun is not None:
        df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
        # save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )

    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(df[coly], 'dfy', path_features_store)

    return df, col
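
A hedged usage sketch for pd_coly: the mixed-type target below keeps only parseable rows inside [ymin, ymax], then applies the optional y_norm_fun (toy data; the call is commented out because the function relies on the repo's log/save_features helpers):

import numpy as np
import pandas as pd

df = pd.DataFrame({'y': ['3.5', 'n/a', '10.0', '120.0']})
pars = {'ymin': 0.0, 'ymax': 100.0, 'y_norm_fun': np.log1p}
# df, col = pd_coly(df, 'y', pars)   # keeps '3.5' and '10.0', then applies log1p
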
Example #4
def pd_coldate(df, col, pars):
    log("##### Coldate processing   ##########################################"
        )
    from utils import util_date
    coldate = col
    dfdate = None
    for coldate_i in coldate:
        dfdate_i = util_date.pd_datestring_split(df[[coldate_i]],
                                                 coldate_i,
                                                 fmt="auto",
                                                 return_val="split")
        dfdate = pd.concat(
            (dfdate, dfdate_i), axis=1) if dfdate is not None else dfdate_i
        # if 'path_features_store' in pars :
        #    path_features_store = pars['path_features_store']
        #    #save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)

    if 'path_features_store' in pars:
        save_features(dfdate, 'dfdate', pars['path_features_store'])

    col_pars = {}
    col_pars['cols_new'] = {
        # 'colcross_single'     :  col ,    ###list
        'dfdate': list(dfdate.columns)  ### list
    }
    return dfdate, col_pars
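
util_date.pd_datestring_split comes from the repo's utils; a self-contained stand-in built on pandas datetime accessors (output column names are illustrative) might be:

import pandas as pd

def datestring_split(df, coldate):
    # Parse the date strings, then expand into calendar components.
    d = pd.to_datetime(df[coldate], errors='coerce')
    return pd.DataFrame({
        coldate + '_year':  d.dt.year,
        coldate + '_month': d.dt.month,
        coldate + '_day':   d.dt.day,
        coldate + '_dow':   d.dt.dayofweek,
    }, index=df.index)

df = pd.DataFrame({'signup': ['2021-03-01', '2021-04-15']})
dfdate = datestring_split(df, 'signup')
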
Example #5
def pd_colcat_encoder_generic(df, col, pars):
    """
        Create a Class or decorator
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:  ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        #model         = load( pars['path_pipeline'] + f"/{prefix}_model.pkl" )

    ####### Custom Code ###############################################################
    from category_encoders import HashingEncoder
    pars_model = pars.get('model_pars',
                          {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name = pars.get('model_name', 'HashingEncoder')

    model_class = {'HashingEncoder': HashingEncoder}[model_name]  ### extend this map for other encoders
    model = model_class(**pars_model)
    dfcat_encoder = model.fit_transform(df[col])

    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder',
                      pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder  ### list
    }
    return dfcat_encoder, col_pars
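
A short standalone usage of category_encoders matching the HashingEncoder path above (toy data; note that supervised encoders such as WOEEncoder or TargetEncoder also need y at fit time, which this function does not pass):

import pandas as pd
from category_encoders import HashingEncoder

df = pd.DataFrame({'city': ['tokyo', 'paris', 'tokyo', 'lima']})
model = HashingEncoder(cols=['city'], n_components=4)
dfcat_encoder = model.fit_transform(df[['city']])
dfcat_encoder.columns = [t + '_cod' for t in dfcat_encoder.columns]
# At inference, model.transform(df_new[['city']]) reuses the fitted encoder.
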
Example #6
def pd_colcat_bin(df, col=None, pars=None):
    # dfbum_bin = df[col]
    path_pipeline = pars.get('path_pipeline', False)
    colcat_bin_map = load(
        f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(
        df[colcat], colname=colcat, colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)
    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map,
             pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin, pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat': col,  ###list
        'colcat_bin': colcat_bin  ### list
    }

    return dfcat_bin, col_pars
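
util_feature.pd_colcat_toint is not reproduced here; a rough standalone sketch of the same category-to-integer mapping with a reusable map (hypothetical helper, unseen categories mapped to -1):

import pandas as pd

def colcat_toint(df, cols, colcat_map=None, suffix='_int'):
    # Fit: build {col: {category: code}}; inference: reuse the saved map.
    colcat_map = {} if colcat_map is None else colcat_map
    out = pd.DataFrame(index=df.index)
    for c in cols:
        if c not in colcat_map:
            colcat_map[c] = {v: i for i, v in enumerate(df[c].astype(str).unique())}
        out[c + suffix] = df[c].astype(str).map(colcat_map[c]).fillna(-1).astype(int)
    return out, colcat_map

df = pd.DataFrame({'color': ['red', 'blue', 'red']})
dfcat_bin, colcat_bin_map = colcat_toint(df, ['color'])
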
Example #7
def pd_colcross(df, col, pars):
    """
     cross_feature_new =  feat1 X feat2  (pair feature)

    """
    log("#####  Cross Features From OneHot Features   ######################################"
        )
    prefix = 'colcross_onehot'

    # params_check(pars,  [('dfcat_hot', pd.DataFrame), 'colid',   ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except Exception:
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:  #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model      = load( pars['path_pipeline']  + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []  ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(
        df_onehot, colcross_single_onehot_select, **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)

    model = None
    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot',
                      pars['path_features_store'])
        save(colcross_single_onehot_select,
             pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,
             pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot,
             pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model, pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single'     :  col ,    ###list
        'colcross_pair': colcross_pair_onehot  ### list
    }
    return dfcross_hot, col_pars
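
pd_feature_generate_cross is imported from util_feature; as a standalone illustration, pairwise crosses of one-hot columns are just element-wise products, filtered by how often the pair fires (a sketch under assumed semantics of pct_threshold):

import itertools
import pandas as pd

def feature_generate_cross(df_onehot, cols, pct_threshold=0.02, m_combination=2):
    # Element-wise products of one-hot column pairs; keep pairs that are
    # active in more than pct_threshold of the rows.
    dfcross = pd.DataFrame(index=df_onehot.index)
    for c1, c2 in itertools.combinations(cols, m_combination):
        pair = df_onehot[c1] * df_onehot[c2]
        if pair.mean() > pct_threshold:
            dfcross[c1 + '-' + c2] = pair
    return dfcross, list(dfcross.columns)

df_onehot = pd.DataFrame({'a_1': [1, 0, 1], 'b_1': [1, 1, 0]})
dfcross_hot, colcross_pair = feature_generate_cross(df_onehot, ['a_1', 'b_1'])
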
Example #8
def pd_ts_deltapy2(
    df=None,
    col=None,
    pars={},
):
    """
       Delta py
       pars : {  'name' :  "robust_scaler",
                 'pars'  :  {}
       }
    """
    prefix = 'colts_deltapy'

    ###### Custom code ################################################################
    dfin = df.ffill()  ### forward-fill missing values
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    ##### Transform Data  ############################################################
    df_out = model(dfin, **model_pars)

    if 'extract' in model_name:
        # 'extract' returns a single value, so there are no columns to rename.
        col_out = "0_" + model_name
    else:
        col_out = [coli + "_" + model_name for coli in df_out.columns]
        df_out.columns = col_out
        df_out.index = dfin.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
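
load_function_uri (and presumably its load_function_uri2 variant) resolves a "module::function" string to a callable; the usual pattern with importlib looks like this (the deltapy URI is the one quoted in the comment above):

import importlib

def load_function_uri(uri):
    # 'package.module::function'  ->  callable
    module_name, func_name = uri.split('::')
    return getattr(importlib.import_module(module_name), func_name)

# model  = load_function_uri('deltapy.transform::robust_scaler')
# df_out = model(dfin, drop=['Close_1'])
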
Example #9
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Float num INTO [0,1]
      'quantile_cutoff', 'quantile_cutoff_2', 'minmax'      
      'name': 'fillna', 'na_val' : 0.0 

    """
    prefix = 'colnum_norm'  ### == cols_out
    df = df[col]
    log2(
        "### colnum normalize  #############################################################"
    )
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col
    if pars is None:
        pars = {
            'pipe_list': [
                {
                    'name': 'quantile_cutoff'
                },  #  
                {
                    'name': 'fillna',
                    'na_val': 0.0
                },
            ]
        }
    if 'path_pipeline' in pars:  #### Load existing column list
        pars = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df,
                                               colname=colnum,
                                               pars=pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norm NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3(
        "##### Export ######################################################################"
    )
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnum_norm, col_pars
Example #10
def pd_sample_imblearn(df=None, col=None, pars=None):
    """
        Over-sample
    """
    params_check(pars, ['model_name', 'pars_resample', 'coly'])  # , 'dfy'
    prefix = '_sample_imblearn'

    ######################################################################################
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTEENN, SMOTETomek
    from imblearn.under_sampling import NearMiss

    model_resample = {
        'SMOTE': SMOTE, 'SMOTEENN': SMOTEENN,
        'SMOTETomek': SMOTETomek, 'NearMiss': NearMiss
    }[pars.get("model_name", 'SMOTEENN')]
    pars_resample = pars.get('pars_resample', {
        'sampling_strategy': 'auto',
        'random_state': 0
    })  # , 'n_jobs': 2

    if 'path_pipeline' in pars:  #### Inference time
        return df, {'col_new': col}

    else:  ### Training time
        colX = col  # [col_ for col_ in col if col_ not in coly]
        coly = pars['coly']
        train_y = pars['dfy']  ## df[coly] #
        train_X = df[colX].ffill()
        gp = model_resample(**pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        col_new = [t + f"_{prefix}" for t in col]
        df2 = pd.DataFrame(X_resample,
                           columns=col_new)  # , index=train_X.index
        df2[coly] = y_resample

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df2, prefix.replace("col_", "df_"),
                      pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_resample,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ###  for training input data
    }
    return df2, col_pars
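
A minimal direct use of imblearn matching the training branch above (toy data; k_neighbors is lowered because SMOTE needs at least k_neighbors + 1 minority samples):

import pandas as pd
from imblearn.over_sampling import SMOTE

train_X = pd.DataFrame({'f1': [0, 1, 2, 3, 4, 5, 6, 7],
                        'f2': [1, 0, 1, 0, 1, 0, 1, 0]})
train_y = pd.Series([0, 0, 0, 0, 0, 0, 1, 1])
gp = SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=1)
X_resample, y_resample = gp.fit_resample(train_X, train_y)   # classes now balanced
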
Example #11
def pd_coly_clean(df, col, pars):
    path_features_store = pars['path_features_store']
    # path_pipeline_export = pars['path_pipeline_export']
    coly = col
    y_norm_fun = None
    # Target coly processing, Normalization process  , customize by model
    log("y_norm_fun preprocess_pars")
    y_norm_fun = pars.get('y_norm_fun', None)
    if y_norm_fun is not None:
        df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
        # save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )
        save_features(df[coly], 'dfy', path_features_store)
    return df, coly
Example #12
def pd_colcat_to_onehot(df, col=None, pars=None):
    """

    """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col
    if len(col) == 1:
        colnew = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'        :  col ,    ###list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(
        df[colcat],
        colname=colcat,
        colonehot=colcat_onehot,
        return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot,
             pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat, pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
Example #13
def pd_colnum_bin(df: pd.DataFrame, col: list = None, pars: dict = None):
    """  float column into  binned columns
    :param df:
    :param col:
    :param pars:
    :return:
    """
    from util_feature import pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap = load(
        f'{path_pipeline}/colnum_binmap.pkl') if path_pipeline else None
    log2(colnum_binmap)
    colnum = col

    log2(
        "### colnum Map numerics to Category bin  ###########################################"
    )
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df,
                                               colname=colnum,
                                               colexclude=None,
                                               colbinmap=colnum_binmap,
                                               bins=10,
                                               suffix="_bin",
                                               method="uniform",
                                               return_val="dataframe,param")
    log3(colnum_binmap)
    ### Rename column_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log3(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol,
                      pars['path_features_store'])
        save(colnum_binmap,
             pars['path_pipeline_export'] + "/colnum_binmap.pkl")
        save(colnum_bin, pars['path_pipeline_export'] + "/colnum_bin.pkl")

    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
        'colnum': col,  ###list
        'colnum_bin': colnum_bin  ### list
    }
    return dfnum_bin, col_pars
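
pd_colnum_tocat lives in util_feature; a compact stand-in for uniform binning with a persistable bin map (hypothetical helper, edges saved the way colnum_binmap.pkl would be) could be:

import numpy as np
import pandas as pd

def colnum_tocat(df, col, bins=10, colbinmap=None, suffix='_bin'):
    # Fit: compute uniform bin edges per column; inference: reuse saved edges.
    colbinmap = {} if colbinmap is None else colbinmap
    out = pd.DataFrame(index=df.index)
    for c in col:
        edges = colbinmap.get(c)
        if edges is None:
            edges = np.linspace(df[c].min(), df[c].max(), bins + 1)
            colbinmap[c] = edges
        out[c + suffix] = np.digitize(df[c], edges[1:-1])  # bin ids 0..bins-1
    return out, colbinmap

df = pd.DataFrame({'income': [10.0, 55.0, 90.0]})
dfnum_bin, colnum_binmap = colnum_tocat(df, ['income'])
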
Example #14
def pd_colcat_minhash(df, col, pars):
    """
       MinHash Algo for category
       https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

    """
    prefix = 'colcat_minhash'
    colcat = col

    pars_minhash = {
        'n_component': [4, 2],
        'model_pretrain_dict': None,
    }
    if 'path_pipeline_export' in pars:
        try:
            pars_minhash = load(pars['path_pipeline_export'] +
                                '/colcat_minhash_pars.pkl')
        except Exception:
            pass

    log("#### Colcat to Hash encoding #############################################"
        )
    from utils import util_text
    dfcat_bin, col_hash_model = util_text.pd_coltext_minhash(
        df[colcat], colcat, return_val="dataframe,param", **pars_minhash)
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, prefix, pars['path_features_store'])
        save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_minhash,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(col_hash_model,
             pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
        'colcat_minhash': colcat_minhash  ### list
    }
    return dfcat_bin, col_pars
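
util_text.pd_coltext_minhash is repo code; purely to illustrate the shape of the output (a few integer hash components per category), here is a deterministic toy stand-in, not the MinHash algorithm itself:

import hashlib
import pandas as pd

def category_hash(v, k, mod=1000):
    # Deterministic hash of (component k, value v); illustration only.
    return int(hashlib.md5(f'{k}:{v}'.encode()).hexdigest(), 16) % mod

df = pd.DataFrame({'device': ['ios', 'android', 'ios']})
n_component = 4
dfcat_bin = pd.DataFrame({
    f'device_hash_{k}': df['device'].map(lambda v: category_hash(v, k))
    for k in range(n_component)
}, index=df.index)
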
Example #15
def pd_colnum_normalize(df, col, pars):
    log("### colnum normalize  ###############################################################"
        )
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    pars = {} if pars is None else pars
    # Keep the pipe steps in a local dict so the caller's pars
    # (path_features_store, ...) is not overwritten.
    pipe_pars = {
        'pipe_list': [{
            'name': 'fillna',
            'naval': 0.0
        }, {
            'name': 'minmax'
        }]
    }
    dfnum_norm, colnum_norm = pd_normalize_fun(df,
                                               colname=colnum,
                                               pars=pipe_pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log(colnum_norm)
    if pars.get('path_features_store', None) is not None:
        path_features_store = pars['path_features_store']
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)
    return dfnum_norm, colnum_norm
Example #16
def pd_coltext(df, col, pars={}):
    """
    df : DataFrame
    col : list of columns
    pars : dict of pars

    """
    from utils import util_text, util_model

    #### Load pars ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(  path_pipeline + "/word_tokeep_dict_all.pkl" )  if path_pipeline is not None else {}
    # dftext_tdidf_all = load(f'{path_pipeline}/dftext_tdidf.pkl') if  path_pipeline else None
    # dftext_svd_list_all      = load(f'{path_pipeline}/dftext_svd.pkl')   if  path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process  ####################################################################
    stopwords           = nlp_get_stopwords()
    dftext              = pd_coltext_clean(df, col, stopwords= stopwords , pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### Process each text column: build/load the bag of words -> tf-idf -> SVD
    for col_ in col:
        if path_pipeline is not None:
            ### Inference step: reuse the saved bag of words for column `col_`
            word_tokeep = word_tokeep_dict_all[col_]
        else:
            ### Training step: build the bag of words
            coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)  ## nb of words to keep
            word_tokeep_dict_all[col_] = word_tokeep  ## save the bag of words for `col_` in a dict

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(dftext, coltext=col_, word_minfreq= word_minfreq,
                                                                         word_tokeep = word_tokeep,
                                                                         return_val  = "dataframe,param")

        dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else pd.concat((dftext_tdidf_all, pd.DataFrame(dftext_tdidf_dict)), axis=1)
        log(word_tokeep_dict)

        ###  Dimension reduction for the sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(dftext_tdidf_dict,
                                                       colname        = None,
                                                       model_pretrain = None,
                                                       colprefix      = col_ + "_svd",
                                                       method         = "svd",  dimpca=dimpca,  return_val="dataframe,param")

        dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else pd.concat((dftext_svd_list_all, dftext_svd_list), axis=1)
    #################################################################################

    ###### Save and Export ##########################################################
    if 'path_features_store' in pars:
        save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
        # save(dftext_svd_list_all,  pars['path_pipeline_export'] + "/dftext_svd.pkl")
        # save(dftext_tdidf_all,     pars['path_pipeline_export'] + "/dftext_tdidf.pkl" )
        save(word_tokeep_dict_all, pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl")

    col_pars = {}
    col_pars['cols_new'] = {
     # 'coltext_tdidf'    : dftext_tdidf_all.columns.tolist(),       ### list
     'coltext_svd'      : dftext_svd_list_all.columns.tolist()      ### list
    }

    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all,dftext_svd_list_all),axis=1), col_pars
    return dftext_svd_list_all, col_pars
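
The whole bag-of-words -> tf-idf -> SVD chain can also be reproduced with scikit-learn, which may help when util_text/util_model are unavailable (column names are illustrative):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs  = pd.Series(['cheap flight deals', 'hotel and flight package', 'cheap hotel'])
tfidf = TfidfVectorizer(max_features=100)          # ~ word_tokeep: cap the vocabulary
X     = tfidf.fit_transform(docs)                  # sparse tf-idf matrix
svd   = TruncatedSVD(n_components=2)               # ~ dimpca
X_svd = svd.fit_transform(X)
dftext_svd = pd.DataFrame(X_svd, columns=['desc_svd_0', 'desc_svd_1'],
                          index=docs.index)
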
Example #17
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering

    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    train_X = df[colX].ffill()
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  ####   0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        coly = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(
            hall_of_fame=train_X.shape[1] + 1,  ### Buggy
            n_components=pars_genetic.get('n_components', train_X.shape[1]),
            feature_names=feature_name_,
            function_set=function_set,
            **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    tag = random.randint(0, 10)  #### random tag for the new column names (not guaranteed unique)
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] +
                  f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
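
A self-contained gplearn run mirroring the training branch above (synthetic data; smaller settings than the defaults in pars_genetic so it finishes quickly):

import numpy as np
from gplearn.genetic import SymbolicTransformer

X = np.random.RandomState(0).uniform(-1, 1, (100, 3))
y = X[:, 0] * X[:, 1] + np.sqrt(np.abs(X[:, 2]))
gp = SymbolicTransformer(generations=5, population_size=200, n_components=3,
                         function_set=['add', 'sub', 'mul', 'div', 'sqrt'],
                         metric='spearman', random_state=0)
gp.fit(X, y)
X_new = gp.transform(X)   # (100, 3) engineered features
print(gp)                 # the symbolic formulae parsed into form_dict above
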
Example #18
def pd_coltext_universal_google(df, col, pars={}):
    """
     # Universal sentence encoding from Tensorflow
       Text ---> Vectors
    from source.preprocessors import  pd_coltext_universal_google
    https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

    #latest Tensorflow that supports sentencepiece is 1.13.1
    !pip uninstall --quiet --yes tensorflow
    !pip install --quiet tensorflow-gpu==1.13.1
    !pip install --quiet tensorflow-hub
    !pip install --quiet tf-sentencepiece
    !pip install --quiet simpleneighbors

    # df : dataframe
    # col : list of text colnum names
    pars
    """
    prefix = "coltext_universal_google"
    if 'path_pipeline' in pars:   ### Load during Inference
        coltext_embed = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model    = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    ####### Custom Code ###############################################################
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text
    #from tqdm import tqdm #progress bar
    url_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    url         = pars.get("model_uri", url_default )
    model       = hub.load( url )
    pars_model  = {}
    dfall       = None
    for coli in col[:1]:
        X = []
        for r in df[coli]:
            if pd.isnull(r):
                r = ""
            emb = model(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi   = pd.DataFrame(X, columns=[coli + "_" + str(i) for i in range(len(X[0]))],
                             index=df.index)
        dfall = pd.concat((dfall, dfi), axis=1) if dfall is not None else dfi

    coltext_embed = list(dfall.columns)


    ##### Export ####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfall, 'dftext_embed', pars['path_features_store'])
        save(coltext_embed, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_model,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(model,       pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        # model_uri = pars['path_pipeline_export'] + f"/{prefix}_model.pkl"


    # col_pars = {'model_uri': model_uri, 'pars': pars_model}
    col_pars = {'model_uri': url, 'pars': pars_model}
    col_pars['cols_new'] = {
        'coltext_universal_google': coltext_embed  ### list
    }
    return dfall, col_pars
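
The embedding loop reduces to a few lines of TensorFlow Hub; this sketch downloads the multilingual model on first use and assumes tensorflow_text is installed to register the SentencePiece ops:

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # noqa: F401  (registers ops the model needs)

model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')
emb   = model(['a text column value'])      # shape (1, 512)
vec   = tf.reshape(emb, [-1]).numpy()       # one 512-dim vector per row
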
Example #19
def pd_colnum_quantile_norm(df, col, pars={}):
    """
     colnum normalization by quantile
  """
    prefix = "colnum_quantile_norm"
    df = df[col]
    num_col = col

    ##### Grab previous computed params  ################################################
    pars2 = {}
    if 'path_pipeline' in pars:  #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    lower_bound_sparse = pars2.get('lower_bound_sparse', None)
    upper_bound_sparse = pars2.get('upper_bound_sparse', None)
    lower_bound = pars2.get('lower_bound', None)
    upper_bound = pars2.get('upper_bound', None)
    sparse_col = pars2.get('colsparse', ['capital-gain', 'capital-loss'])

    ####### Find IQR and apply to numerical and sparse columns separately ##########
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.25) < df[col].mode()[0]:  #Unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)

            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.75) < df[col].mode()[0]:  #Unexpected case
                upper_bound_sparse = df[col].mode()[0]

            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) |
                                (df[col] > upper_bound_sparse)][col])

            if n_outliers > 0:
                df.loc[df[col] < lower_bound_sparse,
                       col] = lower_bound_sparse * 0.75  #--> MAIN DF CHANGED
                df.loc[df[col] > upper_bound_sparse,
                       col] = upper_bound_sparse * 1.25  # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound,
                               df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound,
                               df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new = {
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'lower_bound_sparse': lower_bound_sparse,
        'upper_bound_sparse': upper_bound_sparse
    }
    dfnew = df
    model = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars
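
The non-sparse branch is standard IQR clipping; a compact standalone version for one column (toy data, same 0.75/1.25 scaling as above):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 2.5, 3.0, 50.0])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
s_norm = np.where(s > upper, 1.25 * upper,
                  np.where(s < lower, 0.75 * lower, s))   # outliers pulled in
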
Example #20
def pd_colcross(df: pd.DataFrame, col: list = None, pars: dict = None):
    """
     cross_feature_new =  feat1 X feat2  (pair feature)

    """
    log("#####  Cross Features From OneHot Features   ######################################"
        )
    prefix = 'colcross_onehot'

    # params_check(pars,  [('dfcat_hot', pd.DataFrame), 'colid',   ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        dfnum_hot = dfnum_hot.drop_duplicates()  ### duplicated rows break the join when ids are not unique
        df_onehot = dfcat_hot.reset_index().join(dfnum_hot,
                                                 on=[colid],
                                                 how='left')
        # df_onehot = pd.merge(dfcat_hot.reset_index(), dfnum_hot.reset_index() , on= [colid], how='left')

        #log4_pd('df_onehot', df_onehot )
        #log4(df_onehot.head(4).T )
        assert set(dfcat_hot.index) == set(
            dfnum_hot.index), "Not equal index between dfcat_hot, dfnum_hot"
        log4('index', colid, dfcat_hot.index)
        log4(dfnum_hot.index)

        # df_onehot = df_onehot.set_index(colid)
        log4('colid', colid)
        log4_pd('dfnum_hot', dfnum_hot)
        log4_pd('dfcat_hot', dfcat_hot)

    except Exception as e:
        log4('error', e)
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:  #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model      = load( pars['path_pipeline']  + f'/{prefix}_pars.pkl')

    log4('colcross_single', colcross_single, len(colcross_single))

    colcross_single_onehot_select = []  ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)
    colcross_single_onehot_select = sorted(
        list(set(colcross_single_onehot_select)))
    log4('colcross_single_select', colcross_single_onehot_select,
         len(colcross_single_onehot_select))

    df_onehot = df_onehot[colcross_single_onehot_select]
    log4_pd('df_onehot', df_onehot)
    dfcross_hot, colcross_pair = pd_feature_generate_cross(
        df_onehot, colcross_single_onehot_select, **pars_model)
    log4_pd("dfcross_hot", dfcross_hot)
    colcross_pair_onehot = list(dfcross_hot.columns)

    model = None
    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot',
                      pars['path_features_store'])
        save(colcross_single_onehot_select,
             pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,
             pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot,
             pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model, pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single'     :  col ,    ###list
        'colcross_pair': colcross_pair_onehot  ### list
    }
    return dfcross_hot, col_pars