Example #1
def pd_colcat_bin(df, col=None, pars=None):
    # dfbum_bin = df[col]
    path_pipeline = pars.get('path_pipeline', False)
    colcat_bin_map = load(
        f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(
        df[colcat], colname=colcat, colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)
    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map,
             pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin, pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat': col,  ###list
        'colcat_bin': colcat_bin  ### list
    }

    return dfcat_bin, col_pars
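A minimal usage sketch (not part of the original example): it assumes pd_colcat_bin is importable together with the module-level helpers it calls (util_feature, load, save, save_features, log); the column names and expected suffixes are illustrative.

import pandas as pd

df_demo = pd.DataFrame({'jobType': ['CFO', 'CEO', 'CFO', 'MANAGER'],
                        'degree':  ['MS',  'PHD', 'MS',  'BS']})

### Empty pars: no 'path_features_store' / 'path_pipeline_export', so nothing is written to disk.
dfcat_bin, col_pars = pd_colcat_bin(df_demo, col=['jobType', 'degree'], pars={})
print(col_pars['cols_new'])   ### e.g. {'colcat': ['jobType', 'degree'], 'colcat_bin': ['jobType_int', 'degree_int']}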
Example #2
def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin = df[col]
    colnum_bin = col

    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(
        f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################"
        )
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin],
                                                colname=colnum_bin,
                                                colonehot=colnum_onehot,
                                                return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot,
             pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colnum_onehot': colnum_onehot  ### list
    }
    return dfnum_hot, col_pars
Example #3
def pd_colnum_bin(df, col, pars):
    from util_feature import  pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap  = load(f'{path_pipeline}/colnum_binmap.pkl') if  path_pipeline else None
    log(colnum_binmap)

    colnum = col

    log("### colnum Map numerics to Category bin  ###########################################")
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=colnum_binmap,
                                               bins=10, suffix="_bin", method="uniform",
                                               return_val="dataframe,param")
    log(colnum_binmap)
    ### Renaming colnum_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol, pars['path_features_store'])
        save(colnum_binmap,  pars['path_pipeline_export'] + "/colnum_binmap.pkl" )
        save(colnum_bin,     pars['path_pipeline_export'] + "/colnum_bin.pkl" )


    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
     'colnum'     :  col ,    ###list
     'colnum_bin' :  colnum_bin       ### list
    }
    return dfnum_bin, col_pars
Example #4
def pd_coltext_universal_google(df, col, pars={}):
    """
     # Universal sentence encoding from Tensorflow
       Text ---> Vectors
    from source.preprocessors import  pd_coltext_universal_google
    https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

    #@title Setup Environment
    #latest Tensorflow that supports sentencepiece is 1.13.1
    !pip uninstall --quiet --yes tensorflow
    !pip install --quiet tensorflow-gpu==1.13.1
    !pip install --quiet tensorflow-hub
    !pip install --quiet tf-sentencepiece
    !pip install --quiet simpleneighbors

    # df   : dataframe
    # col  : list of text column names
    # pars : dict of parameters (e.g. 'url_model' to override the TF-Hub URI)
    """
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text
    #from tqdm import tqdm #progress bar
    uri_list = []
    uri_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    uri = pars.get("url_model", uri_default)
    use = hub.load(uri)
    dfall = None
    for coli in col[:1]:   ### only the first text column is encoded
        X = []
        for r in df[coli]:
            if pd.isnull(r):
                r = ""
            emb = use(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi = pd.DataFrame(
            X,
            columns=[coli + "_" + str(i) for i in range(len(X[0]))],
            index=df.index)
        dfall = pd.concat((dfall, dfi)) if dfall is not None else dfi

    coltext_embed = list(dfall.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfall, 'dftext_embed', pars['path_features_store'])
        save(coltext_embed,
             pars['path_pipeline_export'] + "/coltext_universal_google.pkl")

    col_pars = {'model_encoder': uri}
    col_pars['cols_new'] = {
        'coltext_universal_google': coltext_embed  ### list
    }
    return dfall, col_pars
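A hedged usage sketch: it assumes tensorflow, tensorflow_hub and tensorflow_text are installed and the TF-Hub model can be downloaded; only the first column in `col` is encoded by the function above, and the output column names are an assumption.

import pandas as pd

df_text = pd.DataFrame({'review': ['great product', 'did not work', None]})
df_emb, col_pars = pd_coltext_universal_google(df_text, col=['review'], pars={})
print(col_pars['cols_new']['coltext_universal_google'][:3])   ### e.g. ['review_0', 'review_1', 'review_2']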
Example #5
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Float num INTO [0,1]
      'quantile_cutoff', 'quantile_cutoff_2', 'minmax'      
      'name': 'fillna', 'na_val' : 0.0 

    """
    prefix = 'colnum_norm'  ### == cols_out
    df = df[col]
    log2(
        "### colnum normalize  #############################################################"
    )
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col
    if pars is None:
        pars = {
            'pipe_list': [
                {
                    'name': 'quantile_cutoff'
                },  #  
                {
                    'name': 'fillna',
                    'na_val': 0.0
                },
            ]
        }
    if 'path_pipeline' in pars:  #### Load existing column list
        pars = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df,
                                               colname=colnum,
                                               pars=pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norm NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3(
        "##### Export ######################################################################"
    )
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnum_norm, col_pars
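A usage sketch for the pars structure described in the docstring; the toy dataframe and the "_norm" output suffix are assumptions, and util_feature.pd_colnum_normalize is assumed available as imported above.

import pandas as pd

df_num = pd.DataFrame({'age': [22, 35, None, 61], 'income': [30e3, 52e3, 48e3, 75e3]})

pars_norm = {'pipe_list': [{'name': 'minmax'},                 ### or 'quantile_cutoff', 'quantile_cutoff_2'
                           {'name': 'fillna', 'na_val': 0.0}]}

dfnum_norm, col_pars = pd_colnum_normalize(df_num, col=['age', 'income'], pars=pars_norm)
print(col_pars['cols_new']['colnum_norm'])   ### e.g. ['age_norm', 'income_norm']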
Example #6
def pd_colcross(df, col, pars):
    """
     cross_feature_new =  feat1 X feat2  (pair feature)

    """
    log("#####  Cross Features From OneHot Features   ######################################"
        )
    prefix = 'colcross_onehot'

    # params_check(pars,  [('dfcat_hot', pd.DataFrame), 'colid',   ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except Exception:   ### numeric one-hot features are optional
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:  #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model      = load( pars['path_pipeline']  + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []  ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(
        df_onehot, colcross_single_onehot_select, **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)

    model = None
    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot',
                      pars['path_features_store'])
        save(colcross_single_onehot_select,
             pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,
             pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot,
             pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model, pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single'     :  col ,    ###list
        'colcross_pair': colcross_pair_onehot  ### list
    }
    return dfcross_hot, col_pars
Example #7
def pd_colcat_to_onehot(df, col=None, pars=None):
    """

    """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col
    if len(col) == 1:
        colnew = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'        :  col ,    ###list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(
        df[colcat],
        colname=colcat,
        colonehot=colcat_onehot,
        return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot,
             pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat, pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
Example #8
def pd_colcat_to_onehot(df, col=None, pars=None):
    # dfcat_bin = df[col]
    if len(col) == 1:

        colnew = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'        :  col ,    ###list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    path_pipeline = pars.get('path_pipeline', False)
    colcat_onehot = load(
        f'{path_pipeline}/colcat_onehot.pkl') if path_pipeline else None

    colcat = col
    log("#### colcat to onehot")
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(
        df[colcat],
        colname=colcat,
        colonehot=colcat_onehot,
        return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    if 'path_features_store' in pars:
        path_features_store = pars['path_features_store']
        save_features(dfcat_hot, 'colcat_onehot', path_features_store)
        save(colcat_onehot,
             pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat, pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
Example #9
def pd_colcat_minhash(df, col, pars):
    """
       MinHash Algo for category
       https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

    """
    prefix = 'colcat_minhash'
    colcat              = col

    pars_minhash = {'n_component' : [4, 2], 'model_pretrain_dict' : None,}
    if 'path_pipeline_export' in pars :
        try :
            pars_minhash = load( pars['path_pipeline_export'] + '/colcat_minhash_pars.pkl')
        except : pass

    log("#### Colcat to Hash encoding #############################################")
    from utils import util_text
    dfcat_bin, col_hash_model= util_text.pd_coltext_minhash(df[colcat], colcat,
                                                            return_val="dataframe,param", **pars_minhash )
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
       save_features(dfcat_bin, prefix, pars['path_features_store'])
       save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl" )
       save(pars_minhash,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl" )
       save(col_hash_model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl" )

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
     'colcat_minhash' :  colcat_minhash  ### list
    }
    return dfcat_bin, col_pars
Example #10
def pd_colcat_symbolic(df, col, pars):
    """
       https://github.com/arita37/deltapy

       pip install deltapy

    """
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] +
                                 '/col_genetic_model.pkl')
            col_encoder = load(pars['path_pipeline_export'] +
                               '/col_genetic.pkl')
        except:
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]

    gp = SymbolicTransformer(generations=20,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew = gen_feats
    dfnew.columns = [t for t in dfnew.columns]

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew, pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew  ### list
    }
    return dfnew, col_pars
Example #11
def pd_colcat_encoder_generic(df, col, pars):
    """
        Create a Class or decorator
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:  ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        #model         = load( pars['path_pipeline'] + f"/{prefix}_model.pkl" )

    ####### Custom Code ###############################################################
    from category_encoders import HashingEncoder, WOEEncoder
    pars_model = pars.get('model_pars',
                          {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name = pars.get('model_name', 'HashingEncoder')

    model_class = {'HashingEncoder': HashingEncoder}[model_name]
    model = model_class(**pars_model)
    dfcat_encoder = model.fit_transform(df[col])

    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder',
                      pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder  ### list
    }
    return dfcat_encoder, col_pars
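A usage sketch for the generic encoder: HashingEncoder and its n_components parameter come from the category_encoders package; the toy dataframe is an assumption.

import pandas as pd

df_cat = pd.DataFrame({'city': ['tokyo', 'osaka', 'tokyo'], 'plan': ['gold', 'basic', 'basic']})
pars_enc = {'model_name': 'HashingEncoder', 'model_pars': {'n_components': 8}}

dfcat_enc, col_pars = pd_colcat_encoder_generic(df_cat, col=['city', 'plan'], pars=pars_enc)
print(col_pars['cols_new']['colcat_encoder_generic'])   ### 8 hashed columns, suffixed with "_cod"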
Example #12
def prepro_save(prefix, pars, df_new, cols_new,
                prepro) -> (pd.DataFrame, dict):
    """  Save preprocessors and export
    :param prefix:
    :param pars:
    :param df_new:
    :param cols_new:
    :param prepro:
    :param pars_prepro:
    :return:
    """
    ### Clean Pars of extra heavy data
    pars2 = {}
    for k, val in pars.items():
        if isinstance(val, pd.DataFrame):
            continue
        pars2[k] = val

    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro, pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}_cols.pkl")
        save(pars2, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names ##########################
    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export",
                         pars.get("path_pipeline", None))
    }
    col_pars["cols_new"] = {
        prefix: cols_new  ### new column list
    }
    return df_new, col_pars
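A sketch of how a new processor could delegate its export step to prepro_save; the scaler, column suffix and prefix are illustrative, and pandas plus the module-level save helpers are assumed, as in the surrounding examples.

def pd_col_myscaler(df, col, pars):
    """ Illustrative custom processor: min-max scale numeric columns, then export via prepro_save. """
    from sklearn.preprocessing import MinMaxScaler
    import pandas as pd
    prefix = 'col_myscaler'

    scaler = MinMaxScaler()
    df_new = pd.DataFrame(scaler.fit_transform(df[col]),
                          columns=[c + '_scaled' for c in col], index=df.index)
    cols_new = list(df_new.columns)

    ### prepro_save exports the fitted scaler, the new column list and the cleaned pars,
    ### and returns (df_new, col_pars) in the format the pipeline expects.
    return prepro_save(prefix, pars, df_new, cols_new, prepro=scaler)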
Example #13
def pd_colcat_encoder_generic(df, col, pars):
    """
       https://pypi.org/project/category-encoders/
       encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.JamesSteinEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
encoder = ce.MEstimateEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.TargetEncoder(cols=[...])
encoder = ce.WOEEncoder(cols=[...])


    """
    colcat = col
    import category_encoders as ce
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/colcat_encoder_pars.pkl')
        except:
            pass

    encoder = ce.HashingEncoder(**pars_encoder)
    dfcat_bin = encoder.fit_transform(df[col])

    dfcat_bin.columns = [t for t in dfcat_bin.columns]
    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder  ### list
    }
    return dfcat_bin, col_pars
Example #14
def pd_ts_deltapy2(
    df=None,
    col=None,
    pars={},
):
    """
       Delta py
       pars : {  'name' :  "robust_scaler",
                 'pars'  :  {}
       }
    """
    prefix = 'colts_deltapy'

    ###### Custom code ################################################################
    dfin = df.fillna(method='ffill')
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    ##### Transform Data  ############################################################
    df_out = model(dfin, **model_pars)

    # Extract only returns one value, so no columns to loop over.
    model_name2 = model_name.replace("::", "-")
    if 'extract' in model_name:
        col_out = "0_" + model_name
    else:
        col_out = [coli + "_" + model_name for coli in df_out.columns]
        df_out.columns = col_out
        df_out.index = df_out.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
Example #15
def pd_sample_imblearn(df=None, col=None, pars=None):
    """
        Over-sample
    """
    params_check(pars, ['model_name', 'pars_resample', 'coly'])  # , 'dfy'
    prefix = '_sample_imblearn'

    ######################################################################################
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTEENN, SMOTETomek
    from imblearn.under_sampling import NearMiss

    model_resample = {'SMOTE': SMOTE, 'SMOTEENN': SMOTEENN,
                      'SMOTETomek': SMOTETomek, 'NearMiss': NearMiss}[pars.get("model_name", 'SMOTEENN')]
    pars_resample = pars.get('pars_resample', {
        'sampling_strategy': 'auto',
        'random_state': 0
    })  # , 'n_jobs': 2

    if 'path_pipeline' in pars:  #### Inference time
        return df, {'col_new': col}

    else:  ### Training time
        colX = col  # [col_ for col_ in col if col_ not in coly]
        coly = pars['coly']
        train_y = pars['dfy']  ## df[coly] #
        train_X = df[colX].fillna(method='ffill')
        gp = model_resample(**pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        col_new = [t + f"_{prefix}" for t in col]
        df2 = pd.DataFrame(X_resample,
                           columns=col_new)  # , index=train_X.index
        df2[coly] = y_resample

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df2, prefix.replace("col_", "df_"),
                      pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_resample,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ###  for training input data
    }
    return df2, col_pars
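A toy usage sketch, assuming imbalanced-learn is installed and params_check is available at module level; the target is passed separately via pars['dfy'], and k_neighbors is lowered so SMOTE works on a tiny minority class.

import pandas as pd

df_train = pd.DataFrame({'f1': [0.1, 0.3, 0.2, 0.9, 0.8, 0.85],
                         'f2': [1.0, 1.2, 0.9, 3.1, 2.8, 3.0]})
y_train = pd.Series([0, 0, 0, 0, 1, 1], name='y')

pars_smote = {'model_name': 'SMOTE', 'coly': 'y', 'dfy': y_train,
              'pars_resample': {'sampling_strategy': 'auto', 'random_state': 0, 'k_neighbors': 1}}

df_bal, col_pars = pd_sample_imblearn(df_train, col=['f1', 'f2'], pars=pars_smote)
print(df_bal['y'].value_counts())   ### classes roughly balanced after resampling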
Example #16
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering

    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    coly = pars['coly']
    colX = [t for t in col if t not in [coly]]
    train_X = df[colX]
    train_y = df[coly]

    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]
    pars_genetic = pars.get('pars_genetic', {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    })

    gp = SymbolicTransformer(hall_of_fame=100,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6,
                             **pars_genetic)

    gp.fit(train_X, train_y)
    df_genetic = gp.transform(train_X)
    df_genetic = pd.DataFrame(
        df_genetic,
        columns=["gen_" + str(a) for a in range(df_genetic.shape[1])])
    df_genetic.index = train_X.index

    col_genetic = list(df_genetic.columns)
    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_genetic,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'model': gp, 'pars': pars_genetic}
    col_pars['cols_new'] = {
        'col_genetic': col_genetic  ### list
    }
    return df_genetic, col_pars
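A usage sketch with toy random data, assuming gplearn is installed; population_size is kept at 200 because the function above hard-codes hall_of_fame=100, and the output column names follow the "gen_" prefix used inside the function.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_xy = pd.DataFrame(rng.rand(50, 3), columns=['x1', 'x2', 'y'])

pars_gen = {'coly': 'y',
            'pars_genetic': {'generations': 3, 'n_components': 4, 'population_size': 200}}

df_gen, col_pars = pd_col_genetic_transform(df_xy, col=['x1', 'x2', 'y'], pars=pars_gen)
print(col_pars['cols_new']['col_genetic'])   ### ['gen_0', 'gen_1', 'gen_2', 'gen_3']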
Example #17
def pd_colts_generate(df=None, col=None, pars={}):
    """
       pars : {  'model_name' :  "transform.robust_scaler",
                 'model_pars'  :  {}


       }
    """
    prefix = 'colts_generate'

    ###### Custom code ################################################################
    dfin = df[col].fillna(method='ffill')
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    model_name = model_name.replace(".", "_")

    ##### Transform Data  ############################################################
    df_out = model(dfin, col, **model_pars)
    col_out = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index = dfin.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
Example #18
def pd_col_atemplate(df=None, col=None, pars={}):
    """
    Example of custom Processor
    Used at prediction time
        "path_pipeline"  : 

    Training time :
        "path_features_store" :  to store intermediate dataframe
        "path_pipeline_export":  to store pipeline  for later usage

    """
    from source.util_feature import save, load
    prefix = "col_myfun"
    #### Inference time LOAD previous pars  ###########################################
    if "path_pipeline" in pars:
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        pars = {} if pars is None else pars

    #### Do something #################################################################
    df_new = df[col]  ### Placeholder: no real transform, just copy the columns
    df_new.columns = [c + "_myfun" for c in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None  ### model
    pars_new = None  ### new params

    ###################################################################################
    ###### Training time save all #####################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro, pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names ##########################
    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export",
                         pars.get("path_pipeline", None))
    }
    col_pars["cols_new"] = {
        "col_myfun": cols_new  ### new column list
    }
    return df_new, col_pars
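To wire such a processor into training, one would add an entry to preprocess_pars['pipe_list'] in the same shape as the pipe_default list used by preprocess() further below; the uri/module path here is illustrative, not a confirmed location.

preprocess_pars = {
    'pipe_list': [
        {'uri': 'source/prepro.py::pd_colnum_bin',    'pars': {}, 'cols_family': 'colnum', 'type': ''},
        {'uri': 'source/prepro.py::pd_col_atemplate', 'pars': {}, 'cols_family': 'colnum', 'type': ''},
    ]
}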
Example #19
def pd_filter_resample(df=None, col=None, pars=None):
    """
        Over-sample, Under-sample
    """
    prefix = 'col_imbalance'
    ######################################################################################
    from imblearn.over_sampling import SMOTE

    model_resample = { 'SMOTE' : SMOTE}[  pars.get("model_name", 'SMOTE') ]

    pars_resample  = pars.get('pars_resample',
                             {'sampling_strategy' : 'auto', 'random_state':0, 'k_neighbors':5, 'n_jobs': 2})

    if 'path_pipeline' in pars :   #### Inference time
        return df, {'col_new': col }
        #gp   = load(pars['path_pipeline'] + f"/{prefix}_model.pkl" )
        #pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl" )

    else :     ### Training time
        colX          = col # [col_ for col_ in col if col_ not in coly]
        train_X       = df[colX].fillna(method='ffill')
        coly     = pars['coly']
        train_y  = pars['dfy']
        gp       = model_resample( **pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        df2       = pd.DataFrame(X_resample, columns = col, index=train_X.index)
        df2[coly] = y_resample


    col_new = col
    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
       save_features(df2, 'df_resample', pars['path_features_store'])
       save(gp,             pars['path_pipeline_export'] + f"/{prefix}_model.pkl" )
       save(col,            pars['path_pipeline_export'] + f"/{prefix}.pkl" )
       save(pars_resample,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl" )


    col_pars = {'prefix' : prefix , 'path' :   pars.get('path_pipeline_export', pars.get('path_pipeline', None)) }
    col_pars['cols_new'] = {
       prefix :  col_new  ### list
    }
    return df2, col_pars
Example #20
def pd_coltext(df, col, pars={}):
    """
    df : Datframe
    col : list of columns
    pars : dict of pars

    """
    from utils import util_text, util_model

    #### Load pars ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(  path_pipeline + "/word_tokeep_dict_all.pkl" )  if path_pipeline is not None else {}
    # dftext_tdidf_all = load(f'{path_pipeline}/dftext_tdidf.pkl') if  path_pipeline else None
    # dftext_svd_list_all      = load(f'{path_pipeline}/dftext_svd.pkl')   if  path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process  ####################################################################
    stopwords           = nlp_get_stopwords()
    dftext              = pd_coltext_clean(df, col, stopwords= stopwords , pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### Process each text column: create (or load) the bag of words -> TF-IDF -> SVD
    for col_ in col:

            if path_pipeline is not None:
                ### If it is in Inference step, use the saved bag of word for the column `col_`
                word_tokeep = word_tokeep_dict_all[col_]

            else:
                ### If it is not, create a bag of word
                coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)  ## nb of words to keep
                word_tokeep_dict_all[col_] = word_tokeep  ## save the bag of words for `col_` in a dict

            dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(dftext, coltext=col_, word_minfreq= word_minfreq,
                                                                             word_tokeep = word_tokeep,
                                                                             return_val  = "dataframe,param")

            dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else pd.concat((dftext_tdidf_all,pd.DataFrame(dftext_tdidf_dict)),axis=1)
            log(word_tokeep_dict)

            ###  Dimension reduction for the sparse matrix
            dftext_svd_list, svd_list = util_model.pd_dim_reduction(dftext_tdidf_dict,
                                                           colname        = None,
                                                           model_pretrain = None,
                                                           colprefix      = col_ + "_svd",
                                                           method         = "svd",  dimpca=dimpca,  return_val="dataframe,param")

            dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else pd.concat((dftext_svd_list_all,dftext_svd_list),axis=1)
    #################################################################################

    ###### Save and Export ##########################################################
    if 'path_features_store' in pars:
            save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
            # save(dftext_svd_list_all,  pars['path_pipeline_export'] + "/dftext_svd.pkl")
            # save(dftext_tdidf_all,     pars['path_pipeline_export'] + "/dftext_tdidf.pkl" )
            save(word_tokeep_dict_all,     pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl" )

    col_pars = {}
    col_pars['cols_new'] = {
     # 'coltext_tdidf'    : dftext_tdidf_all.columns.tolist(),       ### list
     'coltext_svd'      : dftext_svd_list_all.columns.tolist()      ### list
    }

    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all,dftext_svd_list_all),axis=1), col_pars
    return dftext_svd_list_all, col_pars
Example #21
def preprocess(path_train_X="",
               path_train_y="",
               path_pipeline_export="",
               cols_group=None,
               n_sample=5000,
               preprocess_pars={},
               path_features_store=None):
    """
      Used for training only.
      Saves params on disk.

    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    ##### column names for feature generation #####################################################
    log(cols_group)
    coly = cols_group['coly']  # 'salary'
    colid = cols_group['colid']  # "jobId"
    colcat = cols_group[
        'colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution ##########################################
    pipe_default = [{
        'uri': 'source/prepro.py::pd_coly',
        'pars': {},
        'cols_family': 'coly',
        'type': 'coly'
    }, {
        'uri': 'source/prepro.py::pd_colnum_bin',
        'pars': {},
        'cols_family': 'colnum',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colnum_binto_onehot',
        'pars': {},
        'cols_family': 'colnum_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_bin',
        'pars': {},
        'cols_family': 'colcat',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_to_onehot',
        'pars': {},
        'cols_family': 'colcat_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcross',
        'pars': {},
        'cols_family': 'colcross',
        'type': 'cross'
    }]

    pipe_list = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [
        task for task in pipe_list
        if task.get('type', '') not in ['coly', 'filter']
    ]
    pipe_list_y = [
        task for task in pipe_list if task.get('type', '') in ['coly']
    ]
    pipe_filter = [
        task for task in pipe_list if task.get('type', '') in ['filter']
    ]
    ##### Load data #################################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features ##########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("#####  Filter  #########################################################################"
            )
        pipe_i = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df,
                                list(df.columns),
                                pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("#####  coly  ###########################################################################"
            )
        pipe_i = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars = pipe_i.get('pars', {})
        pars['path_features_store'] = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'],
                                pars=pars)  ### coly can remove rows

        logs("----------df----------\n", df)
        dfi_all['coly'] = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly",
                      path_features_store)  ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    #####  Processors  ###############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    #for colg, colg_list in cols_group.items() :
    #   if colg not in  ['colid']:
    #      dfi_all[colg]   = df[colg_list]   ## colnum colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i,
            "##########################################################")
        pipe_fun = load_function_uri(
            pipe_i['uri'])  ### Load the code definition  into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type = pipe_i['type']

        pars = pipe_i.get('pars', {})
        pars[
            'path_features_store'] = path_features_store  ### intermediate dataframe
        pars['path_pipeline_export'] = path_pipeline_export  ### Store pipeline

        if col_type == 'cross':
            log("###################  Adding Cross ###################################################"
                )
            pars['dfnum_hot'] = dfi_all[
                'colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot'] = dfi_all['colcat_onehot']
            pars['colid'] = colid
            pars['colcross_single'] = cols_group.get('colcross', [])

        elif col_type == 'add_coly':
            log('add_coly genetic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy'] = dfi_all['coly']  ### Transformed dfy

        ### Input columns or previously computed columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(
            dfi_all[cols_name].columns)
        df_ = df[cols_list] if cols_name in cols_group else dfi_all[cols_name]
        #cols_list  = list(dfi_all[cols_name].columns)
        #df_        = dfi_all[cols_name]

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all , dfi_all  ###########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj] = pd.concat(
                (dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ######  Merge AlL int dfXy  ##################################################################
    dfXy = df[[coly] + colnum + colcat]
    #dfXy = df[ [coly]  ]

    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)  ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    ####  Cols group for model input  ###########################################################

    save(colXy, f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values  #######################################################################
    return dfXy, cols_family_all
Example #22
def run_preprocess(config_name,
                   config_path,
                   n_sample=5000,
                   mode='run_preprocess',
                   model_dict=None
                   ):  # the 'run_' / 'load_' prefix in mode selects the branch below
    """
    :param config_name:   titanic_lightgbm
    :param config_path:   titanic_classifier.py
    :param n_sample:     nb of rows used
    :param mode:     'run_preprocess'  / 'load_preprocess'
    :param model_dict:  Optional provide the dict model
    :return: None,  only show and save dataframe
    """

    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)

    m = model_dict['global_pars']
    path_data = m['path_data_preprocess']
    path_train_X = m.get(
        'path_data_prepro_X', path_data +
        "/features.zip")  # ### Can be a list of zip or parquet files
    path_train_y = m.get(
        'path_data_prepro_y',
        path_data + "/target.zip")  # ### Can be a list of zip or parquet files

    path_output = m['path_train_output']
    path_pipeline = m.get('path_pipeline', path_output + "/pipeline/")
    path_features_store = m.get(
        'path_features_store', path_output + '/features_store/'
    )  #path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out = m.get('path_check_out', path_output + "/check/")
    log(path_output)

    log("#### load input column family  ###################################################"
        )
    try:
        cols_group = model_dict['data_pars'][
            'cols_input_type']  ### the model config file
    except:
        cols_group = json.load(open(path_data + "/cols_group.json", mode='r'))

    #pars_download = model_dict['data_pars'].get('download_pars', None )
    #if pars_download :
    #    for url, target_path in pars_download['']:
    #        pass

    log("#### Preprocess  #################################################################"
        )
    preprocess_pars = model_dict['model_pars']['pre_process_pars']

    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y, path_pipeline,
                                cols_group, n_sample, preprocess_pars,
                                path_features_store)

    elif mode == "load_preprocess":
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline,
                                     cols_group, n_sample, preprocess_pars,
                                     path_features_store)
    model_dict['data_pars']['coly'] = cols['coly']

    ### Generate actual column names from colum groups  INTO a single list of columns
    model_dict['data_pars']['cols_model'] = sum([
        cols[colgroup]
        for colgroup in model_dict['data_pars']['cols_model_group']
    ], [])
    log(model_dict['data_pars']['cols_model'], model_dict['data_pars']['coly'])

    log("#### Save data on disk #############################")
    dfXy.to_parquet(path_output + "/dfXy.parquet")
    save(model_dict, path_output + "/model_dict.pkl")

    log("######### finish #################################", )
Example #23
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering

    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    train_X = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  ####   0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        coly = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(
            hall_of_fame=train_X.shape[1] + 1,  ### Buggy
            n_components=pars_genetic.get('n_components', train_X.shape[1]),
            feature_names=feature_name_,
            function_set=function_set,
            **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    tag = random.randint(0, 10)  #### random tag to limit column-name collisions
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    df_genetic.index = train_X.index
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] +
                  f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
Example #24
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')

        model = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model(entity_columns=pars['entity_columns'],
                          context_columns=pars['context_columns'],
                          sequence_index=pars['sequence_index'])
        else:
            model = model(primary_key=primary_key)
        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except:
            log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd(new_data, n=7)

    log('######### Evaluation Results #########')
    if metrics_type == True:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
Example #25
def pd_augmentation_sdv(df, col=None, pars={}):
    '''
    Using SDV Variational Autoencoders, the function augments the dataset with synthetic rows.
    params:
            df   : (pandas dataframe) original dataframe
            col  : columns used for data enhancement
            pars : (dict - optional) contains:
                n_samples      : (int - optional) number of samples to add, default is 10%
                primary_key    : (String - optional) the primary key of the dataframe
                aggregate      : (boolean - optional) if False, prints per-metric SDV results, else averages them
                path_model_save: saving location if save_model is set to True
                path_model_load: saved model location, to skip training
                path_data_new  : where the new data is saved
    returns:
            df_new : (pandas dataframe) df with the augmented data appended
            col    : (list of strings) same columns
    '''
    n_samples = pars.get('n_samples', max(1, int(
        len(df) * 0.10)))  ## Add 10% or 1 sample by default value
    primary_key = pars.get('colid', None)  ### Custom can be created on the fly
    metrics_type = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name = pars.get('model_name', "TVAE")

    # importing libraries
    try:
        #from sdv.demo import load_tabular_demo
        from sdv.tabular import TVAE
        from sdv.tabular import CTGAN
        from sdv.timeseries import PAR
        from sdv.evaluation import evaluate
        import ctgan

        if ctgan.__version__ != '0.3.1.dev0':
            raise Exception('ctgan outdated, updating...')
    except:
        os.system("pip install sdv")
        os.system('pip install ctgan==0.3.1.dev0')
        from sdv.tabular import TVAE
        from sdv.tabular import CTGAN
        from sdv.timeseries import PAR
        from sdv.evaluation import evaluate

    # model fitting
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')

        model = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model(entity_columns=pars['entity_columns'],
                          context_columns=pars['context_columns'],
                          sequence_index=pars['sequence_index'])
        else:
            model = model(primary_key=primary_key)
        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except:
            log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd(new_data, n=7)

    log('######### Evaluation Results #########')
    if metrics_type:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data (DataFrame.append was removed in pandas 2.x)
    df_new = pd.concat((df, new_data))
    log(str(len(df_new) - len(df)) + ' new rows added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### augmented data saved on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col
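A minimal usage sketch for pd_augmentation_sdv, assuming the legacy sdv 0.x tabular API imported above and the project's log/save/load helpers; the dataframe, column names and parameter values are illustrative only:

# Hypothetical usage sketch (not part of the original pipeline)
import pandas as pd

df_demo = pd.DataFrame({
    'id'    : range(100),
    'age'   : [20 + i % 40 for i in range(100)],
    'salary': [30000 + 100 * i for i in range(100)],
})

pars_demo = {
    'n_samples' : 20,        # generate 20 synthetic rows
    'colid'     : 'id',      # primary key handed to the SDV model
    'model_name': 'TVAE',    # or 'CTGAN'
    'aggregate' : True,      # log a single averaged SDV score
}

df_aug, cols = pd_augmentation_sdv(df_demo, col=list(df_demo.columns), pars=pars_demo)
print(len(df_aug))           # 100 original rows + 20 synthetic rows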
Exemple #26
0
def train(model_dict, dfX, cols_family, post_process_fun):
    """  Train the model using model_dict, save model, save prediction
    :param model_dict:  dict containing params
    :param dfX:  pd.DataFrame
    :param cols_family: dict of list containing column names
    :param post_process_fun:
    :return: dfXtrain , dfXval  DataFrame containing prediction.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict[
        'compute_pars']
    data_pars = model_dict['data_pars']
    model_name, model_path = model_pars['model_class'], model_dict[
        'global_pars']['path_train_model']
    metric_list = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation ##################################################"
        )
    log2(dfX.shape)
    dfX = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival = int(0.8 * len(dfX))
    colsX = data_pars['cols_model']
    coly = data_pars['coly']
    log2('Model colsX', colsX)
    log2('Model coly', coly)
    log2('Model column type: ', data_pars['cols_model_type2'])

    ### Parameters-only copy of data_pars (no data attached)
    data_pars_ref = copy.deepcopy(data_pars)

    #### TODO : lazy dict to support large datasets
    data_pars['data_type'] = 'ram'
    data_pars['train'] = {
        'Xtrain': dfX[colsX].iloc[:itrain, :],
        'ytrain': dfX[coly].iloc[:itrain],
        'Xtest': dfX[colsX].iloc[itrain:ival, :],
        'ytest': dfX[coly].iloc[itrain:ival],
        'Xval': dfX[colsX].iloc[ival:, :],
        'yval': dfX[coly].iloc[ival:],
    }

    log("#### Init, Train ############################################################"
        )
    # from config_model import map_model
    modelx = map_model(model_name)
    log2(modelx)
    modelx.reset()
    ###  data_pars_ref has NO data.
    modelx.init(model_pars, data_pars=data_pars_ref, compute_pars=compute_pars)

    ### Using actual data stored in data_pars['train']
    modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################"
        )
    ypred, ypred_proba = modelx.predict(dfX[colsX],
                                        data_pars=data_pars_ref,
                                        compute_pars=compute_pars)

    dfX[coly + '_pred'] = ypred  # y_norm(ypred, inverse=True)

    dfX[coly] = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly +
                              '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:  ### No proba
        ypred_proba_val = None

    elif len(ypred_proba.shape) <= 1:  #### Single dim proba
        ypred_proba_val = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba

    elif len(ypred_proba.shape) > 1:  ## Multiple proba columns
        from util_feature import np_conv_to_one_col
        ypred_proba_val = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(
            ypred_proba, ";")  ### merge probas into one string "p1;p2;p3"
        log(dfX.head(3).T)

    log2("Actual    : ", dfX[coly])
    log2("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics ###############################################################"
        )
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue=dfX[coly].iloc[ival:],
                                ypred=dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns #############################################"
        )
    log2(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly, model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model,            ###############################################"
        )
    log2(modelx.model.model_pars, modelx.model.compute_pars)
    modelx = map_model(model_name)
    modelx.load_model(model_path)
    log("Reload model pars", modelx.model.model_pars)
    log2("Reload model", modelx.model)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[
        ival:, :].reset_index(), stats
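A hedged sketch of the model_dict structure that train() reads above (model_pars, compute_pars, data_pars, global_pars); the model class, metric names, column names and path are placeholders, not values prescribed by the source:

# Hypothetical model_dict skeleton, inferred from the keys accessed in train()
model_dict_demo = {
    'model_pars'  : {'model_class': 'LGBMRegressor'},           # resolved through map_model()
    'compute_pars': {'metric_list': ['rmse', 'mae']},            # forwarded to metrics_eval()
    'data_pars'   : {
        'cols_model'      : ['yearsExperience', 'milesFromMetropolis'],
        'coly'            : 'salary',
        'cols_model_type2': {'colnum': ['yearsExperience', 'milesFromMetropolis']},
    },
    'global_pars' : {'path_train_model': 'data/output/ztmp/model/'},
}

# dfXtrain, dfXval, stats = train(model_dict_demo, dfX, cols_family, post_process_fun=lambda y: y)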
Exemple #27
0
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
               preprocess_pars={}, filter_pars={}, path_features_store=None):
    """
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly            = cols_group['coly']  # 'salary'
    colid           = cols_group['colid']  # "jobId"
    colcat          = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum          = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    
    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default    = [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot',  'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
    pipe_list       = preprocess_pars.get('pipe_list', pipe_default)
    if 'dfdate' not in pipe_list : pipe_list.append('dfdate')   ### date processing always runs
    pipe_list_pars  = preprocess_pars.get('pipe_pars', [])



    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample= n_sample)

    ##### Filtering / cleaning rows :   #########################################################
    if "filter" in pipe_list :
        def isfloat(x):
            try :
                float(x)
                return 1
            except Exception :
                return 0
        ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0)
        log(coly)
        df['_isfloat'] = df[ coly ].apply(lambda x : isfloat(x))
        log(df['_isfloat'])
        df = df[ df['_isfloat'] > 0 ]     ### keep only rows where the target parses as float
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]


    ##### Label processing   ####################################################################
    y_norm_fun = None
    if "label" in pipe_list :
        # Target coly processing, Normalization process  , customize by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )
            save_features(df[coly], 'dfy', path_features_store)


    ########### colnum procesing   #############################################################
    for x in colnum:
        log('colnum cast to float:', x)
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)


    if "dfnum" in pipe_list :
        pass


    if "dfnum_norm" in pipe_list :
        log("### colnum normalize  ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = { 'pipe_list': [ {'name': 'fillna', 'naval' : 0.0 }, {'name': 'minmax'} ]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum,  pars=pars, suffix = "_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)


    if "dfnum_bin" in pipe_list :
        log("### colnum Map numerics to Category bin  ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None,
                                                   bins=10, suffix="_bin", method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Renaming column_bin with suffix
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_binmap', path_features_store)


    if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list  :
        log("### colnum bin to One Hot")
        dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                    colonehot=None, return_val="dataframe,param")
        log(colnum_onehot)
        save_features(dfnum_hot, 'dfnum_onehot', path_features_store)


    ##### Colcat processing   ################################################################
    colcat_map = pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if "dfcat_hot" in pipe_list :
        log("#### colcat to onehot")
        dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat,
                                                    colonehot=None, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        save_features(dfcat_hot, 'dfcat_onehot', path_features_store)



    if "dfcat_bin" in pipe_list :
        log("#### Colcat to integer encoding ")
        dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat,
                                                    colcat_map=None, suffix="_int")
        colcat_bin = list(dfcat_bin.columns)
        save_features(dfcat_bin, 'dfcat_bin', path_features_store)

    if "dfcross_hot" in pipe_list :
        log("#####  Cross Features From OneHot Features   ######################################")
        try :
           df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
        except :
           df_onehot = copy.deepcopy(dfcat_hot)

        colcross_single_onehot_select = []
        for t in list(df_onehot) :
            for c1 in colcross_single :
                if c1 in t :
                   colcross_single_onehot_select.append(t)

        df_onehot = df_onehot[colcross_single_onehot_select ]
        dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                               pct_threshold=0.02,  m_combination=2)
        log(dfcross_hot.head(2).T)
        colcross_pair_onehot = list(dfcross_hot.columns)
        save_features(dfcross_hot, 'dfcross_onehot', path_features_store)
        del df_onehot ,colcross_pair_onehot

    

    if "dftext" in pipe_list :
        log("##### Coltext processing   ###############################################################")
        stopwords = nlp_get_stopwords()
        pars      = {'n_token' : 100 , 'stopwords': stopwords}
        dftext    = None
        
        for coltext_i in coltext :
            
            ##### Run the text processor on each column text  #############################
            dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars )
            dftext   = pd.concat((dftext, dftext_i), axis=1)  if dftext is not None else dftext_i
            save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)

        log(dftext.head(6))
        save_features(dftext, 'dftext', path_features_store)



    if "dfdate" in pipe_list :
        log("##### Coldate processing   #############################################################")
        from utils import util_date
        dfdate = None
        for coldate_i in coldate :
            dfdate_i =  util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" )
            dfdate  = pd.concat((dfdate, dfdate_i), axis=1)  if dfdate is not None else dfdate_i
            save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)
        save_features(dfdate, 'dfdate', path_features_store)
        log(dfdate)


    ###################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['colid',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",  #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns

              'coldate',
              'coltext',

              "coly", "y_norm_fun"
              ]:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None :
           save(t_val, tfile)
           cols_family[t] = t_val


    ######  Merge AlL  #############################################################################
    dfXy = df[colnum + colcat + [coly] ]
    log('dfXy', dfXy.shape)
    for t in [ 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot',
               'dfdate',  'dftext'  ] :
        if t in locals() :
            log('merging', t)
            dfXy = pd.concat((dfXy, locals()[t] ), axis=1)

    save_features(dfXy, 'dfX', path_features_store)
    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy, f'{path_pipeline_export}/colsX.pkl' )
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl' )


    ###### Return values  #########################################################################
    return dfXy, cols_family
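A hedged call sketch for preprocess(), using the column names from the inline comments above; the file paths and filter bounds are placeholders:

# Hypothetical call sketch for preprocess(); paths and bounds are illustrative
cols_group_demo = {
    'coly'    : 'salary',
    'colid'   : 'jobId',
    'colcat'  : ['companyId', 'jobType', 'degree', 'major', 'industry'],
    'colnum'  : ['yearsExperience', 'milesFromMetropolis'],
    'colcross': ['jobType', 'degree'],    # single columns used to build cross features
    'coltext' : [],
    'coldate' : [],
}

dfXy, cols_family = preprocess(path_train_X='data/input/train/features.parquet',
                               path_train_y='data/input/train/target.parquet',
                               path_pipeline_export='data/output/pipeline/',
                               cols_group=cols_group_demo,
                               n_sample=5000,
                               preprocess_pars={},          # falls back to pipe_default above
                               filter_pars={'ymin': 0, 'ymax': 500000},
                               path_features_store='data/output/features_store/')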
Exemple #28
0
def pd_colnum_quantile_norm(df, col, pars={}):
    """
     colnum normalization by quantile
  """
    prefix = "colnum_quantile_norm"
    df = df[col]
    num_col = col

    ##### Grab previous computed params  ################################################
    pars2 = {}
    if 'path_pipeline' in pars:  #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    lower_bound_sparse = pars2.get('lower_bound_sparse', None)
    upper_bound_sparse = pars2.get('upper_bound_sparse', None)
    lower_bound = pars2.get('lower_bound', None)   ### fix: was reading the *_sparse keys
    upper_bound = pars2.get('upper_bound', None)
    sparse_col = pars2.get('colsparse', ['capital-gain', 'capital-loss'])

    ####### Find IQR and apply it to numerical and sparse columns separately ##########
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.25) < df[col].mode()[0]:  #Unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)

            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.75) < df[col].mode()[0]:  #Unexpected case
                upper_bound_sparse = df[col].mode()[0]

            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) |
                                (df[col] > upper_bound_sparse)][col])

            if n_outliers > 0:
                df.loc[df[col] < lower_bound_sparse,
                       col] = lower_bound_sparse * 0.75  #--> MAIN DF CHANGED
                df.loc[df[col] > upper_bound_sparse,
                       col] = upper_bound_sparse * 1.25  # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound,
                               df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound,
                               df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new = {
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'lower_bound_sparse': lower_bound_sparse,
        'upper_bound_sparse': upper_bound_sparse
    }
    dfnew = df
    model = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars
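A minimal sketch of calling the quantile-clipping transform above on a toy dataframe; the column names and data are illustrative, and 'capital-gain' is used only because it matches the default 'colsparse' list:

# Hypothetical usage sketch for pd_colnum_quantile_norm
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({
    'yearsExperience': np.random.randint(0, 40, 1000),
    'capital-gain'   : np.random.choice([0, 0, 0, 5000, 99999], 1000),  # sparse-looking column
})

dfnew, col_pars = pd_colnum_quantile_norm(df_demo,
                                          col=['yearsExperience', 'capital-gain'],
                                          pars={})
print(col_pars['cols_new'])   # {'colnum_quantile_norm': ['yearsExperience_qt_norm', 'capital-gain_qt_norm']}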
Exemple #29
0
def text_preprocess(path_train_X="",
                    path_train_y="",
                    path_pipeline_export="",
                    cols_group=None,
                    n_sample=5000,
                    preprocess_pars={},
                    filter_pars={},
                    path_features_store=None):
    """

    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot,
                              pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### column names for feature generation ###############################################
    log(cols_group)
    coly = cols_group['coly']  # 'salary'
    colid = cols_group['colid']  # "jobId"
    colcat = cols_group[
        'colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross',
                                     [])  ### List of single columns
    coltext = cols_group.get('coltext', [])
    coldate = cols_group.get('coldate', [])
    colall = colnum + colcat + coltext + coldate
    log(colall)

    ##### Load data ########################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    log("##### Coltext processing   ###############################################################"
        )
    from utils import util_text, util_model

    ### Remove common words  #############################################
    import json
    import string
    punctuations = string.punctuation
    stopwords = json.load(open("stopwords_en.json"))["word"]
    stopwords = [t for t in string.punctuation] + stopwords
    stopwords = ["", " ", ",", ".", "-", "*", '€', "+", "/"] + stopwords
    stopwords = list(set(stopwords))
    stopwords.sort()
    log(stopwords)
    stopwords = set(stopwords)

    def pipe_text(df, col, pars={}):
        ntoken = pars['n_token']
        df = df.fillna("")
        dftext = df
        log(dftext)
        log(col)
        list1 = []
        list1.append(col)

        # fromword = [ r"\b({w})\b".format(w=w)  for w in fromword    ]
        # print(fromword)
        for col_n in list1:
            dftext[col_n] = dftext[col_n].fillna("")
            dftext[col_n] = dftext[col_n].str.lower()
            dftext[col_n] = dftext[col_n].apply(
                lambda x: x.translate(str.maketrans('', '', string.punctuation)))  ### strip punctuation
            dftext[col_n] = dftext[col_n].apply(
                lambda x: x.translate(str.maketrans('', '', string.digits)))       ### strip digits
            dftext[col_n] = dftext[col_n].apply(
                lambda x: re.sub("[!@,#$+%*:()'-]", " ", x))

            dftext[col_n] = dftext[col_n].apply(
                lambda x: coltext_stopwords(x, stopwords=stopwords))

        log(dftext.head(6))

        sep = " "
        """
        :param df:
        :param coltext:  text where word frequency should be extracted
        :param nb_to_show:
        :return:
        """
        coltext_freq = df[col].apply(
            lambda x: pd.value_counts(x.split(sep))).sum(axis=0).reset_index()
        coltext_freq.columns = ["word", "freq"]
        coltext_freq = coltext_freq.sort_values("freq", ascending=0)
        log(coltext_freq)

        word_tokeep = coltext_freq["word"].values[:ntoken]
        word_tokeep = [t for t in word_tokeep if t not in stopwords]

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext,
            coltext=col,
            word_minfreq=1,
            word_tokeep=word_tokeep,
            return_val="dataframe,param")

        log(word_tokeep_dict)
        ###  Dimension reduction for the sparse TF-IDF matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict,
            colname=None,
            model_pretrain=None,
            colprefix=col + "_svd",
            method="svd",
            dimpca=2,
            return_val="dataframe,param")
        return dftext_svd_list

    pars = {'n_token': 100}
    dftext1 = None
    for coltext_i in coltext:
        dftext_i = pipe_text(df[[coltext_i]], coltext_i, pars)
        save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)
        dftext1 = pd.concat(
            (dftext1, dftext_i), axis=1) if dftext1 is not None else dftext_i
    log(dftext1.head(6))
    dftext1.to_csv(os.path.join(path_features_store, "dftext.csv"), index=False)

    ##################################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['coltext']:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None:
            save(t_val, tfile)
            cols_family[t] = t_val

    return dftext1, cols_family
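For reference, a standalone sketch of the same TF-IDF plus truncated-SVD idea used in pipe_text() above, written with scikit-learn instead of the project's util_text / util_model helpers; it is an assumption-labelled equivalent, not the original implementation, and the text and column names are invented:

# Hedged scikit-learn equivalent of the TF-IDF + SVD step in pipe_text()
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

texts = pd.Series(["senior data engineer python spark",
                   "junior web developer javascript",
                   "machine learning engineer python"])

tfidf = TfidfVectorizer(max_features=100, stop_words='english')   # keep the top 100 tokens, like n_token
X = tfidf.fit_transform(texts)                                    # sparse TF-IDF matrix

svd = TruncatedSVD(n_components=2, random_state=0)                # 2 components, like dimpca=2 above
X_svd = svd.fit_transform(X)

dftext_svd = pd.DataFrame(X_svd, columns=['description_svd_0', 'description_svd_1'])
print(dftext_svd.shape)   # (3, 2)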
Exemple #30
0
def train(model_dict, dfX, cols_family, post_process_fun):
    """  Train the model using model_dict, save model, save prediction
    :param model_dict:  dict containing params
    :param dfX:  pd.DataFrame
    :param cols_family: dict of list containing column names
    :param post_process_fun:
    :return: dfXtrain , dfXval  DataFrame containing prediction.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict[
        'compute_pars']
    data_pars = model_dict['data_pars']
    model_name, model_path = model_pars['model_class'], model_dict[
        'global_pars']['path_train_model']
    metric_list = compute_pars['metric_list']

    log("#### Data preparation #########################################################"
        )
    log(dfX.shape)
    dfX = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival = int(0.8 * len(dfX))
    colsX = data_pars['cols_model']
    coly = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly', coly)

    data_pars['data_type'] = 'ram'
    data_pars['train'] = {
        'Xtrain': dfX[colsX].iloc[:itrain, :],
        'ytrain': dfX[coly].iloc[:itrain],
        'Xtest': dfX[colsX].iloc[itrain:ival, :],
        'ytest': dfX[coly].iloc[itrain:ival],
        'Xval': dfX[colsX].iloc[ival:, :],
        'yval': dfX[coly].iloc[ival:],
    }

    log("#### Init, Train ############################################################"
        )
    # from config_model import map_model
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)

    if 'optuna' in model_name:
        modelx.fit(data_pars, compute_pars)
        # No need anymore
        # modelx.model.model_pars['optuna_model'] = modelx.fit(data_pars, compute_pars)
    else:
        modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################"
        )
    ypred, ypred_proba = modelx.predict(dfX[colsX], compute_pars=compute_pars)

    dfX[coly + '_pred'] = ypred  # y_norm(ypred, inverse=True)

    dfX[coly] = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly +
                              '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:
        ypred_proba_val = None

    elif len(ypred_proba.shape) <= 1:
        ypred_proba_val = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba

    elif len(ypred_proba.shape) > 1:
        from util_feature import np_conv_to_one_col
        ypred_proba_val = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(
            ypred_proba, ";")  ### merge into string "p1,p2,p3,p4"
        log(dfX.head(3).T)

    log("Actual    : ", dfX[coly])
    log("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics #############################################################"
        )
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue=dfX[coly].iloc[ival:],
                                ypred=dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns ###########################################"
        )
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly, model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model,            ############################################"
        )
    log(modelx.model.model_pars, modelx.model.compute_pars)
    a = load(model_path + "/model.pkl")
    log("Reload model pars", a.model_pars)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[ival:, :].reset_index()