Example No. 1
0
def pd_col_myfun(df=None, col=None, pars=None):
    """Example of a custom feature processor.

    :param df: input DataFrame.
    :param col: list of column names to process.
    :param pars: optional dict of processor parameters; may contain
        'path_pipeline' (inference: reload previously exported pars) and
        'path_features_store' / 'path_pipeline_export' (training: export artifacts).
    :return: (df_new, col_pars) — the generated columns and a metadata dict
        with 'prefix', 'path' and 'cols_new'.
    """
    pars = {} if pars is None else pars  # BUGFIX: avoid shared mutable default argument
    # NOTE(review): the trailing backtick looks like a typo, but it is embedded in the
    # exported artifact filenames — kept for compatibility with existing pipelines.
    prefix = "col_myfun`"
    if "path_pipeline" in pars:  #### Inference time: LOAD previously exported pars
        from source.util_feature import load  # deferred: only needed on this path
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        pars = {} if pars is None else pars
    #### Do something #################################################################
    df_new = df[col]  ### identity transform: just select the requested columns
    # BUGFIX: rename based on df_new's own columns (the original iterated df.columns,
    # which raises a length mismatch when `col` is a strict subset of df's columns,
    # and also shadowed the `col` parameter in the comprehension).
    df_new.columns = [c + "_myfun" for c in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None    # no fitted model object for this identity processor
    pars_new = None

    ###################################################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        from source.util_feature import save  # deferred: only needed on this path
        save(prepro, pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export",
                         pars.get("path_pipeline", None))
    }
    col_pars["cols_new"] = {
        "col_myfun": cols_new  ### list of generated column names
    }
    return df_new, col_pars
Example No. 2
0
def pd_col_myfun(df=None, col=None, pars=None):
    """Example of a custom feature processor.

    :param df: input DataFrame.
    :param col: list of column names to process.
    :param pars: optional dict of processor parameters; may contain
        'path_pipeline' (inference: reload previously exported pars) and
        'path_features_store' / 'path_pipeline_export' (training: export artifacts).
    :return: (df_new, col_pars) — the generated columns and a metadata dict
        with 'prefix', 'path' and 'cols_new'.
    """
    pars = {} if pars is None else pars  # BUGFIX: avoid shared mutable default argument
    # NOTE(review): the trailing backtick looks like a typo, but it is embedded in the
    # exported artifact filenames — kept for compatibility with existing pipelines.
    prefix = 'col_myfun`'
    if 'path_pipeline' in pars:  #### Inference time: LOAD previously exported pars
        from source.util_feature import load  # deferred: only needed on this path
        prepro = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        pars = {} if pars is None else pars
    #### Do something #################################################################
    df_new = df[col]  ### identity transform: just select the requested columns
    # BUGFIX: rename based on df_new's own columns (the original iterated df.columns,
    # which raises a length mismatch when `col` is a strict subset of df's columns,
    # and also shadowed the `col` parameter in the comprehension).
    df_new.columns = [c + "_myfun" for c in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None    # no fitted model object for this identity processor
    pars_new = None

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save  # deferred: only needed on this path
        save(prepro, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(cols_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        'col_myfun': cols_new  ### list of generated column names
    }
    return df_new, col_pars
Example No. 3
0
def register(run_name,
             params,
             metrics,
             signature,
             model_class,
             tracking_uri="sqlite:///local.db"):
    """Log a trained sklearn model with its params and metrics to MLflow
    and register it in the model registry.

    :param run_name: name of the MLflow run, also used in the registered model name.
    :param params: dict of model params; must contain 'path_train_model'
        pointing at the directory holding "model.pkl".
    :param metrics: DataFrame-like with 'metric_name' and 'metric_val' columns;
        one MLflow metric is logged per row.
    :param signature: MLflow signature describing model input/output schema.
    :param model_class: model class label appended to the registered model name.
    :param tracking_uri: MLflow tracking URI (defaults to a local sqlite db).
    :return: None
    """
    mlflow.set_tracking_uri(tracking_uri)
    with mlflow.start_run(run_name=run_name) as run:
        run_id = run.info.run_id  # FIX: run_uuid is a deprecated alias of run_id
        experiment_id = run.info.experiment_id

        # reload the trained sklearn model from disk
        sk_model = load(params['path_train_model'] + "/model.pkl")
        mlflow.log_params(params)

        metrics.apply(lambda x: mlflow.log_metric(x.metric_name, x.metric_val),
                      axis=1)

        mlflow.sklearn.log_model(sk_model,
                                 run_name,
                                 signature=signature,
                                 registered_model_name="sklearn_" + run_name +
                                 "_" + model_class)

        log("MLFLOW identifiers", run_id, experiment_id)
    # BUGFIX: removed the redundant mlflow.end_run() — the `with` block
    # already ends the run on exit.
Example No. 4
0
def preprocess_load(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
                    preprocess_pars=None, filter_pars=None, path_features_store=None):
    """Reload pre-computed features (and labels, if present) from the feature store.

    :param path_train_X: unused here; kept for interface compatibility.
    :param path_train_y: unused here; kept for interface compatibility.
    :param path_pipeline_export: directory holding the exported 'cols_family.pkl'.
    :param cols_group: dict with key 'colid' naming the join-key column(s) for labels.
    :param n_sample: unused here; kept for interface compatibility.
    :param preprocess_pars: unused here; kept for interface compatibility.
    :param filter_pars: unused here; kept for interface compatibility.
    :param path_features_store: root directory of the feature store
        (expects dfX/features.parquet and optionally dfy/features.parquet).
    :return: (dfXy, cols_family) — features (joined with labels when available)
        and the reloaded column-family dict.
    """
    from source.util_feature import load

    # BUGFIX: avoid shared mutable default arguments
    preprocess_pars = {} if preprocess_pars is None else preprocess_pars
    filter_pars = {} if filter_pars is None else filter_pars

    dfXy = pd.read_parquet(path_features_store + "/dfX/features.parquet")

    try:
        dfy = pd.read_parquet(path_features_store + "/dfy/features.parquet")
        dfXy = dfXy.join(dfy, on=cols_group['colid'], how="left")
    except Exception:  # BUGFIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        # best-effort: labels are optional at inference time
        log('Error no label', path_features_store + "/dfy/features.parquet")

    cols_family = load(f'{path_pipeline_export}/cols_family.pkl')

    return dfXy, cols_family
def pd_colnum_quantile_norm(df, col, pars=None):
    """Normalize numeric columns by clipping outliers to quantile-based bounds.

    Regular columns are clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; outliers are
    mapped to 0.75 * lower_bound / 1.25 * upper_bound. Sparse columns (listed
    in pars2['colsparse'], default ['capital-gain', 'capital-loss']) get bounds
    derived from their non-mode values. Output columns get a '_qt_norm' suffix.

    :param df: input DataFrame.
    :param col: list of numeric column names to normalize.
    :param pars: optional dict; may contain 'path_pipeline' (inference: reload
        previously fitted bounds) and 'path_features_store' /
        'path_pipeline_export' (training: export fitted bounds).
    :return: (dfnew, col_pars) — normalized DataFrame and a metadata dict with
        'prefix', 'path' and 'cols_new'.
    """
    import pandas as pd, numpy as np

    pars = {} if pars is None else pars  # BUGFIX: avoid shared mutable default argument
    prefix = "colnum_quantile_norm"
    df = df[col]
    num_col = col

    ##### Grab previously computed params ###########################################
    pars2 = {}
    if 'path_pipeline' in pars:   #### Load existing fitted params (inference)
        from source.util_feature import load  # deferred: only needed on this path
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    ########### Compute #############################################################
    lower_bound_sparse = pars2.get('lower_bound_sparse', None)
    upper_bound_sparse = pars2.get('upper_bound_sparse', None)
    # BUGFIX: these previously read the '*_sparse' keys, so the non-sparse bounds
    # saved below under 'lower_bound'/'upper_bound' were never restored at inference.
    lower_bound = pars2.get('lower_bound', None)
    upper_bound = pars2.get('upper_bound', None)
    sparse_col = pars2.get('colsparse', ['capital-gain', 'capital-loss'])

    ####### Find IQR; handle numeric and sparse columns separately ##################
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            # bounds are fitted on the non-mode values of the sparse column
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass  # already fitted (inference)
            elif df_nosparse[col].quantile(0.25) < df[col].mode()[0]:  # unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)
            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass  # already fitted (inference)
            elif df_nosparse[col].quantile(0.75) < df[col].mode()[0]:  # unexpected case
                upper_bound_sparse = df[col].mode()[0]
            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) | (df[col] > upper_bound_sparse)][col])

            if n_outliers > 0:
                # clip outliers in place (mutates the working copy `df`)
                df.loc[df[col] < lower_bound_sparse, col] = lower_bound_sparse * 0.75
                df.loc[df[col] > upper_bound_sparse, col] = upper_bound_sparse * 1.25

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound, df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound, df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new = {'lower_bound': lower_bound, 'upper_bound': upper_bound,
                'lower_bound_sparse': lower_bound_sparse,
                'upper_bound_sparse': upper_bound_sparse}
    dfnew = df
    model = None  # no fitted model object beyond the exported bounds
    colnew = list(df.columns)

    ##### Export ####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save  # deferred: only needed on this path
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix, 'path': pars.get('path_pipeline_export', pars.get("path_pipeline", None))}
    col_pars['cols_new'] = {
        prefix: colnew  ### list of generated column names
    }
    return dfnew, col_pars