Beispiel #1
0
def pd_col_myfun(df=None, col=None, pars=None):
    """
    Example of custom Processor: copy the selected columns and add a "_myfun" suffix.

    Args:
        df: input pandas DataFrame.
        col: a column name, or a list of column names, to process.
        pars: optional dict of settings. Keys read by this function:
            'path_pipeline'        : inference time, folder of a previous export to reload.
            'path_features_store' and 'path_pipeline_export' : when both are present,
                                     the processor objects are saved under
                                     'path_pipeline_export'.

    Returns:
        (df_new, col_pars):
            df_new   : new DataFrame holding the selected columns, renamed "<name>_myfun".
            col_pars : dict with 'prefix', 'path' (export path, else pipeline path,
                       else None) and 'cols_new' = {'col_myfun': [new column names]}.
    """
    pars = {} if pars is None else pars

    # NOTE(review): the trailing backtick looks like a typo, but it is part of the
    # exported file names, so it is kept to stay compatible with existing exports.
    prefix = 'col_myfun`'

    prepro = None
    pars_new = None
    if 'path_pipeline' in pars:  #### Inference time LOAD previous pars
        # Imported only when needed so the transform itself has no dependency
        # on the persistence helpers.
        from source.util_feature import load
        prepro = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        # Kept separate from `pars`: overwriting the caller settings would drop
        # 'path_pipeline' and make the returned 'path' None.
        pars_new = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    #### Do something #################################################################
    cols = [col] if isinstance(col, str) else list(col)
    df_new = df[cols].copy()  ### Do nothing, only rename (the input frame is untouched)
    # Rename from the selected columns, not from every column of `df`
    # (the latter fails with a length mismatch when `col` is a subset).
    df_new.columns = [name + "_myfun" for name in df_new.columns]
    cols_new = list(df_new.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save
        save(prepro, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(cols_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        'col_myfun': cols_new  ### list
    }
    return df_new, col_pars
Beispiel #2
0
def pd_col_myfun(df=None, col=None, pars=None):
    """
    Example of custom Processor: copy the selected columns and add a "_myfun" suffix.

    Args:
        df: input pandas DataFrame.
        col: a column name, or a list of column names, to process.
        pars: optional dict of settings. Keys read by this function:
            "path_pipeline"        : inference time, folder of a previous export to reload.
            "path_features_store" and "path_pipeline_export" : when both are present,
                                     the processor objects are saved under
                                     "path_pipeline_export".

    Returns:
        (df_new, col_pars):
            df_new   : new DataFrame holding the selected columns, renamed "<name>_myfun".
            col_pars : dict with "prefix", "path" (export path, else pipeline path,
                       else None) and "cols_new" = {"col_myfun": [new column names]}.
    """
    pars = {} if pars is None else pars

    # NOTE(review): the trailing backtick looks like a typo, but it is part of the
    # exported file names, so it is kept to stay compatible with existing exports.
    prefix = "col_myfun`"

    prepro = None
    pars_new = None
    if "path_pipeline" in pars:  #### Inference time LOAD previous pars
        # Imported only when needed so the transform itself has no dependency
        # on the persistence helpers.
        from source.util_feature import load
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        # Kept separate from `pars`: overwriting the caller settings would drop
        # "path_pipeline" and make the returned "path" None.
        pars_new = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")

    #### Do something #################################################################
    cols = [col] if isinstance(col, str) else list(col)
    df_new = df[cols].copy()  ### Do nothing, only rename (the input frame is untouched)
    # Rename from the selected columns, not from every column of `df`
    # (the latter fails with a length mismatch when `col` is a subset).
    df_new.columns = [name + "_myfun" for name in df_new.columns]
    cols_new = list(df_new.columns)

    ###################################################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        from source.util_feature import save
        save(prepro, pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export",
                         pars.get("path_pipeline", None))
    }
    col_pars["cols_new"] = {
        "col_myfun": cols_new  ### list
    }
    return df_new, col_pars
def _stored_bound(stored, column):
    """Return the saved bound for `column`, or None when nothing was saved.

    `stored` is either a dict keyed by column name (format written by this
    version) or a single scalar shared by all columns (older exports).
    """
    if isinstance(stored, dict):
        return stored.get(column)
    return stored


def pd_colnum_quantile_norm(df, col, pars=None):
    """
    colnum normalization by quantile: pull outliers of each numerical column
    back towards quantile-based bounds.

    Regular columns use the IQR rule (Q1 - 1.5*IQR, Q3 + 1.5*IQR). Columns listed
    as sparse (dominated by one value) use bounds derived from their mode and the
    quartiles of the non-mode values. Values above the upper bound become
    1.25 * upper bound, values below the lower bound become 0.75 * lower bound.
    Bounds are computed and stored per column.

    Args:
        df: input pandas DataFrame.
        col: a column name, or a list of column names, to normalize.
        pars: optional dict of settings. Keys read by this function:
            'colsparse'     : list of sparse column names
                              (default ['capital-gain', 'capital-loss']).
            'path_pipeline' : inference time, folder of a previous export whose
                              bounds are reused instead of being recomputed.
            'path_features_store' and 'path_pipeline_export' : when both are
                              present, column list / bounds / model are saved
                              under 'path_pipeline_export'.

    Returns:
        (dfnew, col_pars):
            dfnew    : new DataFrame with the processed columns, renamed "<name>_qt_norm".
            col_pars : dict with 'prefix', 'path' (export path, else pipeline path,
                       else None) and 'cols_new' = {prefix: [new column names]}.
    """
    import numpy as np

    pars = {} if pars is None else pars
    prefix = "colnum_quantile_norm"
    num_col = [col] if isinstance(col, str) else list(col)
    df = df[num_col].copy()  # work on a private copy, never on the caller's frame

    ##### Grab previous computed params  ################################################
    pars2 = {}
    if 'path_pipeline' in pars:
        # Imported only when needed so the transform itself has no dependency
        # on the persistence helpers.
        from source.util_feature import load
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')
        pars2 = {} if pars2 is None else pars2

    # Caller setting first, then the value saved at training time, then the default.
    sparse_col = pars.get('colsparse',
                          pars2.get('colsparse', ['capital-gain', 'capital-loss']))
    sparse_col = [] if sparse_col is None else sparse_col

    saved_lower = pars2.get('lower_bound', None)
    saved_upper = pars2.get('upper_bound', None)
    saved_lower_sparse = pars2.get('lower_bound_sparse', None)
    saved_upper_sparse = pars2.get('upper_bound_sparse', None)

    ########### Compute #################################################################
    lower_bound, upper_bound = {}, {}
    lower_bound_sparse, upper_bound_sparse = {}, {}

    for name in num_col:
        values = df[name]

        if name in sparse_col:
            low = _stored_bound(saved_lower_sparse, name)
            high = _stored_bound(saved_upper_sparse, name)

            if low is None or high is None:
                mode_value = values.mode()[0]
                non_mode = values[values != mode_value]

                if low is None:
                    q25 = non_mode.quantile(0.25)
                    # Unexpected case: first quartile of the non-mode values below the mode
                    low = q25 if q25 < mode_value else mode_value

                if high is None:
                    q75 = non_mode.quantile(0.75)
                    # Unexpected case: third quartile of the non-mode values below the mode
                    high = mode_value if q75 < mode_value else q75

            lower_bound_sparse[name] = low
            upper_bound_sparse[name] = high

            n_outliers = int(((values < low) | (values > high)).sum())
            if n_outliers > 0:
                # Series.mask upcasts when needed, unlike in-place .loc assignment
                # of a float into an integer column.
                df[name] = df[name].mask(df[name] < low, low * 0.75)
                df[name] = df[name].mask(df[name] > high, high * 1.25)

        else:
            low = _stored_bound(saved_lower, name)
            high = _stored_bound(saved_upper, name)

            if low is None or high is None:
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                low = q1 - 1.5 * iqr
                high = q3 + 1.5 * iqr

            lower_bound[name] = low
            upper_bound[name] = high

            df[name] = np.where(df[name] > high, 1.25 * high, df[name])
            df[name] = np.where(df[name] < low, 0.75 * low, df[name])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new = {'lower_bound': lower_bound, 'upper_bound': upper_bound,
                'lower_bound_sparse': lower_bound_sparse,
                'upper_bound_sparse': upper_bound_sparse,
                'colsparse': list(sparse_col)}
    dfnew = df
    model = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        from source.util_feature import save
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get("path_pipeline", None))}
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars