import copy
import datetime
import time
import warnings
from copy import deepcopy
from typing import Optional, Tuple

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def pipeline(df: pd.DataFrame,
             config: Config,
             train_csv: str = None,
             test_csv: str = None,
             prediction_csv: str = None) -> Tuple[pd.DataFrame, Optional[np.float64]]:
    if config.is_train():
        config['stages'] = {}
    for idx, stage in enumerate(config['graph']):
        if len(stage) == 0 or stage[0] is None or stage[0] == '':
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Invalid stage "{0}" in pipeline'.format(stage))
            raise ValueError(config["stage"])

        config["stage"] = stage[0]
        config["stage_nb"] = idx
        if config.is_train():
            config['stages'][config["stage"]] = {}
            config['stages'][config["stage"]]['time'] = 0
        start_time = time.time()

        if stage[0] == 'Start':
            continue
        # elif stage[0] == 'End':
        #     break
        elif stage[0] not in config['params']['pipeline']:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Unknown node "{0}" in pipeline'.format(stage[0]))
            raise ValueError(config["stage"])
        elif config['params']['pipeline'][stage[0]]['node'] not in _node_map:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Unknown node "{0}" in _node_map'.format(
                    config['params']['pipeline'][stage[0]]['node']))
            raise ValueError(config["stage"])

        node = _node_map[config['params']['pipeline'][stage[0]]['node']]
        if node.name == 'read_df':
            if config.is_train():
                df = node(train_csv, config)
            else:
                # predict mode reads the held-out test csv
                df = node(test_csv, config)
        elif 'args' in config['params']['pipeline'][stage[0]] \
                and len(config['params']['pipeline'][stage[0]]['args']) != 0:
            node.function(df, config,
                          **config['params']['pipeline'][stage[0]]['args'])
        else:
            node(df, config)
        stage_time_inc(config, start_time, stage[0])
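# pipeline() above delegates per-stage timing to stage_time_inc(), which is not
# defined in this module. A minimal sketch, assuming it simply accumulates wall
# time into the config['stages'][<stage>]['time'] slot initialised above
# (hypothetical reconstruction, not the original helper):
def stage_time_inc(config: Config, start_time: float, stage: str):
    if config.is_train():
        config['stages'][stage]['time'] += time.time() - start_time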
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        for i in range(10):
            df_sample = df.sample(min(1000, len(df)), random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df
            if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]
            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []
            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = optimize_dataframe(pandas_read_csv(csv_path, config))

    if config.is_train():
        config["nrows"] = len(df)

    return df
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)

    if config.is_train():
        config["nrows"] = len(df)
        config["target_data"] = df['target'].copy()

    return df
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)

    if config.is_train():
        config["nrows_stage_nb"] = 0
        config["nrows"] = len(df)

    return df
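# The read_df() variants above rely on two helpers that are not defined here:
# preview_df(), which samples the csv once to infer per-column dtypes into
# config["dtype"], and pandas_read_csv(), which reads the full file with those
# dtypes applied. A minimal sketch of the latter under those assumptions
# (hypothetical reconstruction, including the "parse_dates" config key):
def pandas_read_csv(csv_path: str, config: Config) -> pd.DataFrame:
    return pd.read_csv(
        csv_path,
        encoding="utf-8",
        low_memory=False,
        dtype=config["dtype"],
        parse_dates=config["parse_dates"] if "parse_dates" in config else [])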
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        for i in range(3):
            config_sample = copy.deepcopy(config)
            df_sample = df.sample(frac=0.05, random_state=i).copy(deep=True)
            df_sample = preprocess_pipeline(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

            df_size_mb = df.drop(
                list(set(df) - set(selected_columns)),
                axis=1, errors='ignore').memory_usage(deep=True).sum() / 1024 / 1024
            if df_size_mb < 2 * 1024:
                break

        selected_columns = list(set(selected_columns))
        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df
            if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in selected_columns
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
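# Both feature_selection() variants call select_features(), which is not
# defined in this module. A minimal sketch under assumptions: fit a small
# LightGBM model on the sample and keep the columns with positive gain, which
# matches how gain importances are used in time_series_detect() below
# (hypothetical reconstruction; the original may apply a different rule):
def select_features(X: pd.DataFrame, y: pd.Series, mode: str) -> list:
    params = {
        "objective": "regression" if mode == "regression" else "binary",
        "verbosity": -1,
        "seed": 1,
    }
    model = lgb.train(params, lgb.Dataset(X, label=y), 30)
    fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
    return fi[fi > 0].index.tolist()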
def fillna(df: pd.DataFrame, config: Config, args: dict = {}):
    if len(args) != 0:
        for k, v in args.items():
            if config.is_train():
                lst_columns = [c for c in df if c.startswith(k)]
                config['stages'][config["stage"]][k] = {
                    'lst_columns': lst_columns
                }
                if len(lst_columns) != 0 and ('agg' in v or 'value' in v):
                    s_fillna_values = calc_columns_metric(
                        df, lst_columns,
                        metric=v['agg'] if 'agg' in v else None,
                        value=v['value'] if 'value' in v else None)
                    config['stages'][config["stage"]][k]['fillna_values'] = \
                        deepcopy(s_fillna_values)
            stage_args = config['stages'][config["stage"]][k]
            # 'fillna_values' is only present when an 'agg' or 'value' rule was
            # supplied at train time, so guard on it before applying
            if len(stage_args['lst_columns']) != 0 and 'fillna_values' in stage_args:
                fillna_columns(df, stage_args['fillna_values'])
    else:
        for c in [c for c in df if c.startswith("number_")]:
            df[c].fillna(-1, inplace=True)
        for c in [c for c in df if c.startswith("string_")]:
            df[c].fillna("", inplace=True)
        for c in [c for c in df if c.startswith("datetime_")]:
            df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True)
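# fillna() above delegates to two helpers that are not defined in this module.
# A minimal sketch under assumed semantics: calc_columns_metric() reduces each
# column to a single fill value (an aggregate such as "mean"/"median", or a
# constant), and fillna_columns() applies those values in place (both bodies
# are hypothetical reconstructions):
def calc_columns_metric(df: pd.DataFrame, lst_columns: list,
                        metric: str = None, value=None) -> pd.Series:
    if metric is not None:
        return df[lst_columns].agg(metric)  # e.g. "mean", "median", "min"
    return pd.Series(value, index=lst_columns)


def fillna_columns(df: pd.DataFrame, fillna_values: pd.Series):
    for c, v in fillna_values.items():
        df[c].fillna(v, inplace=True)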
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)
            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
                .format(df_size_mb, len(df), sample_rows))
            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)
            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
def check_columns_exists(df: pd.DataFrame,
                         config: Config,
                         key_stage: str,
                         drop_columns_test: bool = True):
    field_target_name = config['params']['field_target_name']
    if config.is_train():
        if 'columns_exists' not in config['params']['pipeline'][config["stage"]]:
            config['params']['pipeline'][config["stage"]]['columns_exists'] = {}
        if field_target_name not in df.columns:
            raise ValueError(
                'Column y="{0}" does not exist in the train dataset'.format(
                    field_target_name))
        config['params']['pipeline'][config["stage"]]['columns_exists'][key_stage] = \
            set([x for x in df.columns if x != field_target_name])
    elif 'columns_exists' in config['params']['pipeline'][config["stage"]]:
        if key_stage in config['params']['pipeline'][config["stage"]]['columns_exists']:
            set_columns = config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage] - set(df.columns)
            if len(set_columns) != 0:
                raise ValueError(
                    'Columns "{0}" do not exist in the test dataset on stage {1}'.format(
                        str(set_columns), key_stage))
            set_columns = set(df.columns) - config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage]
            if len(set_columns) != 0:
                if drop_columns_test:
                    df.drop(columns=list(set_columns), inplace=True)
                else:
                    raise ValueError(
                        'Columns "{0}" do not exist in the train dataset on stage {1}'.format(
                            str(set_columns), key_stage))
        else:
            raise ValueError(
                'Preprocess stage "{0}" does not exist'.format(key_stage))
def subsample(df: pd.DataFrame, config: Config):
    if config.is_train():
        # df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        df_size_mb, sample_rows = get_sample_rows(df, config)
        if df_size_mb > config['params']['memory']['max_size_mb']:
            # mem_per_row = df_size_mb / len(df)
            # sample_rows = int(config['params']['memory']['max_size_mb'] / mem_per_row)
            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows."
                .format(df_size_mb, len(df), sample_rows), config.verbose)
            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)
            config["nrows"] = sample_rows
        elif config["nrows_stage_nb"] == 0:
            config["nrows"] = max(sample_rows, len(df))
        else:
            config["nrows"] = min(sample_rows, config["nrows"])
        config["nrows_stage_nb"] = config["stage_nb"]
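# This subsample() variant expects get_sample_rows() to return the dataset size
# and the row budget that fits config['params']['memory']['max_size_mb']. The
# helper is not defined in this module; a minimal sketch that mirrors the
# arithmetic of the earlier fixed-limit subsample() variant:
def get_sample_rows(df: pd.DataFrame, config: Config) -> Tuple[float, int]:
    df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    mem_per_row = df_size_mb / len(df)
    sample_rows = int(config['params']['memory']['max_size_mb'] / mem_per_row)
    return df_size_mb, sample_rows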
def scale(df: pd.DataFrame, config: Config):
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    scale_columns = [
        c for c in df
        if c.startswith("number_") and df[c].dtype != np.int8
        and c not in config["categorical_columns"]
    ]
    if len(scale_columns) > 0:
        if config.is_train():
            config['stages'][config["stage"]]['scale_columns'] = deepcopy(scale_columns)
            config['stages'][config["stage"]]['model'] = StandardScaler(copy=False)
            config['stages'][config["stage"]]['model'].fit(
                df[scale_columns].astype(np.float32))
        df[config['stages'][config["stage"]]['scale_columns']] = \
            config['stages'][config["stage"]]['model'].transform(
                df[config['stages'][config["stage"]]['scale_columns']].astype(np.float32)
            ).astype(np.float32)
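# scale() persists both the selected columns and the fitted StandardScaler in
# config['stages'][<stage>], so the predict-time call reuses the train-time
# statistics instead of refitting. Usage sketch, assuming the stage slot has
# been initialised the way pipeline() does (names are illustrative):
#
#     config["stage"] = 'scale'
#     config['stages']['scale'] = {}
#     scale(df_train, config)   # is_train(): fit + transform
#     scale(df_test, config)    # predict: transform with the stored scaler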
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    if config.is_train():
        config["non_negative_target"] = df["target"].lt(0).sum() == 0
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        # the original used a for/else here; with no break in the loop the else
        # branch is unconditional, so id-only sort orders are always tried too
        for ic in id_columns:
            sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)
            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample
                                   if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000,
                                     lgb.Dataset(X_test, label=y_test),
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params,
                                      lgb.Dataset(X_train[sampled_columns], label=y_train), 3000,
                                      lgb.Dataset(X_test[sampled_columns], label=y_test),
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(
                sc, score_sorted, score_sampled))

            score_ratio = score_sampled / score_sorted if config.is_regression() \
                else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))

            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample,
                          group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)

            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()
            selected_shift_columns = [c.replace("_shift", "")
                                      for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df,
                      group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None,
                      number_columns=config["shift_columns"])
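# time_series_detect() above relies on shift_columns(), which is not defined in
# this module. A minimal sketch under assumed semantics: append a one-step
# lagged copy ("<col>_shift") of each number_ column, lagging within groups
# when a group key is given (hypothetical reconstruction, not the original):
def shift_columns(df: pd.DataFrame, group: str = None, number_columns: list = None):
    if number_columns is None:
        number_columns = [c for c in df if c.startswith("number_")]
    for c in number_columns:
        shifted = df.groupby(group)[c].shift(1) if group is not None else df[c].shift(1)
        df[c + "_shift"] = shifted.fillna(0)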
_train_map = {
    'h2o': train_h2o, 'lightgbm': train_lightgbm, 'vw': train_vw,
    'lm': train_lm, 'rf': train_rand_forest, 'lcv': train_linear_cv,
    'bayes': train_bayesian, 'arima': train_arima,
}
_predict_map = {
    'h2o': predict_h2o, 'lightgbm': predict_lightgbm, 'vw': predict_vw,
    'lm': predict_lm, 'rf': predict_rand_forest, 'lcv': predict_linear_cv,
    'bayes': predict_bayesian, 'arima': predict_arima,
}


def model(df: pd.DataFrame, config: Config, models: list):
    # -> (Optional[pd.DataFrame], Optional[pd.Series])
    stage_params = config['params']['pipeline'][config["stage"]]
    y = None  # only available in train mode

    if config.is_train():
        X, y = split_X_y(df, config)
        if 'args' in stage_params and 'models' in stage_params['args']:
            model_name = stage_params['args']['models'][0]
            if model_name not in _train_map:
                raise ValueError(
                    'Train: Unknown model name "{0}"'.format(model_name))
            _train_map[model_name](X, y, config)
        else:
            # no model requested explicitly: pick one by dataset size and
            # record the choice so predict mode can find it
            model_name = 'h2o' if config["nrows"] < 1000 else 'lightgbm'
            _train_map[model_name](X, y, config)
            stage_params['models'] = [model_name]
        data = X
    else:
        model_name = stage_params['args']['models'][0] \
            if 'args' in stage_params and 'models' in stage_params['args'] \
            else stage_params['models'][0]
        data = df

    # vw prediction additionally takes y (None outside of training)
    if model_name == 'vw':
        df[config["stage"]] = predict_vw(data, y, config)
    else:
        df[config["stage"]] = _predict_map[model_name](data, config)

    if config["non_negative_target"]:
        df[config["stage"]] = df[config["stage"]].apply(lambda p: max(0, p))
def working_days_zero_detect(df: pd.DataFrame, config: Config):
    if config.is_train() and "is_working_datetime_0" in df:
        if (df.loc[df["is_working_datetime_0"] == 0, "target"] == 0).all():
            config["working_days_zero"] = True
            log("Working days zero detected", config.verbose)