Example #1
def pipeline(
        df: pd.DataFrame,
        config: Config,
        train_csv: Optional[str] = None,
        test_csv: Optional[str] = None,
        prediction_csv: Optional[str] = None
) -> Tuple[pd.DataFrame, Optional[np.float64]]:
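    """Run the stages listed in config['graph'] in order, resolving each
    stage name through config['params']['pipeline'] to a node in _node_map
    and recording per-stage timings."""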

    if config.is_train():
        config['stages'] = {}

    for idx, stage in enumerate(config['graph']):
        if len(stage) == 0 or stage[0] is None or stage[0] == '':
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Invalid stage "{0}" in pipeline'.format(stage))
            raise ValueError(config["stage"])

        config["stage"] = stage[0]
        config["stage_nb"] = idx

        if config.is_train():
            config['stages'][config["stage"]] = {}

        config['stages'][config["stage"]]['time'] = 0
        start_time = time.time()

        if stage[0] == 'Start':
            continue
        # elif stage[0] == 'End':
        #     break

        elif stage[0] not in config['params']['pipeline']:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"],
                'Unknown node "{0}" in pipeline'.format(stage[0]))
            raise ValueError(config["stage"])

        elif config['params']['pipeline'][stage[0]]['node'] not in _node_map:
            config["stage"] = '{0}\n{1}'.format(
                config["stage"], 'Unknown node "{0}" in _node_map'.format(
                    config['params']['pipeline'][stage[0]]['node']))
            raise ValueError(config["stage"])

        node = _node_map[config['params']['pipeline'][stage[0]]['node']]
        if node.name == 'read_df':
            if config.is_train():
                df = node(train_csv, config)

        elif 'args' in config['params']['pipeline'][stage[0]] \
                and len(config['params']['pipeline'][stage[0]]['args']) != 0:
            node.function(df, config,
                          **config['params']['pipeline'][stage[0]]['args'])
        else:
            node(df, config)

        stage_time_inc(config, start_time, stage[0])
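
A minimal sketch of the structures this runner expects; the stage and node
names below are hypothetical, not taken from the source:

config['graph'] = [['Start'], ['load'], ['fill'], ['fit']]   # ordered stages
config['params']['pipeline'] = {
    'load': {'node': 'read_df'},   # resolved through _node_map
    'fill': {'node': 'fillna', 'args': {'number_': {'agg': 'median'}}},
    'fit':  {'node': 'model', 'args': {'models': ['lightgbm']}},
}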
Example #2
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
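        # deep=True counts object (string) payloads too; dividing by 1024**2
        # converts bytes to MB, so selection only runs for frames over 2 GB.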
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        for i in range(10):
            df_sample = df.sample(min(1000, len(df)),
                                  random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #3
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = optimize_dataframe(pandas_read_csv(csv_path, config))
    if config.is_train():
        config["nrows"] = len(df)

    return df
Example #4
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if config.is_train():
        config["nrows"] = len(df)
        config["target_data"] = df['target'].copy()

    return df
Example #5
def read_df(csv_path: str, config: Config) -> pd.DataFrame:
    if "dtype" not in config:
        preview_df(csv_path, config)

    df = pandas_read_csv(csv_path, config)
    if config.is_train():
        config["nrows_stage_nb"] = 0
        config["nrows"] = len(df)

    return df
Example #6
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        for i in range(3):

            config_sample = copy.deepcopy(config)

            df_sample = df.sample(frac=0.05, random_state=i).copy(deep=True)
            df_sample = preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

            df_size_mb = df.drop(
                list(set(df) - set(selected_columns)), axis=1,
                errors='ignore').memory_usage(deep=True).sum() / 1024 / 1024
            if df_size_mb < 2 * 1024:
                break

        selected_columns = list(set(selected_columns))

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in selected_columns
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #7
def fillna(df: pd.DataFrame, config: Config, args: Optional[dict] = None):
    # A mutable default ({}) is shared across calls; default to None instead.
    args = args if args is not None else {}

    if len(args) != 0:

        for k, v in args.items():

            if config.is_train():
                lst_columns = [c for c in df if c.startswith(k)]
                config['stages'][config["stage"]][k] = {
                    'lst_columns': lst_columns
                }

                if len(lst_columns) != 0:
                    if 'agg' in v or 'value' in v:
                        s_fillna_values = calc_columns_metric(
                            df,
                            lst_columns,
                            metric=v.get('agg'),
                            value=v.get('value'))

                        config['stages'][config["stage"]][k][
                            'fillna_values'] = deepcopy(s_fillna_values)

            stage_cfg = config['stages'][config["stage"]][k]
            if len(stage_cfg['lst_columns']) != 0 and 'fillna_values' in stage_cfg:
                fillna_columns(df, stage_cfg['fillna_values'])

    else:

        for c in [c for c in df if c.startswith("number_")]:
            df[c].fillna(-1, inplace=True)

        for c in [c for c in df if c.startswith("string_")]:
            df[c].fillna("", inplace=True)

        for c in [c for c in df if c.startswith("datetime_")]:
            df[c].fillna(datetime.datetime(1970, 1, 1), inplace=True)
Example #8
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)
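            # Memory scales linearly with rows, so the row budget is the cap
            # divided by the average MB per row; e.g. a 10 MB frame of 100,000
            # rows gives mem_per_row = 1e-4 and a 20,000-row budget at 2.0 MB.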

            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows.".format(df_size_mb, len(df), sample_rows))
            _, df_drop = train_test_split(df, train_size=sample_rows, random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
Example #9
def check_columns_exists(df: pd.DataFrame,
                         config: Config,
                         key_stage: str,
                         drop_columns_test: bool = True):
    field_target_name = config['params']['field_target_name']
    if config.is_train():
        if 'columns_exists' not in config['params']['pipeline'][
                config["stage"]]:
            config['params']['pipeline'][
                config["stage"]]['columns_exists'] = {}
            if field_target_name not in df.columns:
                raise ValueError(
                    'Column y="{0}" does not exist in train dataset'.format(
                        field_target_name))

        config['params']['pipeline'][config["stage"]]['columns_exists'][key_stage] = \
            {x for x in df.columns if x != field_target_name}

    elif 'columns_exists' in config['params']['pipeline'][config["stage"]]:
        if key_stage in config['params']['pipeline'][
                config["stage"]]['columns_exists']:
            set_columns = config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage] - set(df.columns)
            if len(set_columns) != 0:
                raise ValueError(
                    'Columns "{0}" do not exist in test dataset on stage {1}'.
                    format(str(set_columns), key_stage))

            set_columns = set(df.columns) - config['params']['pipeline'][
                config["stage"]]['columns_exists'][key_stage]
            if len(set_columns) != 0:
                if drop_columns_test:
                    df.drop(columns=list(set_columns), inplace=True)
                else:
                    raise ValueError(
                        'Columns "{0}" do not exist in train dataset on stage {1}'
                        .format(str(set_columns), key_stage))
        else:
            raise ValueError(
                'Preprocess stage "{0}" does not exist'.format(key_stage))
Example #10
def subsample(df: pd.DataFrame, config: Config):
    if config.is_train():
        # df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        df_size_mb, sample_rows = get_sample_rows(df, config)

        if df_size_mb > config['params']['memory']['max_size_mb']:
            # mem_per_row = df_size_mb / len(df)
            # sample_rows = int(config['params']['memory']['max_size_mb'] / mem_per_row)

            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. Subsample to {} rows." \
                    .format(df_size_mb, len(df), sample_rows), config.verbose)
            _, df_drop = train_test_split(df,
                                          train_size=sample_rows,
                                          random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        elif config["nrows_stage_nb"] == 0:
            config["nrows"] = max(sample_rows, len(df))
        else:
            config["nrows"] = min(sample_rows, config["nrows"])

        config["nrows_stage_nb"] = config["stage_nb"]
Example #11
def scale(df: pd.DataFrame, config: Config):
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
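    # Presumably scale only float-valued number_ columns: int8 columns are
    # small-range encodings and categorical codes must keep their raw values.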
    scale_columns = [
        c for c in df if c.startswith("number_") and df[c].dtype != np.int8
        and c not in config["categorical_columns"]
    ]

    if len(scale_columns) > 0:

        if config.is_train():
            config['stages'][config["stage"]]['scale_columns'] = deepcopy(
                scale_columns)
            config['stages'][config["stage"]]['model'] = StandardScaler(
                copy=False)
            config['stages'][config["stage"]]['model'].fit(
                df[scale_columns].astype(np.float32))

        stage_cfg = config['stages'][config["stage"]]
        df[stage_cfg['scale_columns']] = stage_cfg['model'].transform(
            df[stage_cfg['scale_columns']].astype(np.float32)).astype(np.float32)
Example #12
def non_negative_target_detect(df: pd.DataFrame, config: Config):
    if config.is_train():
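        # lt(0).sum() counts negative target values; a count of zero makes it
        # safe to clamp predictions at zero downstream.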
        config["non_negative_target"] = df["target"].lt(0).sum() == 0
Example #13
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        for ic in id_columns:
            sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample
                                   if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            model_sorted = lgb.train(
                model_params, lgb.Dataset(X_train, label=y_train), 3000,
                valid_sets=lgb.Dataset(X_test, label=y_test),
                early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(
                model_params, lgb.Dataset(X_train[sampled_columns], label=y_train),
                3000, valid_sets=lgb.Dataset(X_test[sampled_columns], label=y_test),
                early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"])
Example #14
# Map each model name to its (train, predict) functions so stage dispatch is
# a single lookup instead of a long if/elif chain.
_model_fn_map = {
    'h2o': (train_h2o, predict_h2o),
    'lightgbm': (train_lightgbm, predict_lightgbm),
    'vw': (train_vw, predict_vw),
    'lm': (train_lm, predict_lm),
    'rf': (train_rand_forest, predict_rand_forest),
    'lcv': (train_linear_cv, predict_linear_cv),
    'bayes': (train_bayesian, predict_bayesian),
    'arima': (train_arima, predict_arima),
}


def model(df: pd.DataFrame, config: Config,
          models: list):  #  -> (Optional[pd.DataFrame], Optional[pd.Series])
    stage_params = config['params']['pipeline'][config["stage"]]

    if config.is_train():
        X, y = split_X_y(df, config)

        if 'args' in stage_params and 'models' in stage_params['args']:
            model_name = stage_params['args']['models'][0]
            if model_name not in _model_fn_map:
                raise ValueError(
                    'Train: Unknown model name "{0}"'.format(model_name))
        else:
            # No model requested explicitly: pick one by dataset size.
            model_name = 'h2o' if config["nrows"] < 1000 else 'lightgbm'
            stage_params['models'] = [model_name]

        train_fn, predict_fn = _model_fn_map[model_name]
        train_fn(X, y, config)

        if model_name == 'vw':
            # predict_vw also takes the target series.
            df[config["stage"]] = predict_fn(X, y, config)
        else:
            df[config["stage"]] = predict_fn(X, config)

    else:
        model_name = stage_params['args']['models'][0]
        predict_fn = _model_fn_map[model_name][1]
        if model_name == 'vw':
            # No target is available at predict time; passing None is an
            # assumption about predict_vw's signature.
            df[config["stage"]] = predict_fn(df, None, config)
        else:
            df[config["stage"]] = predict_fn(df, config)

    if config["non_negative_target"]:
        df[config["stage"]] = df[config["stage"]].apply(lambda p: max(0, p))
Example #15
def working_days_zero_detect(df: pd.DataFrame, config: Config):
    if config.is_train() and ("is_working_datetime_0" in df):
        if (df.loc[df["is_working_datetime_0"] == 0, "target"] == 0).all():
            config["working_days_zero"] = True
            log("Working days zero detected", config.verbose)