Example #1
def transform_datetime(df: pd.DataFrame, config: Config):
    """Expand each datetime_* column into numeric date parts, in place.

    On the first (training) pass, constant parts are dropped and the kept
    parts are recorded in config["date_columns"]; later passes replay them.
    """
    date_parts = ["year", "weekday", "month", "day", "hour"]

    if "date_columns" not in config:
        config["date_columns"] = {}

        for c in [c for c in df if c.startswith("datetime_")]:
            config["date_columns"][c] = []
            for part in date_parts:
                part_col = c + "_" + part
                # year needs 16 bits; every other part fits in uint8
                df[part_col] = getattr(df[c].dt, part).astype(
                    np.uint16 if part == "year" else np.uint8).values

                # drop parts that never vary; they carry no signal
                if not (df[part_col] != df[part_col].iloc[0]).any():
                    log(part_col + " is constant")
                    df.drop(part_col, axis=1, inplace=True)
                else:
                    config["date_columns"][c].append(part)

            df.drop(c, axis=1, inplace=True)
    else:
        # predict pass: replay the parts recorded during training
        for c, parts in config["date_columns"].items():
            for part in parts:
                part_col = c + "_" + part
                df[part_col] = getattr(df[c].dt, part)
            df.drop(c, axis=1, inplace=True)
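A minimal usage sketch (hypothetical data; a plain dict stands in for Config and print for the project's log helper):

# Usage sketch with stand-ins for the project's Config and log.
import numpy as np
import pandas as pd

log = print  # stand-in for the project's logger
df = pd.DataFrame({"datetime_0": pd.to_datetime(
    ["2018-01-05 10:00", "2018-03-20 18:30", "2018-03-21 07:15"])})
config = {}
transform_datetime(df, config)
# datetime_0 is replaced by datetime_0_weekday/_month/_day/_hour;
# datetime_0_year is constant (2018) and dropped, and config["date_columns"]
# records which parts to rebuild at predict time.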
Example #2
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    """Tune LightGBM hyperparameters with hyperopt's TPE on a 50/50 holdout."""
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.05),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
        "num_leaves": hp.choice("num_leaves", np.linspace(10, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
    }

    def objective(hyperparams):
        # early_stopping_rounds/verbose_eval follow the pre-4.0 LightGBM API
        model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data,
                          early_stopping_rounds=100, verbose_eval=100)

        score = model.best_score["valid_0"][params["metric"]]
        # classification metrics (e.g. AUC) are maximized, so negate for fmin
        if config.is_classification():
            score = -score

        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=50, verbose=1,
                         rstate=np.random.RandomState(1))

    hyperparams = space_eval(space, best)
    log("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams))
    return hyperparams
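A hypothetical invocation sketch. Assumptions: data_split wraps sklearn's train_test_split, Config exposes the is_classification() check used above, and a pre-4.0 LightGBM is installed:

# Hypothetical invocation; the stubs below stand in for project helpers.
import lightgbm as lgb
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

def data_split(X, y, test_size):  # assumed wrapper around train_test_split
    return train_test_split(X, y, test_size=test_size, random_state=1)

class Config(dict):  # stub exposing the one method used above
    def is_classification(self):
        return self["mode"] == "classification"

log = print  # stand-in for the project's logger
X, y = make_classification(n_samples=1000, n_features=20, random_state=1)
X = pd.DataFrame(X, columns=["number_{}".format(i) for i in range(20)])
y = pd.Series(y)
params = {"objective": "binary", "metric": "auc", "verbosity": -1, "seed": 1}
hyperparams = hyperopt_lightgbm(X, y, params, Config(mode="classification"))
model = lgb.train({**params, **hyperparams}, lgb.Dataset(X, label=y), 300)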
Example #3
def drop_constant_columns(df: pd.DataFrame, config: Config):
    """Find (on the training pass) and drop number_* columns with a single value."""
    if "constant_columns" not in config:
        config["constant_columns"] = [
            c for c in df
            if c.startswith("number_") and not (df[c] != df[c].iloc[0]).any()
        ]
        log("Constant columns: " + ", ".join(config["constant_columns"]))

    if len(config["constant_columns"]) > 0:
        df.drop(config["constant_columns"], axis=1, inplace=True)
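A quick usage sketch (dict for Config, print for log):

import pandas as pd

log = print  # stand-in for the project's logger
df = pd.DataFrame({"number_0": [1, 1, 1], "number_1": [1, 2, 3]})
config = {}
drop_constant_columns(df, config)
# config["constant_columns"] == ["number_0"]; only number_1 is left in df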
Example #4
def validate_dataset(alias: str, mode: str, train_limit: int) -> np.float64:
    """Train on one benchmark dataset, then score its held-out test split."""
    log(alias)

    automl = AutoML("models/check_{}".format(alias))

    automl.config["time_limit"] = train_limit
    automl.train("data/check_{}/train.csv".format(alias), mode)

    automl.config["time_limit"] = 300
    _, score = automl.predict("data/check_{}/test.csv".format(alias),
                              "predictions/check_{}.csv".format(alias))

    return score
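A hypothetical driver loop; the aliases and modes below are illustrative, assuming datasets are laid out under data/check_<alias>/ as above:

# Illustrative only: aliases are made up, not from a real benchmark.
for alias, mode in [("1_r", "regression"), ("2_c", "classification")]:
    score = validate_dataset(alias, mode, train_limit=300)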
Example #5
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    """Downsample an oversized training frame in place to fit a memory budget."""
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
            # rows that fit the budget at the observed bytes-per-row rate
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)

            log("Size limit exceeded: {:0.2f} MB. Dataset rows: {}. Subsample to {} rows."
                .format(df_size_mb, len(df), sample_rows))
            # keep a random sample_rows subset; drop the rest in place
            _, df_drop = train_test_split(df,
                                          train_size=sample_rows,
                                          random_state=1)
            df.drop(df_drop.index, inplace=True)

            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
Example #6
def to_int8(df: pd.DataFrame, config: Config):
    """Downcast number_* columns whose values are limited to {-1, 0, 1}."""
    if "int8_columns" not in config:
        config["int8_columns"] = []
        vals = [-1, 0, 1]

        for c in [c for c in df if c.startswith("number_")]:
            if (~df[c].isin(vals)).any():
                continue
            config["int8_columns"].append(c)

        log(config["int8_columns"])

    if len(config["int8_columns"]) > 0:
        df.loc[:, config["int8_columns"]] = \
            df.loc[:, config["int8_columns"]].astype(np.int8)
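A quick usage sketch (a plain dict stands in for Config, print for log):

import numpy as np
import pandas as pd

log = print  # stand-in for the project's logger
df = pd.DataFrame({"number_0": [0, 1, -1], "number_1": [0.5, 2.0, 3.0]})
config = {}
to_int8(df, config)
# config["int8_columns"] == ["number_0"]; that column is downcast to int8
# (exact dtype behavior of .loc assignment can vary across pandas versions)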
Example #7
def transform_categorical(df: pd.DataFrame, config: Config):
    """Smoothed target (mean) encoding for string_* columns.

    Each category's mean target is blended with the global prior; the blend
    weight is a sigmoid of how far the category count exceeds min_samples_leaf.
    """
    if "categorical_columns" not in config:
        prior = config["categorical_prior"] = df["target"].mean()
        min_samples_leaf = int(0.01 * len(df))
        smoothing = 0.5 * min_samples_leaf

        config["categorical_columns"] = {}
        for c in [c for c in df if c.startswith("string_")]:
            averages = df[[c, "target"]].groupby(c)["target"].agg(["mean", "count"])
            smooth = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
            averages["target"] = prior * (1 - smooth) + averages["mean"] * smooth
            config["categorical_columns"][c] = averages["target"].to_dict()

        log(list(config["categorical_columns"].keys()))

    for c, values in config["categorical_columns"].items():
        # unseen categories fall back to the global prior
        df.loc[:, c] = df[c].apply(
            lambda x: values[x] if x in values else config["categorical_prior"])
Example #8
def feature_selection(df: pd.DataFrame, config: Config):
    """On very large datasets, keep only columns that prove useful on samples."""
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:  # run selection only for frames over 2 GB
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        for i in range(10):
            df_sample = df.sample(min(1000, len(df)),
                                  random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            # part columns are named like "datetime_0_year"
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
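The helpers preprocess_pipeline and select_features belong to the surrounding project and are not shown. Purely as an assumption of its shape, a minimal importance-based select_features might look like this:

# Hypothetical sketch of select_features, assuming importance-based selection
# with LightGBM; the project's real helper may work differently.
from typing import List

import lightgbm as lgb
import pandas as pd

def select_features(X: pd.DataFrame, y: pd.Series, mode: str) -> List[str]:
    objective = "binary" if mode == "classification" else "regression"
    model = lgb.train({"objective": objective, "verbosity": -1, "seed": 1},
                      lgb.Dataset(X, label=y), num_boost_round=30)
    importance = pd.Series(model.feature_importance("gain"), index=X.columns)
    # keep only features that contributed any gain
    return list(importance[importance > 0].index)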
Example #9
def preview_df(train_csv: str, config: Config, nrows: int = 3000):
    """Read a small sample to infer dtypes, date columns and dataset size."""
    with open(train_csv) as f:
        num_rows = sum(1 for line in f) - 1  # minus the header row
    log("Rows in train: {}".format(num_rows))

    df = pd.read_csv(train_csv,
                     encoding="utf-8",
                     low_memory=False,
                     nrows=nrows)
    mem_per_row = df.memory_usage(deep=True).sum() / nrows
    log("Memory per row: {:0.2f} KB".format(mem_per_row / 1024))

    df_size = (num_rows * mem_per_row) / 1024 / 1024
    log("Approximate dataset size: {:0.2f} MB".format(df_size))

    config["parse_dates"] = []
    config["dtype"] = {
        "line_id": int,
    }

    counters = {
        "id": 0,
        "number": 0,
        "string": 0,
        "datetime": 0,
    }

    for c in df:
        if c.startswith("number_"):
            counters["number"] += 1
        elif c.startswith("string_"):
            counters["string"] += 1
            config["dtype"][c] = str
        elif c.startswith("datetime_"):
            counters["datetime"] += 1
            config["dtype"][c] = str
            config["parse_dates"].append(c)
        elif c.startswith("id_"):
            counters["id"] += 1

    log("Number columns: {}".format(counters["number"]))
    log("String columns: {}".format(counters["string"]))
    log("Datetime columns: {}".format(counters["datetime"]))

    config["counters"] = counters
Example #10
def validate(preds: pd.DataFrame, target_csv: str, mode: str) -> np.float64:
    """Score predictions: ROC AUC for classification, RMSE for regression."""
    # pd.merge rejects "on" combined with "left_index"; join on line_id alone
    df = pd.merge(preds, pd.read_csv(target_csv), on="line_id")
    if mode == "classification":
        score = roc_auc_score(df.target.values, df.prediction.values)
    else:
        score = np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
    log("Score: {:0.4f}".format(score))
    return score
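A self-contained usage sketch with a tiny synthetic target file (print stands in for log):

import pandas as pd

log = print  # stand-in for the project's logger
preds = pd.DataFrame({"line_id": [1, 2, 3], "prediction": [0.9, 0.2, 0.7]})
pd.DataFrame({"line_id": [1, 2, 3], "target": [1, 0, 1]}).to_csv(
    "target.csv", index=False)
validate(preds, "target.csv", "classification")  # logs Score: 1.0000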