Beispiel #1
0
def prepare_data(results_dir):
    """Load the dataset and preprocess it into train/test arrays.

    Feature columns are split in a "kind-of" automated way: a column is
    treated as an embedding column when it has object dtype or fewer than 200
    unique values; every remaining column is treated as continuous. The
    "target" column is never used as a feature.

    Args:
        results_dir: forwarded to ``read_best_model_args``.

    Returns:
        A 7-tuple ``(mlp_hidden_dims_same, args, prepare_tab, X_train,
        X_test, y_train, y_test)`` where ``mlp_hidden_dims_same`` is the
        number of embedding columns, ``args`` is whatever
        ``read_best_model_args`` returned, ``prepare_tab`` is the
        ``TabPreprocessor`` fitted on the training frame, and the ``X_*`` /
        ``y_*`` values are the transformed features and the raw ``target``
        values of each split.
    """
    train, test = load_dataset()

    # The explicit parentheses matter: `and` binds tighter than `or`, so the
    # unparenthesised form would accept an object-dtype "target" column as an
    # embedding feature.
    cat_embed_cols = [
        col
        for col in train.columns
        if col != "target"
        and (train[col].dtype == "O" or train[col].nunique() < 200)
    ]

    # Build the exclusion set once instead of concatenating lists per column.
    excluded = set(cat_embed_cols) | {"target"}
    num_cols = [c for c in train.columns if c not in excluded]

    args = read_best_model_args(results_dir)
    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )

    # Fit on the training split only; the test split reuses the fitted state.
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
        embed_dropout=args.embed_dropout,
    )
    model = WideDeep(deeptabular=deeptabular)

    return model


if __name__ == "__main__":
    # Script entry point: resolve the output directories, build the data and
    # the model from the stored best arguments, then run and save the
    # experiment.
    model_name = "tabresnet"
    results_dir, models_dir = set_dirs(model_name)

    # NOTE(review): this unpacks exactly five values; confirm that the
    # prepare_data in scope for this script returns a 5-tuple.
    prepare_tab, X_train, X_test, y_train, y_test = prepare_data(results_dir)
    args = read_best_model_args(results_dir)
    model = set_model(args, prepare_tab)

    experiment_inputs = (
        model,
        model_name,
        results_dir,
        models_dir,
        args,
        X_train,
        X_test,
        y_train,
        y_test,
    )
    run_experiment_and_save(*experiment_inputs)
Beispiel #3
0
# 200 is a rather arbitrary threshold, but one has to decide somehow, in a
# "kind-of" automated way, whether a column will be represented as embeddings
# or as continuous
# A column is embedded when it has object dtype or fewer than 200 unique
# values. The explicit parentheses matter: `and` binds tighter than `or`, so
# the unparenthesised form would accept an object-dtype "target" column as a
# feature.
cat_embed_cols = [
    col
    for col in train.columns
    if col != "target"
    and (train[col].dtype == "O" or train[col].nunique() < 200)
]

# Only embedding columns are handed to the preprocessor (no continuous_cols).
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
# Fit on the training frame only; the test frame reuses the fitted state.
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_test = prepare_tab.transform(test)
y_test = test.target.values

args = read_best_model_args(RESULTS_DIR)

# args.mlp_hidden_dims is a string: "same", "None", or a Python expression.
if args.mlp_hidden_dims == "same":
    # Derive the layer widths from the total embedding width:
    # (number of embedded columns) * (per-column input_dim).
    mlp_width = len(cat_embed_cols) * args.input_dim
    mlp_hidden_dims = [mlp_width, mlp_width, mlp_width // 2]
elif args.mlp_hidden_dims == "None":
    mlp_hidden_dims = None
else:
    # SECURITY NOTE(review): eval() executes arbitrary code. This is only
    # acceptable while the stored args come from a trusted results file; if
    # the value is always a plain literal, consider ast.literal_eval instead.
    mlp_hidden_dims = eval(args.mlp_hidden_dims)

deeptabular = TabTransformer(
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
        results_d["trainer_history"] = trainer.history
        results_d["trainer_lr_history"] = trainer.lr_history
        results_d["runtime"] = runtime
        with open(results_dir / filename, "wb") as f:
            pickle.dump(results_d, f)


if __name__ == "__main__":
    # Script entry point: resolve the output directories, build the data and
    # the model from the stored arguments of experiment index 0, then run and
    # save the experiment.
    model_name = "tabmlp"
    results_dir, models_dir = set_dirs(model_name)

    prepare_tab, X_train, X_test, y_train, y_test = prepare_data(results_dir)
    args = read_best_model_args(results_dir, exp_idx=0)
    model = set_model(args, prepare_tab)

    experiment_inputs = (
        model,
        model_name,
        results_dir,
        models_dir,
        args,
        X_train,
        X_test,
        y_train,
        y_test,
    )
    run_experiment_and_save(*experiment_inputs, fl_exp_indx=0)