Example 1
def prepare_data(results_dir):

    train, test = load_dataset()

    cat_embed_cols = []
    for col in train.columns:
        # object dtype or low cardinality means categorical; never the target
        if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
            cat_embed_cols.append(col)
    num_cols = [
        c for c in train.columns if c not in cat_embed_cols + ["target"]
    ]

    args = read_best_model_args(results_dir)
    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
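Since the preprocessor above is created with for_tabtransformer=True, its output is meant to feed pytorch-widedeep's TabTransformer. Below is a minimal sketch of how the returned objects are typically consumed; the hyperparameter values (input_dim, n_heads, n_blocks) are illustrative assumptions, not taken from the original script, and results_dir is assumed to be in scope:

from pytorch_widedeep.models import TabTransformer, WideDeep

mlp_dims, args, prepare_tab, X_train, X_test, y_train, y_test = prepare_data(results_dir)

# column_idx and embeddings_input come straight from the fitted preprocessor
tab_transformer = TabTransformer(
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
    continuous_cols=prepare_tab.continuous_cols,
    input_dim=32,  # illustrative values, not from the source
    n_heads=4,
    n_blocks=4,
)
model = WideDeep(deeptabular=tab_transformer)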
Example 2
def prepare_data(results_dir):

    train, test = load_dataset()

    # All columns will be treated as categorical. The column with the highest
    # number of categories has 308
    cat_embed_cols = [c for c in train.columns if c != "target"]

    prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols)
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    return prepare_tab, X_train, X_test, y_train, y_test
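For reference, fit_transform label-encodes the categorical columns, and the fitted preprocessor exposes the two attributes the models in these examples rely on. A small self-contained illustration with toy data (column names assumed):

import pandas as pd

from pytorch_widedeep.preprocessing import TabPreprocessor

toy = pd.DataFrame(
    {"job": ["admin", "tech", "admin"], "marital": ["married", "single", "single"]}
)
tp = TabPreprocessor(embed_cols=["job", "marital"])
X = tp.fit_transform(toy)  # ndarray with one label-encoded column per feature
print(tp.column_idx)  # maps each column name to its position in X
print(tp.embeddings_input)  # [(colname, n_categories, embed_dim), ...]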
Example 3
train = train[train.target <= upper_limit]
valid = valid[valid.target <= upper_limit]

cat_embed_cols = []
for col in train.columns:
    # object dtype or low cardinality means categorical; never the target
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)
num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

prepare_tab = TabPreprocessor(
    embed_cols=cat_embed_cols, continuous_cols=num_cols, scale=args.scale_cont
)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_valid = prepare_tab.transform(valid)
y_valid = valid.target.values

if args.blocks_dims == "same":
    n_inp_dim = sum([e[2] for e in prepare_tab.embeddings_input])
    blocks_dims = [n_inp_dim, n_inp_dim, n_inp_dim]
else:
    blocks_dims = eval(args.blocks_dims)

if args.mlp_hidden_dims == "auto":
    n_inp_dim = blocks_dims[-1]
    mlp_hidden_dims = [4 * n_inp_dim, 2 * n_inp_dim]
else:
    mlp_hidden_dims = eval(args.mlp_hidden_dims)

deeptabular = TabResnet(
    # the excerpt is cut off here; the arguments below are assumed, following
    # the variables prepared above
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
    continuous_cols=num_cols,
    blocks_dims=blocks_dims,
    mlp_hidden_dims=mlp_hidden_dims,
)
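From here on, the scripts in this family wrap the component in WideDeep and train it with the Trainer; a sketch of that step, assuming pytorch-widedeep's Trainer API (the capped target above suggests a regression objective; epoch and batch values are illustrative):

from pytorch_widedeep import Trainer
from pytorch_widedeep.models import WideDeep

model = WideDeep(deeptabular=deeptabular)
trainer = Trainer(model, objective="regression")
trainer.fit(
    X_tab=X_train,
    target=y_train,
    X_val={"X_tab": X_valid, "target": y_valid},  # validation split prepared above
    n_epochs=10,  # illustrative values, not from the source
    batch_size=256,
)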
Example 4
df.drop("education_num", axis=1, inplace=True)
train = pd.concat([train, valid], ignore_index=True)

# 200 is rather arbitrary, but some threshold is needed to decide in a
# semi-automated way whether a column is represented with embeddings or
# treated as continuous
cat_embed_cols = []
for col in train.columns:
    # object dtype or low cardinality means categorical; never the target
    if (train[col].dtype == "O" or train[col].nunique() < 200) and col != "target":
        cat_embed_cols.append(col)

# all columns will be represented by embeddings
prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True)
X_train = prepare_tab.fit_transform(train)
y_train = train.target.values
X_test = prepare_tab.transform(test)
y_test = test.target.values

args = read_best_model_args(RESULTS_DIR)

if args.mlp_hidden_dims == "same":
    mlp_hidden_dims = [
        len(cat_embed_cols) * args.input_dim,
        len(cat_embed_cols) * args.input_dim,
        (len(cat_embed_cols) * args.input_dim) // 2,
    ]
elif args.mlp_hidden_dims == "None":
    mlp_hidden_dims = None
else:
    mlp_hidden_dims = eval(args.mlp_hidden_dims)
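The eval calls above parse list-valued CLI strings such as "[200, 100]". When the argument can come from untrusted input, ast.literal_eval is a safer drop-in for this pattern; a minimal sketch (parse_dims is a hypothetical helper):

import ast

def parse_dims(arg: str) -> list:
    # "[200, 100]" -> [200, 100]; only Python literals are accepted, so
    # arbitrary code embedded in the argument string cannot execute
    return ast.literal_eval(arg)

mlp_hidden_dims = parse_dims("[200, 100]")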
Example 5
train = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_train.p")
valid = pd.read_pickle(PROCESSED_DATA_DIR / "bankm_val.p")
colnames = [c.replace(".", "_") for c in train.columns]
train.columns = colnames
valid.columns = colnames

# All columns will be treated as categorical. The column with the highest
# number of categories has 308
cat_embed_cols = [c for c in train.columns if c != "target"]

# all columns will be represented by embeddings
prepare_deep = TabPreprocessor(embed_cols=cat_embed_cols)
X_train = prepare_deep.fit_transform(train)
y_train = train.target.values
X_valid = prepare_deep.transform(valid)
y_valid = valid.target.values

deeptabular = TabNet(
    column_idx=prepare_deep.column_idx,
    embed_input=prepare_deep.embeddings_input,
    embed_dropout=args.embed_dropout,
    n_steps=args.n_steps,
    step_dim=args.step_dim,
    attn_dim=args.attn_dim,
    dropout=args.dropout,
    n_glu_step_dependent=args.n_glu_step_dependent,
    n_glu_shared=args.n_glu_shared,
    ghost_bn=args.ghost_bn,
    virtual_batch_size=args.virtual_batch_size,
    momentum=args.momentum,
)
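TabNet's ghost_bn, virtual_batch_size and momentum arguments control ghost batch normalization, where statistics are computed over small "virtual" chunks of the batch instead of the whole batch. A conceptual plain-PyTorch sketch of the idea (not widedeep's actual implementation):

import torch
import torch.nn as nn

class GhostBatchNorm(nn.Module):
    """Normalize each virtual_batch_size-sized chunk of the batch independently."""

    def __init__(self, dim: int, virtual_batch_size: int = 128, momentum: float = 0.02):
        super().__init__()
        self.vbs = virtual_batch_size
        self.bn = nn.BatchNorm1d(dim, momentum=momentum)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        n_chunks = max(1, x.size(0) // self.vbs)
        return torch.cat([self.bn(chunk) for chunk in x.chunk(n_chunks, dim=0)], dim=0)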
Example 6
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError  # assumed: the exception widedeep raises here
from pytorch_widedeep.preprocessing import TabPreprocessor

# minimal stand-in for the module-level test dataframe (assumed)
df = pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "d"], "col3": [1.0, 2.0], "col4": [3.0, 4.0]})

def test_notfittederror():
    processor = TabPreprocessor(embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"])
    # transforming before fitting must raise NotFittedError
    with pytest.raises(NotFittedError):
        processor.transform(df)
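The passing counterpart is simply to fit first: fit (or fit_transform) populates the attributes that transform checks for. Reusing the toy df above:

processor = TabPreprocessor(embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"])
X = processor.fit_transform(df)  # fit first ...
X_again = processor.transform(df)  # ... and transform no longer raises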