Example #1
        ("neighbourhood_cleansed", 64),
        ("cancellation_policy", 16),
    ]
    continuous_cols = [
        "latitude", "longitude", "security_deposit", "extra_people"
    ]
    already_standard = ["latitude", "longitude"]
    text_col = "description"
    word_vectors_path = "data/glove.6B/glove.6B.100d.txt"
    img_col = "id"
    img_path = "data/airbnb/property_picture"
    target = "yield"

    target = df[target].values

    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols,
                                         crossed_cols=crossed_cols)
    X_wide = wide_preprocessor.fit_transform(df)

    tab_preprocessor = TabPreprocessor(
        embed_cols=cat_embed_cols,  # type: ignore[arg-type]
        continuous_cols=continuous_cols,
        already_standard=already_standard,
    )
    X_tab = tab_preprocessor.fit_transform(df)

    text_processor = TextPreprocessor(word_vectors_path=word_vectors_path,
                                      text_col=text_col)
    X_text = text_processor.fit_transform(df)

    image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    X_images = image_processor.fit_transform(df)
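    # Possible continuation (a sketch): combine the four preprocessed inputs
    # into a multi-modal WideDeep model and train it with a Trainer. The
    # component classes and attribute/argument names below (embeddings_input,
    # vocab.itos, embedding_matrix, wide_dim vs input_dim) follow the
    # pytorch-widedeep 1.0-era API and may differ in other releases.
    import numpy as np
    from pytorch_widedeep import Trainer
    from pytorch_widedeep.models import Wide, TabMlp, DeepText, DeepImage, WideDeep

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deeptabular = TabMlp(
        mlp_hidden_dims=[64, 32],
        column_idx=tab_preprocessor.column_idx,
        embed_input=tab_preprocessor.embeddings_input,
        continuous_cols=continuous_cols,
    )
    deeptext = DeepText(
        vocab_size=len(text_processor.vocab.itos),
        embed_matrix=text_processor.embedding_matrix,
    )
    deepimage = DeepImage(pretrained=True)
    model = WideDeep(wide=wide, deeptabular=deeptabular,
                     deeptext=deeptext, deepimage=deepimage)

    trainer = Trainer(model, objective="regression")
    trainer.fit(X_wide=X_wide, X_tab=X_tab, X_text=X_text, X_img=X_images,
                target=target, n_epochs=1, batch_size=32)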
Example #2
    cat_embed_cols = [(c, 16) for c in df.columns if "catg" in c] + [
        ("neighbourhood_cleansed", 64),
        ("cancellation_policy", 16),
    ]
    continuous_cols = [
        "latitude", "longitude", "security_deposit", "extra_people"
    ]
    already_standard = ["latitude", "longitude"]
    df["yield_cat"] = pd.cut(df["yield"],
                             bins=[0.2, 65, 163, 600],
                             labels=[0, 1, 2])
    df.drop("yield", axis=1, inplace=True)
    target = "yield_cat"

    target = np.array(df[target].values)
    prepare_wide = WidePreprocessor(wide_cols=wide_cols,
                                    crossed_cols=crossed_cols)
    X_wide = prepare_wide.fit_transform(df)

    prepare_deep = DensePreprocessor(embed_cols=cat_embed_cols,
                                     continuous_cols=continuous_cols)
    X_deep = prepare_deep.fit_transform(df)
    wide = Wide(wide_dim=X_wide.shape[1], pred_dim=3)
    deepdense = DeepDense(
        hidden_layers=[64, 32],
        dropout=[0.2, 0.2],
        deep_column_idx=prepare_deep.deep_column_idx,
        embed_input=prepare_deep.embeddings_input,
        continuous_cols=continuous_cols,
    )
    model = WideDeep(wide=wide, deepdense=deepdense, pred_dim=3)
    model.compile(method="multiclass", metrics=[Accuracy, F1Score])
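    # Possible next step (a sketch): with the older pytorch-widedeep API used
    # above (compile, then fit on the model itself), training could look
    # roughly like this. Epoch count, batch size and val_split are illustrative.
    model.fit(
        X_wide=X_wide,
        X_deep=X_deep,
        target=target,
        n_epochs=5,
        batch_size=64,
        val_split=0.2,
    )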
Example #3
        nuniques = df.col1.nunique() + df.col2.nunique()
    return df, nuniques


some_letters = ["a", "b", "c", "d", "e"]
some_numbers = [1, 2, 3, 4, 5]

wide_cols = ["col1", "col2"]
cross_cols = [("col1", "col2")]

###############################################################################
# Simple test of functionality making sure the shapes match
###############################################################################
df_letters, unique_letters = create_test_dataset(some_letters)
df_numbers, unique_numbers = create_test_dataset(some_numbers)
preprocessor1 = WidePreprocessor(wide_cols, cross_cols)


@pytest.mark.parametrize(
    "input_df, expected_shape",
    [(df_letters, unique_letters), (df_numbers, unique_numbers)],
)
def test_preprocessor1(input_df, expected_shape):
    wide_mtx = preprocessor1.fit_transform(input_df)
    assert wide_mtx.shape[1] == expected_shape


###############################################################################
# Same test as before, but checking that everything works when no crossed cols are passed
###############################################################################
df_letters_wo_crossed, unique_letters_wo_crossed = create_test_dataset(
Example #4
if args.with_wide:
    cat_embed_cols = []
    for col in train.columns:
        if 5 <= train[col].nunique() < 200 and col != "target":
            cat_embed_cols.append(col)
    num_cols = [
        c for c in train.columns if c not in cat_embed_cols + ["target"]
    ]

    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)

    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        for_tabtransformer=True,
        scale=False,
    )
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)

    y_train = train.target.values
    y_valid = valid.target.values
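    # Possible continuation (a sketch): since the TabPreprocessor above was
    # built with for_tabtransformer=True, the tabular component would typically
    # be a TabTransformer. Attribute and argument names (embeddings_input,
    # wide_dim, objective="binary", etc.) are assumptions that may vary with
    # the pytorch-widedeep version and with the actual task.
    import numpy as np
    from pytorch_widedeep import Trainer
    from pytorch_widedeep.models import Wide, TabTransformer, WideDeep

    wide = Wide(wide_dim=np.unique(X_wide_train).shape[0], pred_dim=1)
    deeptabular = TabTransformer(
        column_idx=prepare_tab.column_idx,
        embed_input=prepare_tab.embeddings_input,
        continuous_cols=num_cols,
    )
    model = WideDeep(wide=wide, deeptabular=deeptabular)

    trainer = Trainer(model, objective="binary")
    trainer.fit(
        X_train={"X_wide": X_wide_train, "X_tab": X_tab_train, "target": y_train},
        X_val={"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid},
        n_epochs=5,
        batch_size=256,
    )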
Example #5
def test_notfittederror():
    processor = WidePreprocessor(wide_cols, cross_cols)
    with pytest.raises(NotFittedError):
        processor.transform(df_letters)
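

# A complementary check (a sketch): once the preprocessor has been fitted,
# transform runs without raising and returns a matrix of the expected width.
def test_transform_after_fit():
    processor = WidePreprocessor(wide_cols, cross_cols)
    processor.fit_transform(df_letters)
    wide_mtx = processor.transform(df_letters)
    assert wide_mtx.shape[1] == unique_letters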
Example #6
    # Deep
    deep_embedding_columns_list = [("education", 16), ("workclass", 16),
                                   ("occupation", 16), ("native-country", 32)]
    deep_continuous_column_list = ["age", "hours-per-week"]

    # Target
    target_column_list = ["income_label"]

    target = train_df[target_column_list].values
    """
    Preprocessing
    """

    # Wide
    wide_preprocessor = WidePreprocessor(wide_cols=wide_columns_list,
                                         crossed_cols=wide_cross_column_list)

    x_wide = wide_preprocessor.fit_transform(train_df)

    # Deep
    tab_preprocessor = TabPreprocessor(
        embed_cols=deep_embedding_columns_list,
        continuous_cols=deep_continuous_column_list)
    x_deep = tab_preprocessor.fit_transform(train_df)
    """
    Define the model structure
    """
    # Model
    wide = Wide(wide_dim=np.unique(x_wide).shape[0], pred_dim=1)
    deeptabular = TabMlp(mlp_hidden_dims=[64, 32],
                         column_idx=tab_preprocessor.column_idx,