def test_dense_preprocessor_inverse_transform(embed_cols, continuous_cols,
                                              scale):
    dense_preprocessor = DensePreprocessor(embed_cols=embed_cols,
                                           continuous_cols=continuous_cols,
                                           scale=scale)
    encoded = dense_preprocessor.fit_transform(df)
    decoded = dense_preprocessor.inverse_transform(encoded)
    try:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    except Exception:
        emb_df = pd.DataFrame()
    try:
        cont_df = df[continuous_cols]
    except Exception:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)
def test_prepare_deep_without_embedding_columns():

    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = DensePreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append("Fundamental Error")

    out_booleans = []

    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.0))
        out_booleans.append(np.isclose(std, 1.0))

    if not np.all(out_booleans):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occured:\n{}".format("\n".join(errors))
Ejemplo n.º 3
0
    continuous_cols = [
        "latitude", "longitude", "security_deposit", "extra_people"
    ]
    already_standard = ["latitude", "longitude"]
    df["yield_cat"] = pd.cut(df["yield"],
                             bins=[0.2, 65, 163, 600],
                             labels=[0, 1, 2])
    df.drop("yield", axis=1, inplace=True)
    target = "yield_cat"

    target = np.array(df[target].values)
    prepare_wide = WidePreprocessor(wide_cols=wide_cols,
                                    crossed_cols=crossed_cols)
    X_wide = prepare_wide.fit_transform(df)

    prepare_deep = DensePreprocessor(embed_cols=cat_embed_cols,
                                     continuous_cols=continuous_cols)
    X_deep = prepare_deep.fit_transform(df)
    wide = Wide(wide_dim=X_wide.shape[1], pred_dim=3)
    deepdense = DeepDense(
        hidden_layers=[64, 32],
        dropout=[0.2, 0.2],
        deep_column_idx=prepare_deep.deep_column_idx,
        embed_input=prepare_deep.embeddings_input,
        continuous_cols=continuous_cols,
    )
    model = WideDeep(wide=wide, deepdense=deepdense, pred_dim=3)
    model.compile(method="multiclass", metrics=[Accuracy, F1Score])

    model.fit(
        X_wide=X_wide,
        X_deep=X_deep,
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le),
     (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)


################################################################################
# Test the DensePreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
cat_embed_cols = [("col1", 5), ("col2", 5)]

preprocessor1 = DensePreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = DensePreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_numbers = preprocessor2.fit_transform(df_numbers)

error_list = []


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1),
     (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continous_columns(input_df, X_deep,
                                                preprocessor):
Ejemplo n.º 5
0
    continuous_cols = ["latitude", "longitude", "security_deposit", "extra_people"]
    already_standard = ["latitude", "longitude"]
    text_col = "description"
    word_vectors_path = "data/glove.6B/glove.6B.100d.txt"
    img_col = "id"
    img_path = "data/airbnb/property_picture"
    target = "yield"

    target = df[target].values

    prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    X_wide = prepare_wide.fit_transform(df)

    prepare_deep = DensePreprocessor(
        embed_cols=cat_embed_cols,  # type: ignore[arg-type]
        continuous_cols=continuous_cols,
        already_standard=already_standard,
    )
    X_deep = prepare_deep.fit_transform(df)

    text_processor = TextPreprocessor(
        word_vectors_path=word_vectors_path, text_col=text_col
    )
    X_text = text_processor.fit_transform(df)

    image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    X_images = image_processor.fit_transform(df)

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deepdense = DeepDense(
        hidden_layers=[64, 32],
Ejemplo n.º 6
0
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le),
     (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)


################################################################################
# Test the DensePreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
cat_embed_cols = [("col1", 5), ("col2", 5)]

preprocessor1 = DensePreprocessor(cat_embed_cols)
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = DensePreprocessor(cat_embed_cols)
X_numbers = preprocessor2.fit_transform(df_numbers)

error_list = []


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1),
     (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continous_columns(input_df, X_deep,
                                                preprocessor):
Ejemplo n.º 7
0
                    ("native_country", "occupation")]
    cat_embed_cols = [
        ("education", 10),
        ("relationship", 8),
        ("workclass", 10),
        ("occupation", 10),
        ("native_country", 10),
    ]
    continuous_cols = ["age", "hours_per_week"]
    target = "income_label"
    target = df[target].values
    prepare_wide = WidePreprocessor(wide_cols=wide_cols,
                                    crossed_cols=crossed_cols)
    X_wide = prepare_wide.fit_transform(df)
    prepare_deep = DensePreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=continuous_cols  # type: ignore[arg-type]
    )
    X_deep = prepare_deep.fit_transform(df)

    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

    deepdense = DeepDense(
        hidden_layers=[64, 32],
        dropout=[0.2, 0.2],
        deep_column_idx=prepare_deep.deep_column_idx,
        embed_input=prepare_deep.embeddings_input,
        continuous_cols=continuous_cols,
    )

    # # To use DeepDenseResnet as the deepdense component simply:
    # deepdense = DeepDenseResnet(