("neighbourhood_cleansed", 64), ("cancellation_policy", 16), ] continuous_cols = [ "latitude", "longitude", "security_deposit", "extra_people" ] already_standard = ["latitude", "longitude"] text_col = "description" word_vectors_path = "data/glove.6B/glove.6B.100d.txt" img_col = "id" img_path = "data/airbnb/property_picture" target = "yield" target = df[target].values wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = wide_preprocessor.fit_transform(df) tab_preprocessor = TabPreprocessor( embed_cols=cat_embed_cols, # type: ignore[arg-type] continuous_cols=continuous_cols, already_standard=already_standard, ) X_tab = tab_preprocessor.fit_transform(df) text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col) X_text = text_processor.fit_transform(df) image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path) X_images = image_processor.fit_transform(df)
cat_embed_cols = [(c, 16) for c in df.columns if "catg" in c] + [
    ("neighbourhood_cleansed", 64),
    ("cancellation_policy", 16),
]
continuous_cols = ["latitude", "longitude", "security_deposit", "extra_people"]
already_standard = ["latitude", "longitude"]

df["yield_cat"] = pd.cut(df["yield"], bins=[0.2, 65, 163, 600], labels=[0, 1, 2])
df.drop("yield", axis=1, inplace=True)
target = "yield_cat"
target = np.array(df[target].values)

prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)

prepare_deep = DensePreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_deep = prepare_deep.fit_transform(df)

wide = Wide(wide_dim=X_wide.shape[1], pred_dim=3)
deepdense = DeepDense(
    hidden_layers=[64, 32],
    dropout=[0.2, 0.2],
    deep_column_idx=prepare_deep.deep_column_idx,
    embed_input=prepare_deep.embeddings_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deepdense=deepdense, pred_dim=3)
model.compile(method="multiclass", metrics=[Accuracy, F1Score])
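
# --- Illustrative continuation (not part of the original snippet) ---
# DensePreprocessor/DeepDense and model.compile come from the older
# (pre-Trainer) pytorch-widedeep API, where training is driven by the compiled
# model itself. A minimal sketch of the corresponding fit call, assuming that
# old API; the epoch/batch/val_split values are placeholders.
model.fit(
    X_wide=X_wide,
    X_deep=X_deep,
    target=target,
    n_epochs=1,
    batch_size=64,
    val_split=0.1,
)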
    # tail of create_test_dataset (its definition is truncated in this excerpt)
    nuniques = df.col1.nunique() + df.col2.nunique()
    return df, nuniques


some_letters = ["a", "b", "c", "d", "e"]
some_numbers = [1, 2, 3, 4, 5]
wide_cols = ["col1", "col2"]
cross_cols = [("col1", "col2")]

###############################################################################
# Simple functionality test making sure the output shapes match
###############################################################################
df_letters, unique_letters = create_test_dataset(some_letters)
df_numbers, unique_numbers = create_test_dataset(some_numbers)

preprocessor1 = WidePreprocessor(wide_cols, cross_cols)


@pytest.mark.parametrize(
    "input_df, expected_shape",
    [(df_letters, unique_letters), (df_numbers, unique_numbers)],
)
def test_preprocessor1(input_df, expected_shape):
    wide_mtx = preprocessor1.fit_transform(input_df)
    assert wide_mtx.shape[1] == expected_shape


###############################################################################
# Same test as above, but checking that everything works when no crossed
# columns are passed
###############################################################################
df_letters_wo_crossed, unique_letters_wo_crossed = create_test_dataset(
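
# --- Illustrative helper (not part of the original test file) ---
# The tests above rely on a `create_test_dataset` helper whose definition is
# truncated in this excerpt. A minimal, hypothetical reconstruction consistent
# with how it is called: two categorical columns drawn from the given values,
# returning the frame and the total number of unique categories. Only the last
# two lines are taken from the excerpt; the rest is assumed.
import numpy as np
import pandas as pd


def create_test_dataset(input_values, n_rows=32):
    df = pd.DataFrame(
        {
            "col1": np.random.choice(input_values, n_rows),
            "col2": np.random.choice(input_values, n_rows),
        }
    )
    nuniques = df.col1.nunique() + df.col2.nunique()
    return df, nuniques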
if args.with_wide:
    cat_embed_cols = []
    for col in train.columns:
        if (
            train[col].nunique() >= 5
            and train[col].nunique() < 200
            and col != "target"
        ):
            cat_embed_cols.append(col)
    num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]
    wide_cols = []
    for col in train.columns:
        if train[col].nunique() < 40 and col != "target":
            wide_cols.append(col)
    prepare_wide = WidePreprocessor(wide_cols)
    X_wide_train = prepare_wide.fit_transform(train)
    X_wide_valid = prepare_wide.transform(valid)
    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        for_tabtransformer=True,
        scale=False,
    )
    X_tab_train = prepare_tab.fit_transform(train)
    X_tab_valid = prepare_tab.transform(valid)
    y_train = train.target.values
    y_valid = valid.target.values
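
# --- Illustrative continuation (not part of the original script) ---
# A minimal sketch of how the arrays produced in the `with_wide` branch above
# could feed a Wide + TabTransformer model trained with the Trainer, assuming
# a binary `target`. Parameter and attribute names (`wide_dim`, `embed_input`,
# `embeddings_input`, the `X_val` dict keys) are assumptions tied to the
# pytorch-widedeep version that exposes `for_tabtransformer=True`; check them
# against the installed release.
import numpy as np
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import Wide, TabTransformer, WideDeep

wide = Wide(wide_dim=np.unique(X_wide_train).shape[0], pred_dim=1)
tab_transformer = TabTransformer(
    column_idx=prepare_tab.column_idx,
    embed_input=prepare_tab.embeddings_input,
    continuous_cols=num_cols,
)
model = WideDeep(wide=wide, deeptabular=tab_transformer)

trainer = Trainer(model, objective="binary")
trainer.fit(
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    X_val={"X_wide": X_wide_valid, "X_tab": X_tab_valid, "target": y_valid},
    n_epochs=1,
    batch_size=64,
)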
def test_notfittederror():
    processor = WidePreprocessor(wide_cols, cross_cols)
    with pytest.raises(NotFittedError):
        processor.transform(df_letters)
# Deep
deep_embedding_columns_list = [
    ("education", 16),
    ("workclass", 16),
    ("occupation", 16),
    ("native-country", 32),
]
deep_continuous_column_list = ["age", "hours-per-week"]

# Target
target_column_list = ["income_label"]
target = train_df[target_column_list].values

""" Preprocessing """
# Wide
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_columns_list, crossed_cols=wide_cross_column_list
)
x_wide = wide_preprocessor.fit_transform(train_df)

# Deep
tab_preprocessor = TabPreprocessor(
    embed_cols=deep_embedding_columns_list,
    continuous_cols=deep_continuous_column_list,
)
x_deep = tab_preprocessor.fit_transform(train_df)

""" Model structure definition """
# Model
wide = Wide(wide_dim=np.unique(x_wide).shape[0], pred_dim=1)
deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=tab_preprocessor.column_idx,
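
# --- Illustrative continuation (not part of the original snippet) ---
# The TabMlp call above is truncated. A minimal sketch of how this kind of
# binary (income_label) example is typically completed and trained; the
# embedding-input attribute (`cat_embed_input` here) and the Trainer keywords
# are assumptions that depend on the installed pytorch-widedeep version, and
# `deeptabular_sketch` is a hypothetical stand-in for the truncated component.
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep

deeptabular_sketch = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    continuous_cols=deep_continuous_column_list,
)
model = WideDeep(wide=wide, deeptabular=deeptabular_sketch)

trainer = Trainer(model, objective="binary")
trainer.fit(
    X_wide=x_wide,
    X_tab=x_deep,
    target=target.ravel(),  # flatten the (n, 1) target array to 1-D
    n_epochs=1,
    batch_size=256,
)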