] continuous_cols = [ "latitude", "longitude", "security_deposit", "extra_people" ] already_standard = ["latitude", "longitude"] text_col = "description" word_vectors_path = "data/glove.6B/glove.6B.100d.txt" img_col = "id" img_path = "data/airbnb/property_picture" target = "yield" target = df[target].values wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = wide_preprocessor.fit_transform(df) tab_preprocessor = TabPreprocessor( embed_cols=cat_embed_cols, # type: ignore[arg-type] continuous_cols=continuous_cols, already_standard=already_standard, ) X_tab = tab_preprocessor.fit_transform(df) text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col) X_text = text_processor.fit_transform(df) image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path) X_images = image_processor.fit_transform(df)
# Count distinct values once per column; every column-selection rule below
# is a threshold on this count, so there is no need to recompute it.
unique_counts = {col: train[col].nunique() for col in train.columns}

# Columns with at least 5 and fewer than 200 distinct values are used as
# embedding columns.
cat_embed_cols = [
    col
    for col, n_unique in unique_counts.items()
    if 5 <= n_unique < 200 and col != "target"
]

# Every other non-target column is passed as a continuous column. The
# exclusion set is built once instead of concatenating lists per column.
excluded_cols = set(cat_embed_cols) | {"target"}
num_cols = [c for c in train.columns if c not in excluded_cols]

# Columns with fewer than 40 distinct values feed the wide component.
wide_cols = [
    col
    for col, n_unique in unique_counts.items()
    if n_unique < 40 and col != "target"
]

# Fit on the training split only; the validation split is transformed with
# the already-fitted preprocessor.
prepare_wide = WidePreprocessor(wide_cols)
X_wide_train = prepare_wide.fit_transform(train)
X_wide_valid = prepare_wide.transform(valid)

prepare_tab = TabPreprocessor(
    embed_cols=cat_embed_cols,
    continuous_cols=num_cols,
    for_tabtransformer=True,
    scale=False,
)
X_tab_train = prepare_tab.fit_transform(train)
X_tab_valid = prepare_tab.transform(valid)

y_train = train.target.values
y_valid = valid.target.values

# The wide dimension is the number of distinct values in the encoded
# training matrix.
wide = Wide(wide_dim=np.unique(X_wide_train).shape[0])
("cancellation_policy", 16), ] continuous_cols = [ "latitude", "longitude", "security_deposit", "extra_people" ] already_standard = ["latitude", "longitude"] df["yield_cat"] = pd.cut(df["yield"], bins=[0.2, 65, 163, 600], labels=[0, 1, 2]) df.drop("yield", axis=1, inplace=True) target = "yield_cat" target = np.array(df[target].values) prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = prepare_wide.fit_transform(df) prepare_deep = DensePreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols) X_deep = prepare_deep.fit_transform(df) wide = Wide(wide_dim=X_wide.shape[1], pred_dim=3) deepdense = DeepDense( hidden_layers=[64, 32], dropout=[0.2, 0.2], deep_column_idx=prepare_deep.deep_column_idx, embed_input=prepare_deep.embeddings_input, continuous_cols=continuous_cols, ) model = WideDeep(wide=wide, deepdense=deepdense, pred_dim=3) model.compile(method="multiclass", metrics=[Accuracy, F1Score])
("occupation", 16), ("native-country", 32)] deep_continuous_column_list = ["age", "hours-per-week"] # Target target_column_list = ["income_label"] target = train_df[target_column_list].values """ Preprocessing """ # Wide wide_preprocessor = WidePreprocessor(wide_cols=wide_columns_list, crossed_cols=wide_cross_column_list) x_wide = wide_preprocessor.fit_transform(train_df) # Deep tab_preprocessor = TabPreprocessor( embed_cols=deep_embedding_columns_list, continuous_cols=deep_continuous_column_list) x_deep = tab_preprocessor.fit_transform(train_df) """ Model 구조 정의 """ # Model wide = Wide(wide_dim=np.unique(x_wide).shape[0], pred_dim=1) deeptabular = TabMlp(mlp_hidden_dims=[64, 32], column_idx=tab_preprocessor.column_idx, embed_input=tab_preprocessor.embeddings_input, continuous_cols=deep_continuous_column_list)