def prepare_data(results_dir):
    """Load the dataset, split columns into embedding vs. continuous groups,
    and fit the tabular preprocessor.

    Parameters
    ----------
    results_dir:
        Directory passed through to ``read_best_model_args`` to recover the
        hyperparameters of the best previous run.

    Returns
    -------
    Tuple ``(mlp_hidden_dims_same, args, prepare_tab, X_train, X_test,
    y_train, y_test)``.
    """
    train, test = load_dataset()

    # 200 is an arbitrary cut-off: columns with fewer distinct values are
    # treated as categorical (-> embeddings), the rest as continuous.
    # BUGFIX: the original condition read
    #     dtype == "O" or nunique() < 200 and col != "target"
    # and, because ``and`` binds tighter than ``or``, an object-dtype
    # "target" column would slip into the embedding columns. The parentheses
    # below make the intended exclusion of "target" explicit.
    cat_embed_cols = [
        col
        for col in train.columns
        if (train[col].dtype == "O" or train[col].nunique() < 200)
        and col != "target"
    ]
    num_cols = [c for c in train.columns if c not in cat_embed_cols + ["target"]]

    args = read_best_model_args(results_dir)

    prepare_tab = TabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=num_cols,
        scale=True,
        for_tabtransformer=True,
    )
    X_train = prepare_tab.fit_transform(train)
    y_train = train.target.values
    X_test = prepare_tab.transform(test)
    y_test = test.target.values

    # used by callers to build "same"-sized MLP hidden dims from the number
    # of embedding columns
    mlp_hidden_dims_same = len(cat_embed_cols)

    return mlp_hidden_dims_same, args, prepare_tab, X_train, X_test, y_train, y_test
embed_dropout=args.embed_dropout, ) model = WideDeep(deeptabular=deeptabular) return model if __name__ == "__main__": model_name = "tabresnet" results_dir, models_dir = set_dirs(model_name) prepare_tab, X_train, X_test, y_train, y_test = prepare_data(results_dir) args = read_best_model_args(results_dir) model = set_model(args, prepare_tab) run_experiment_and_save( model, model_name, results_dir, models_dir, args, X_train, X_test, y_train, y_test, )
# 200 is rather arbitraty but one has to make a decision as to how to decide # if something will be represented as embeddings or continuous in a "kind-of" # automated way cat_embed_cols = [] for col in train.columns: if train[col].dtype == "O" or train[col].nunique() < 200 and col != "target": cat_embed_cols.append(col) # all columns will be represented by embeddings prepare_tab = TabPreprocessor(embed_cols=cat_embed_cols, for_tabtransformer=True) X_train = prepare_tab.fit_transform(train) y_train = train.target.values X_test = prepare_tab.transform(test) y_test = test.target.values args = read_best_model_args(RESULTS_DIR) if args.mlp_hidden_dims == "same": mlp_hidden_dims = [ len(cat_embed_cols) * args.input_dim, len(cat_embed_cols) * args.input_dim, (len(cat_embed_cols) * args.input_dim) // 2, ] elif args.mlp_hidden_dims == "None": mlp_hidden_dims = None else: mlp_hidden_dims = eval(args.mlp_hidden_dims) deeptabular = TabTransformer( column_idx=prepare_tab.column_idx, embed_input=prepare_tab.embeddings_input,
results_d["trainer_history"] = trainer.history results_d["trainer_lr_history"] = trainer.lr_history results_d["runtime"] = runtime with open(results_dir / filename, "wb") as f: pickle.dump(results_d, f) if __name__ == "__main__": model_name = "tabmlp" results_dir, models_dir = set_dirs(model_name) prepare_tab, X_train, X_test, y_train, y_test = prepare_data(results_dir) args = read_best_model_args(results_dir, exp_idx=0) model = set_model(args, prepare_tab) run_experiment_and_save( model, model_name, results_dir, models_dir, args, X_train, X_test, y_train, y_test, fl_exp_indx=0, )