def estimate_importance(dataset, model_name):
    # Compute and cache feature-importance scores for one (dataset, model) pair.
    out_path = os.path.join('feature_importance', dataset, model_name, 'importance.csv')
    if os.path.exists(out_path):
        print(f'Found {dataset}, {model_name}')
        return
    # stat_df (model-name x dataset table of remote result paths) and
    # download_path (local cache directory) are module-level globals.
    model_remote_path = stat_df.loc[model_name, dataset]
    postfix = '/test_score.json'
    remote_dir_name = model_remote_path[:-len(postfix)]

    def downloadDirectoryFroms3(bucketName, remoteDirectoryName, local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            # Use a name distinct from the outer download_path to avoid shadowing.
            obj_path = os.path.join(local_dir_path, obj.key)
            os.makedirs(os.path.dirname(obj_path), exist_ok=True)
            bucket.download_file(obj.key, obj_path)

    local_dir_name = os.path.join(download_path, remote_dir_name)
    if not os.path.exists(local_dir_name):
        downloadDirectoryFroms3('automl-mm-bench', remote_dir_name, download_path)
    test_dataset = dataset_registry.create(dataset, 'test')
    if model_name == MULTIMODAL_TEXT_MODEL_NAME:
        predictor = MultiModalTextModel.load(os.path.join(local_dir_name, 'saved_model'))
    elif model_name in (TABULAR_MODEL_NAME, STACK_ENSEMBLE_MODEL_NAME):
        predictor = TabularPredictor.load(local_dir_name)
    else:
        raise NotImplementedError
    sample_size = min(len(test_dataset.data), 1000)
    if model_name == TABULAR_MODEL_NAME:
        importance_df = predictor.feature_importance(
            test_dataset.data[test_dataset.feature_columns + test_dataset.label_columns],
            subsample_size=sample_size)
    else:
        importance_df = compute_permutation_feature_importance(
            test_dataset.data[test_dataset.feature_columns],
            test_dataset.data[test_dataset.label_columns[0]],
            predict_func=predictor.predict,
            eval_metric=get_metric(test_dataset.metric),
            subsample_size=sample_size,
            num_shuffle_sets=3)
    os.makedirs(os.path.join('feature_importance', dataset, model_name), exist_ok=True)
    importance_df.to_csv(out_path)
    print(importance_df)
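
# Usage sketch (assumption): a driver that populates the module-level globals
# and sweeps every (dataset, model) pair. The file name 'agg_results.csv' and
# the download directory are hypothetical; adapt to wherever the aggregated
# benchmark results table actually lives.
def _example_estimate_importance():
    global stat_df, download_path
    stat_df = pd.read_csv('agg_results.csv', index_col=0)  # rows: models, cols: datasets
    download_path = 'downloaded_models'
    for dataset in stat_df.columns:
        for model_name in (MULTIMODAL_TEXT_MODEL_NAME, TABULAR_MODEL_NAME):
            estimate_importance(dataset, model_name)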
def extract_pretrained_embedding(dataset):
    # Configure the text model so no actual fine-tuning happens: freeze all
    # layers, disable updates, run a single epoch at lr=0, and cast categorical
    # columns to text. The network then just emits pretrained embeddings.
    hyperparameters = ag_text_presets.create('default')
    search_space = hyperparameters['models']['MultimodalTextModel']['search_space']
    search_space['model.num_trainable_layers'] = 0
    search_space['model._disable_update'] = True
    search_space['optimization.num_train_epochs'] = 1
    search_space['preprocessing.categorical.convert_to_text'] = True
    search_space['optimization.lr'] = 0.
    seed = 123
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=train_dataset.label_columns,
        problem_type=train_dataset.problem_type)
    text_feature_columns = [col_name for col_name in train_dataset.feature_columns
                            if column_types[col_name] == 'text']
    train_text_only_data = train_dataset.data[text_feature_columns
                                              + train_dataset.label_columns]
    test_text_only_data = test_dataset.data[text_feature_columns
                                            + test_dataset.label_columns]
    # Fit on a tiny sample; with updates disabled this only initializes the model.
    sampled_train_data = train_text_only_data.sample(10)
    predictor = TextPredictor(label=train_dataset.label_columns[0])
    predictor.fit(train_data=sampled_train_data,
                  column_types=column_types,
                  hyperparameters=hyperparameters)
    train_features = predictor.extract_embedding(train_text_only_data)
    test_features = predictor.extract_embedding(test_text_only_data)
    save_base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    os.makedirs(save_base_dir, exist_ok=True)
    np.save(os.path.join(save_base_dir, 'train.npy'), train_features)
    np.save(os.path.join(save_base_dir, 'test.npy'), test_features)
    with open(os.path.join(save_base_dir, 'text_columns.json'), 'w') as out_f:
        json.dump(text_feature_columns, out_f)
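
# Usage sketch (assumption): reload the arrays written above for one dataset
# and sanity-check that the row counts line up with the raw splits.
def _example_load_pretrained_embedding(dataset):
    base = f'embeddings/{dataset}/pretrain_text_embedding'
    train_emb = np.load(os.path.join(base, 'train.npy'))
    test_emb = np.load(os.path.join(base, 'test.npy'))
    with open(os.path.join(base, 'text_columns.json'), 'r') as f:
        text_cols = json.load(f)
    # One embedding row per example in each split.
    assert train_emb.shape[0] == len(dataset_registry.create(dataset, 'train').data)
    assert test_emb.shape[0] == len(dataset_registry.create(dataset, 'test').data)
    return train_emb, test_emb, text_cols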
def extract_finetuned_embedding(dataset, stat_df, verify_performance):
    # The unnamed first column of the results CSV holds the model names.
    model_l = stat_df['Unnamed: 0']
    multimodal_idx = None
    text_only_idx = None
    for i, model in enumerate(model_l):
        if model == MULTIMODAL_TEXT_MODEL_NAME:
            multimodal_idx = i
        elif model == TEXT_MODEL_NAME:
            text_only_idx = i
    if multimodal_idx is None or text_only_idx is None:
        raise NotImplementedError("Model not found!")
    multimodal_model_remote_path = stat_df[dataset].iloc[multimodal_idx]
    text_model_remote_path = stat_df[dataset].iloc[text_only_idx]
    postfix = '/test_score.json'
    multimodal_remote_dir_name = multimodal_model_remote_path[:-len(postfix)]
    text_remote_dir_name = text_model_remote_path[:-len(postfix)]
    print(multimodal_remote_dir_name)

    def downloadDirectoryFroms3(bucketName, remoteDirectoryName, local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            obj_path = os.path.join(local_dir_path, obj.key)
            os.makedirs(os.path.dirname(obj_path), exist_ok=True)
            bucket.download_file(obj.key, obj_path)

    downloadDirectoryFroms3('automl-mm-bench', multimodal_remote_dir_name,
                            args.model_download_path)
    downloadDirectoryFroms3('automl-mm-bench', text_remote_dir_name,
                            args.model_download_path)

    # Multimodal embedding
    multimodal_text_nn = MultiModalTextModel.load(
        os.path.join(args.model_download_path, multimodal_remote_dir_name,
                     'saved_model'))
    print(multimodal_text_nn)
    with open(os.path.join(args.model_download_path, multimodal_remote_dir_name,
                           'test_score.json'), 'r') as in_f:
        model_test_score = json.load(in_f)
    multimodal_loaded_score_val = list(model_test_score.values())[0]
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_features = multimodal_text_nn.extract_embedding(train_dataset.data)
    test_features = multimodal_text_nn.extract_embedding(test_dataset.data)
    if verify_performance:
        # The reloaded model scores the same test data, so the result should
        # reproduce the stored score exactly.
        multimodal_pred_score = multimodal_text_nn.evaluate(test_dataset.data)
        assert multimodal_pred_score == multimodal_loaded_score_val, \
            f"MultiModalText NN: Predicted score={multimodal_pred_score}, " \
            f"Loaded score={multimodal_loaded_score_val}, " \
            f"Dataset={dataset}"
    os.makedirs(f'embeddings/{dataset}/multimodal_embedding', exist_ok=True)
    np.save(os.path.join(f'embeddings/{dataset}/multimodal_embedding', 'train.npy'),
            train_features)
    np.save(os.path.join(f'embeddings/{dataset}/multimodal_embedding', 'test.npy'),
            test_features)

    # Text embedding
    text_nn = MultiModalTextModel.load(
        os.path.join(args.model_download_path, text_remote_dir_name, 'saved_model'))
    with open(os.path.join(args.model_download_path, text_remote_dir_name,
                           'test_score.json'), 'r') as in_f:
        model_test_score = json.load(in_f)
    text_loaded_score_val = list(model_test_score.values())[0]
    train_text_embeddings = text_nn.extract_embedding(train_dataset.data)
    test_text_embeddings = text_nn.extract_embedding(test_dataset.data)
    if verify_performance:
        text_pred_score = text_nn.evaluate(test_dataset.data)
        assert text_pred_score == text_loaded_score_val, \
            f"Text-only network: Predicted score={text_pred_score}, " \
            f"Loaded score={text_loaded_score_val}, " \
            f"Dataset={dataset}"
    os.makedirs(f'embeddings/{dataset}/tuned_text_embedding', exist_ok=True)
    np.save(os.path.join(f'embeddings/{dataset}/tuned_text_embedding', 'train.npy'),
            train_text_embeddings)
    np.save(os.path.join(f'embeddings/{dataset}/tuned_text_embedding', 'test.npy'),
            test_text_embeddings)
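
# Hypothetical invocation: stat_df is read from an aggregated results CSV whose
# unnamed first column holds model names and whose remaining columns are
# datasets. The file name is an assumption.
def _example_extract_finetuned_embedding():
    stat_df = pd.read_csv('agg_text_results.csv')
    extract_finetuned_embedding('women_clothing_review', stat_df,
                                verify_performance=True)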
def train_model(dataset_name, text_presets, save_dir, model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    if get_competition_results:
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    # Split off 5% of the training data purely to infer column types.
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                       max_features=MAX_NGRAM, dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = [col_name for col_name in feature_columns
                                   if column_types[col_name] != _TEXT]
        train_data = train_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5, num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [col_name for col_name in feature_columns
                                if column_types[col_name] == _TEXT]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            # Cap the epoch count on very large datasets.
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            embedding_subdir = 'pretrain_text_embedding'
        elif model == 'tune_embedding_multimodal':
            embedding_subdir = 'multimodal_embedding'
        else:
            embedding_subdir = 'tuned_text_embedding'
        train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                              embedding_subdir, 'train.npy'))
        test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                             embedding_subdir, 'test.npy'))
        # Append the embedding vectors as extra numeric columns.
        train_data = train_data.join(pd.DataFrame(
            train_features,
            columns=[f'pre_feat{i}' for i in range(train_features.shape[1])]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(pd.DataFrame(
            test_features,
            columns=[f'pre_feat{i}' for i in range(test_features.shape[1])]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        MAX_NGRAM = 300
        if model == 'tabular_multimodal':
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
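
# Example invocation (illustrative values only; the dataset, model, and preset
# names must match the registry and the branches handled above):
def _example_train_model():
    train_model('imdb_genre_prediction',
                text_presets='default',
                save_dir='results/imdb_genre_prediction/tabular_multimodal',
                model='tabular_multimodal',
                tabular_presets='5fold_1stack',
                num_gpus=1,
                seed=123)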
"product_sentiment_machine_hack", "google_qa_question_type_reason_explanation", "google_qa_answer_type_reason_explanation", "women_clothing_review", "melbourne_airbnb", "ae_price_prediction", "mercari_price_suggestion100K", "jigsaw_unintended_bias100K", "imdb_genre_prediction", "fake_job_postings2", "kick_starter_funding", "jc_penney_products", "wine_reviews", "news_popularity2", "news_channel" ] embedding_type = 'pretrain_text_embedding' base_dir = 'embeddings' out_dir = 'pre_embedding_data' for dataset in dataset_l: print('Processing', dataset) train_dataset = dataset_registry.create(dataset, 'train') test_dataset = dataset_registry.create(dataset, 'test') train_features = np.load( os.path.join(base_dir, dataset, embedding_type, 'train.npy')) test_features = np.load( os.path.join(base_dir, dataset, embedding_type, 'test.npy')) with open( os.path.join(base_dir, dataset, embedding_type, 'text_columns.json'), 'r') as in_f: text_columns = json.load(in_f) other_columns = [ col for col in train_dataset.feature_columns if col not in text_columns ] train_data = train_dataset.data[other_columns + train_dataset.label_columns] test_data = test_dataset.data[other_columns + train_dataset.label_columns]
def train_baseline(dataset_name, save_dir, baseline, time_limit_sec=None,
                   w2v_epochs=None, embed_dir=None, seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    label_column = label_columns[0]
    eval_metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data = train_dataset.data[feature_columns + label_columns]
    test_data = test_dataset.data[feature_columns + label_columns]
    print("Running: ", baseline)

    # Train baseline:
    train_tic = time.time()
    if baseline == 'h2o_word2vec':
        h2o.init()
        w2v_model = train_w2v(train_data, epochs=w2v_epochs, save_dir=save_dir)
        train_data = process_w2v(train_data, w2v_model)
        test_data = process_w2v(test_data, w2v_model)
    elif baseline == 'h2o_embedding':
        train_data, test_data = get_embedded(train_data, test_data, dataset_name,
                                             embed_dir=embed_dir)
    print("Train/test data shapes: ")
    print(train_data.shape)
    print(test_data.shape)
    h2o_model = H2OBaseline()
    num_models_trained, fit_time = h2o_model.fit(train_data=train_data,
                                                 label_column=label_column,
                                                 problem_type=problem_type,
                                                 eval_metric=eval_metric,
                                                 time_limit_sec=time_limit_sec,
                                                 output_directory=save_dir)
    train_toc = time.time()
    print("H2O fit runtime: %s" % fit_time)

    # Predict with baseline:
    inference_tic = time.time()
    y_pred, y_prob, predict_time = h2o_model.predict(
        test_data.drop(columns=[label_column]), pred_class_and_proba=True)
    inference_toc = time.time()
    print("H2O predict runtime: %s" % predict_time)

    # Evaluate predictions: roc_auc scores against the positive-class
    # probability, log_loss against the full probability matrix.
    preds_toevaluate = y_pred
    if eval_metric is not None:
        if eval_metric == 'roc_auc':
            preds_toevaluate = y_prob.iloc[:, 1]
        elif eval_metric == 'log_loss':
            preds_toevaluate = y_prob
    gt = test_data[label_column]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    y_pred.to_csv(os.path.join(save_dir, 'h2o_test_prediction.csv'))
    if problem_type == MULTICLASS or problem_type == BINARY:
        y_prob.to_csv(os.path.join(save_dir, 'h2o_test_prediction_prob.csv'))
    if len(gt) != len(y_pred):
        print("WARNING: lengths of gt and y_pred don't match!")
        print("len(gt) ", len(gt))
        print("len(y_pred) ", len(y_pred))
        print("test_data.shape ", test_data.shape)
        if len(y_pred) > len(gt):
            print("WARNING: truncating predictions to the number of test labels ...")
            y_pred = y_pred[:len(gt)]
            y_prob = y_prob[:len(gt)]
    # An unfitted TabularPredictor is used only to obtain the metric scorer.
    scorer = TabularPredictor(label=label_column,
                              problem_type=problem_type,
                              eval_metric=eval_metric)
    # scorer.fit(train_data[:200], hyperparameters={'GBM': {'num_boost_round': 1}},
    #            presets='ignore_text')
    score = scorer._learner.eval_metric(gt, preds_toevaluate)
    print("H2O score: ", score)
    with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
        json.dump({eval_metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
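
# Example invocation (illustrative values; the 'h2o_results/...' output
# directory is a hypothetical path):
def _example_train_baseline():
    train_baseline('women_clothing_review',
                   save_dir='h2o_results/women_clothing_review/h2o_word2vec',
                   baseline='h2o_word2vec',
                   time_limit_sec=3600,
                   w2v_epochs=10,
                   seed=123)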