class AGLearner(object):
    """Array-in / array-out wrapper around an AutoGluon ``TabularPredictor``.

    Feature arrays are turned into a ``pandas.DataFrame`` whose columns are
    named ``x_0 .. x_{d-1}``; the target is stored in a column named
    ``'target'``.

    Parameters
    ----------
    path : str or None
        Forwarded to ``TabularPredictor(path=...)`` when fitting.
    problem_type, eval_metric, verbosity, sample_weight, weight_evaluation, groups :
        Forwarded unchanged to the ``TabularPredictor`` constructor. The
        defaults mirror the ``TabularPredictor`` defaults.
    fit_kwargs : dict or None
        Extra keyword arguments for ``TabularPredictor.fit``.
    **kwargs :
        Extra keyword arguments for the ``TabularPredictor`` constructor.
    """

    def __init__(self, path=None, problem_type=None, eval_metric=None,
                 verbosity=2, sample_weight=None, weight_evaluation=False,
                 groups=None, fit_kwargs=None, **kwargs):
        self.path = path
        self.problem_type = problem_type
        self.eval_metric = eval_metric
        self.verbosity = verbosity
        self.sample_weight = sample_weight
        self.weight_evaluation = weight_evaluation
        self.groups = groups
        # Copy so that a dict shared by the caller is never mutated through us.
        self.fit_kwargs = dict(fit_kwargs) if fit_kwargs else {}
        self.kwargs = kwargs

    def fit(self, x, y):
        """Fit a new ``TabularPredictor`` on features ``x`` and target ``y``.

        ``x`` and ``y`` must expose ``.shape`` and support ``[:, None]``
        indexing (e.g. numpy arrays); 1-D inputs are promoted to a single
        column. The fitted predictor is stored on ``self._model`` and the
        generated feature column names on ``self.x_columns``.
        """
        x = x if len(x.shape) > 1 else x[:, None]
        y = y if len(y.shape) > 1 else y[:, None]

        x_columns = ['x_%d' % i for i in range(x.shape[1])]
        self.x_columns = x_columns
        y_column = 'target'
        columns = x_columns + [y_column]

        train_data = pd.DataFrame(np.concatenate([x, y], axis=1),
                                  columns=columns)
        self._model = TabularPredictor(
            y_column,
            problem_type=self.problem_type,
            eval_metric=self.eval_metric,
            path=self.path,
            verbosity=self.verbosity,
            sample_weight=self.sample_weight,
            weight_evaluation=self.weight_evaluation,
            groups=self.groups,
            **self.kwargs
        ).fit(train_data, **self.fit_kwargs)

    def predict(self, x):
        """Return the wrapped predictor's predictions for features ``x``.

        Raises ``AssertionError`` when no model has been fitted/loaded or when
        the number of columns in ``x`` differs from the known feature columns.
        """
        assert hasattr(self, '_model'), 'The model has not been fitted yet'

        x = x if len(x.shape) > 1 else x[:, None]
        if not hasattr(self, 'x_columns'):
            # A learner restored through ``load`` has no recorded column
            # names, so regenerate them from the incoming data.
            self.x_columns = ['x_%d' % i for i in range(x.shape[1])]
        assert x.shape[1] == len(
            self.x_columns
        ), 'x has a shape incompatible with training data'

        data = pd.DataFrame(x, columns=self.x_columns)
        y_pred = self._model.predict(data, as_pandas=False)
        return y_pred

    @property
    def feature_importances_(self):
        """Per-feature ``'importance'`` values, ordered like ``x_columns``.

        Best effort: returns an empty list when the importances cannot be
        computed (for example because the learner has not been fitted).
        """
        try:
            importance_df = self._model.feature_importance()
            return [
                importance_df.at[col, 'importance'] for col in self.x_columns
            ]
        except Exception:
            # Deliberately tolerant, but no longer swallows KeyboardInterrupt
            # or SystemExit the way a bare ``except:`` did.
            return []

    def save(self, path=None):
        """Persist the fitted predictor by calling ``TabularPredictor.save()``.

        ``path`` is accepted for interface compatibility only and is not used;
        ``save()`` is called without a destination argument.
        """
        self._model.save()

    @classmethod
    def load(cls, path):
        """Create a learner wrapping the predictor stored at ``path``."""
        # ``cls`` (not the hard-coded class) so subclasses load as themselves.
        learner = cls(path=path)
        learner._model = TabularPredictor.load(path)
        return learner
def test_advanced_functionality():
    """Exercise the non-core ``TabularPredictor`` API on a small benchmark.

    Fits a predictor on the first 100 rows of the AdultIncome dataset
    (obtained through ``load_data``) and then checks, in order: leaderboards,
    feature importance, feature transformation, model persistence,
    save/load, ``refit_full`` and model deletion. The steps share one
    predictor and depend on each other's side effects, so their order matters.
    """
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
               'name': 'AdultIncomeBinaryClassification',
               'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    if fast_benchmark:
        # Subsample so the test stays fast.
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    # Delete AutoGluon output directory so information from previous runs is gone.
    shutil.rmtree(savedir, ignore_errors=True)

    # --- Leaderboards ---
    predictor = TabularPredictor(label=label, path=savedir).fit(train_data)
    leaderboard = predictor.leaderboard(data=test_data)
    extra_metrics = ['accuracy', 'roc_auc', 'log_loss']
    leaderboard_extra = predictor.leaderboard(data=test_data, extra_info=True,
                                              extra_metrics=extra_metrics)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    # The requested extra metrics must show up as leaderboard columns.
    assert set(leaderboard_extra.columns).issuperset(set(extra_metrics))
    num_models = len(predictor.get_model_names())

    # --- Feature importance / feature transformation ---
    feature_importances = predictor.feature_importance(data=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.index) == original_features
    assert set(feature_importances.columns) == {'importance', 'stddev', 'p_value', 'n', 'p99_high', 'p99_low'}
    predictor.transform_features()
    predictor.transform_features(data=test_data)
    predictor.info()

    # --- Persisting models in memory ---
    assert predictor.get_model_names_persisted() == []  # Nothing is persisted during training
    assert predictor.unpersist_models() == []  # So nothing can be unpersisted
    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # All models are persisted
    # Repeated calls must not persist anything new.
    assert predictor.persist_models(models='all', max_memory=None) == []
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # All models were unpersisted

    # Unknown model names raise when persisting ...
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])
    assert predictor.get_model_names_persisted() == []
    # ... but unpersisting them is a no-op.
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    # --- Save / load ---
    predictor.persist_models(models='all', max_memory=None)
    # Save while models are persisted: they must not be persisted once loaded.
    predictor.save()
    predictor_loaded = TabularPredictor.load(predictor.path)  # Loading works
    leaderboard_loaded = predictor_loaded.leaderboard(data=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Not persisted after loading

    # --- refit_full ---
    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(data=test_data, model=model)
    predictor.refit_full()  # Already-refit models must not be refit again.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    # --- Deleting models ---
    predictor.delete_models(models_to_keep=[])  # No dry_run=False: must not delete anything
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(data=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # dry_run=False really deletes
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(data=test_data)
    except Exception:
        # Expected: with every model deleted there is nothing to predict with.
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
predictor = TabularPredictor(path=os.path.join(args.save_dir, args.model_type, time_str), problem_type=train_dataset.problem_type, eval_metric=train_dataset.metric, label=label_columns[0]) if args.ensemble_type == 'weighted': predictor.fit(concat_df[feature_columns + [label_columns[0]]], feature_generator=feature_generator, hyperparameters=tabular_hparams) else: predictor.fit(concat_df[feature_columns + [label_columns[0]]], feature_generator=feature_generator, num_bag_folds=5, num_stack_levels=1, hyperparameters=tabular_hparams) predictor.save() else: predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type, time_str), problem_type=train_dataset.problem_type, eval_metric=train_dataset.metric, label=label_columns[0]) predictor.fit(concat_df[feature_columns + [label_columns[0]]], presets='electra_base_late_fusion_concate_e10_avg3') predictor.save( os.path.join(args.save_dir, args.model_type, time_str, 'text_prediction')) predictions = predictor.predict(competition_df, as_pandas=True) predictions.to_csv( os.path.join(args.save_dir, args.model_type, time_str, 'pred.csv'))
# Upper bound on the number of n-gram features produced by the vectorizer.
_MAX_NGRAM = 300

# model name -> sub-directory (below ``pre_computed_embeddings/<dataset>``)
# that holds the matching ``train.npy`` / ``test.npy`` files.
_EMBEDDING_SUBDIRS = {
    'pre_embedding': 'pretrain_text_embedding',
    'tune_embedding_multimodal': 'multimodal_embedding',
    'tune_embedding_text': 'tuned_text_embedding',
}


def _tabular_preset_fit_kwargs(tabular_presets, allow_3fold=False):
    """Map a ``tabular_presets`` name to ``TabularPredictor.fit`` keyword arguments."""
    if tabular_presets == 'best_quality':
        return {'presets': tabular_presets}
    if tabular_presets == '5fold_1stack':
        return {'num_bag_folds': 5, 'num_stack_levels': 1}
    if allow_3fold and tabular_presets == '3fold_1stack':
        return {'num_bag_folds': 3, 'num_stack_levels': 1}
    if tabular_presets == 'no':
        return {}
    raise NotImplementedError(
        'Unsupported tabular_presets: {!r}'.format(tabular_presets))


def _fit_tabular_predictor(predictor, train_data, tabular_presets,
                           allow_3fold=False, **fit_kwargs):
    """Fit ``predictor`` with the preset-derived options plus ``fit_kwargs``."""
    predictor.fit(train_data=train_data,
                  excluded_model_types=TABULAR_EXCLUDE_MODELS,
                  **_tabular_preset_fit_kwargs(tabular_presets, allow_3fold),
                  **fit_kwargs)


def _fit_text_predictor(save_dir, label, problem_type, metric, train_data,
                        text_presets, num_gpus, seed):
    """Create and fit a ``TextPredictor`` using the ``text_presets`` hyper-parameters."""
    predictor = TextPredictor(path=save_dir, label=label,
                              problem_type=problem_type, eval_metric=metric)
    hparams = ag_text_presets.create(text_presets)
    if len(train_data) > 500000:
        # Large training sets get the ``set_epoch3`` hyper-parameter override.
        hparams = set_epoch3(hparams)
    predictor.fit(train_data=train_data, hyperparameters=hparams,
                  num_gpus=num_gpus, seed=seed)
    return predictor


def _ngram_feature_generator(**generator_kwargs):
    """Build the feature generator with the shared 1-3 gram count vectorizer."""
    vectorizer = CountVectorizer(min_df=30, ngram_range=(1, 3),
                                 max_features=_MAX_NGRAM, dtype=np.uint8)
    return AutoMLPipelineFeatureGenerator(vectorizer=vectorizer,
                                          **generator_kwargs)


def _select_columns(data, feature_columns, label_columns):
    """Keep ``feature_columns`` plus whichever ``label_columns`` exist in ``data``."""
    present_labels = [col for col in label_columns if col in data.columns]
    return data[feature_columns + present_labels]


def _join_embeddings(data, features):
    """Append the embedding matrix ``features`` to ``data`` as ``pre_feat<i>`` columns."""
    # NOTE(review): ``join`` aligns on the index *before* it is reset, so this
    # presumably relies on ``data`` already having a 0..n-1 index - confirm.
    columns = [f'pre_feat{i}' for i in range(features.shape[1])]
    joined = data.join(pd.DataFrame(features, columns=columns))
    joined.reset_index(drop=True, inplace=True)
    return joined


def train_model(dataset_name, text_presets, save_dir,
                model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    """Train one benchmark configuration and write its results to ``save_dir``.

    Parameters
    ----------
    dataset_name : name looked up in ``dataset_registry``.
    text_presets : preset name for the text model / multimodal hyper-parameters.
    save_dir : predictor path and output directory for all result files.
    model : which pipeline to run; one of ``'ag_tabular_quick'``,
        ``'ag_tabular_without_text'``, ``'ag_tabular_old'``, ``'ag_text_only'``,
        ``'ag_text_multimodal'``, ``'pre_embedding'``,
        ``'tune_embedding_multimodal'``, ``'tune_embedding_text'``,
        ``'tabular_multimodal'`` or ``'tabular_multimodal_just_table'``.
    tabular_presets : ``'best_quality'``, ``'5fold_1stack'``, ``'no'`` and, for
        the ``tabular_multimodal*`` models only, ``'3fold_1stack'``.
    num_gpus : forwarded to ``TextPredictor.fit``.
    get_competition_results : evaluate on the ``'competition'`` split instead
        of ``'test'``; no score file is written in that case.
    seed : seed for ``set_seed``, the column-type inference split and the
        text predictor.

    Raises
    ------
    NotImplementedError
        For an unknown ``model`` or an unsupported ``tabular_presets``.

    Files written to ``save_dir``: ``test_prediction.csv``,
    ``test_prediction_prob.csv`` (binary/multiclass only), ``ground_truth.csv``
    (when the evaluation data has the label column), ``test_score.json``
    (unless ``get_competition_results``) and ``speed_stats.json``.
    """
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    eval_split = 'competition' if get_competition_results else 'test'
    test_dataset = dataset_registry.create(dataset_name, eval_split)

    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    label = label_columns[0]
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type

    # The 95/5 split is only used to infer the column types.
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, _ = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)

    train_data = train_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]

    train_tic = time.time()
    if model == 'ag_tabular_quick':
        feature_generator = _ngram_feature_generator()
        predictor = TabularPredictor(label=label, path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] != _TEXT
        ]
        train_data = train_data[no_text_feature_columns + label_columns]
        # Evaluation data may lack the label columns (competition split).
        test_data = _select_columns(test_data, no_text_feature_columns,
                                    label_columns)
        predictor = TabularPredictor(path=save_dir, label=label,
                                     problem_type=problem_type,
                                     eval_metric=metric)
        _fit_tabular_predictor(predictor, train_data, tabular_presets)
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir, label=label,
                                     problem_type=problem_type,
                                     eval_metric=metric)
        _fit_tabular_predictor(predictor, train_data, tabular_presets)
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = _select_columns(test_data, text_feature_columns,
                                    label_columns)
        predictor = _fit_text_predictor(save_dir, label, problem_type, metric,
                                        train_data, text_presets, num_gpus,
                                        seed)
    elif model == 'ag_text_multimodal':
        predictor = _fit_text_predictor(save_dir, label, problem_type, metric,
                                        train_data, text_presets, num_gpus,
                                        seed)
    elif model in _EMBEDDING_SUBDIRS:
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        embedding_dir = os.path.join(_CURR_DIR, 'pre_computed_embeddings',
                                     dataset_name, _EMBEDDING_SUBDIRS[model])
        train_features = np.load(os.path.join(embedding_dir, 'train.npy'))
        test_features = np.load(os.path.join(embedding_dir, 'test.npy'))
        train_data = _join_embeddings(train_data, train_features)
        test_data = _join_embeddings(test_data, test_features)
        predictor = TabularPredictor(path=save_dir, label=label,
                                     problem_type=problem_type,
                                     eval_metric=metric)
        _fit_tabular_predictor(predictor, train_data, tabular_presets,
                               feature_generator=feature_generator)
    elif model == 'tabular_multimodal' or model == 'tabular_multimodal_just_table':
        if model == 'tabular_multimodal':
            feature_generator = _ngram_feature_generator(
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            feature_generator = _ngram_feature_generator(
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir, label=label,
                                     problem_type=problem_type,
                                     eval_metric=metric)
        _fit_tabular_predictor(predictor, train_data, tabular_presets,
                               allow_3fold=True,
                               hyperparameters=hyperparameters,
                               feature_generator=feature_generator)
    else:
        raise NotImplementedError('Unsupported model: {!r}'.format(model))
    train_toc = time.time()

    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    inference_toc = time.time()
    # Saved after the timer stops so serialization cost is not reported as
    # inference time.
    predictor.save()

    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    if label in test_data.columns:
        # Only export ground truth when the evaluation data actually has it.
        gt = test_data[label]
        gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
# Train a 5-fold, 1-stack-level tabular ensemble on train+test data combined
# and export class probabilities for the competition data.
label_columns = train_dataset.label_columns
train_data = train_dataset.data
test_data = test_dataset.data

# Stack both labelled splits into one frame with a fresh 0..n-1 index.
concat_df = pd.concat([train_data, test_data]).reset_index(drop=True)
competition_df = competition_dataset.data[feature_columns]

# Only the 'base' and 'large' text backbones are supported.
if args.model_type not in ('base', 'large'):
    raise NotImplementedError
if args.model_type == 'base':
    text_hparams = electra_base_late_fusion_concate_e10_avg3()
else:
    text_hparams = electra_large_late_fusion_concate_e10_avg3()
tabular_hparams = get_tabular_hparams(text_hparams)

# Each run gets its own UTC-timestamped output directory.
time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
output_dir = os.path.join(args.save_dir, args.model_type, time_str)

predictor = TabularPredictor(
    label=label_columns[0],
    path=output_dir,
    problem_type=train_dataset.problem_type,
    eval_metric='log_loss',
)
fit_columns = feature_columns + [label_columns[0]]
predictor.fit(
    concat_df[fit_columns],
    hyperparameters=tabular_hparams,
    feature_generator=feature_generator,
    num_bag_folds=5,
    num_stack_levels=1,
)
predictor.save()

predictions = predictor.predict_proba(competition_df, as_pandas=True)
predictions.to_csv(os.path.join(output_dir, 'pred_probabilities.csv'))