def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label_column': 'y',
               'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data.
    # The toyClassification dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']
    # Additional warning that would have occurred if the ValueError had not been triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them): ['distractioncolumn1', 'distractioncolumn2']
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=dataset['label_column'], output_directory=savedir, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data is missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
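# Usage sketch (hypothetical arguments, not from the original test): fit_args is
# forwarded verbatim to task.fit(), so a quick smoke test could restrict training
# to a single fast model:
# run_tabular_benchmark_toy(fit_args={'hyperparameters': {'GBM': {}}, 'verbosity': 2})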
def run(X_train, y_train, label: str, fit_args: dict = None):
    if fit_args is None:
        fit_args = {}
    X_train = X_train.copy()  # avoid mutating the caller's DataFrame when attaching the label column
    X_train[label] = y_train
    predictor = ag_task.fit(
        train_data=X_train,
        label=label,
        **fit_args,
    )
    return predictor
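# Usage sketch (hypothetical data; real training needs more rows than shown):
# any pandas DataFrame/Series pair works, with the label attached internally:
# import pandas as pd
# X = pd.DataFrame({'x1': [1, 2, 3, 4], 'x2': ['a', 'b', 'a', 'b']})
# y = pd.Series([0, 1, 0, 1])
# predictor = run(X, y, label='target', fit_args={'verbosity': 2})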
hyperparams = {
    'NN': {  # neural network search space
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {  # LightGBM search space
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}
predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=savedir,
    hyperparameter_tune=True,
    hyperparameters=hyperparams,
    num_trials=5,
    time_limits=1 * 60,
    num_bagging_folds=0,
    stack_ensemble_levels=0
)  # since tuning_data=None, the train/validation split is determined automatically
results = predictor.fit_summary()  # display detailed summary of fit() process

# Inference time:
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
print(test_data.head())
perf = predictor.evaluate(test_data)
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.keys()) == original_features
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted
    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted
    # Unknown model names should raise an exception:
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])
    assert predictor.get_model_names_persisted() == []
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []
    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(output_directory=predictor.output_directory)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(dataset=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that models are actually deleted when dry_run=False
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in a single directory with the given file names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # Information about each dataset in the benchmark is stored in a dict.
    # performance_val = expected performance on this dataset (lower = better); should be updated based on previously run benchmarks.
    binary_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
        'label_column': 'class',
        'performance_val': 0.129
    }  # Mixed types of features.
    multi_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
        'name': 'CoverTypeMulticlassClassification',
        'problem_type': MULTICLASS,
        'label_column': 'Cover_Type',
        'performance_val': 0.032
    }  # Big dataset with 7 classes, all features are numeric. Runs SLOW.
    regression_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
        'name': 'AmesHousingPriceRegression',
        'problem_type': REGRESSION,
        'label_column': 'SalePrice',
        'performance_val': 0.076
    }  # Regression with mixed feature types, skewed Y-values.
    toyregres_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
        'name': 'toyRegression',
        'problem_type': REGRESSION,
        'label_column': 'y',
        'performance_val': 0.183
    }  # 1-D toy deterministic regression task with heavy label+feature missingness and an extra distraction column in test data.

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes):
    datasets = [toyregres_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx + 1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
            predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: AutoGluon inferred problem_type = %s, but it should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = task.load(savedir)  # Test loading previously-trained predictor from file
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score']  # convert accuracy to error rate
            else:
                perf = 1.0 - perf_dict['r2_score']  # unexplained variance score
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." % (dataset['name'], performance_vals[idx] / (EPS + dataset['performance_val'])))
            if run_distill:
                predictor.distill(time_limits=60, augment_args={'size_factor': 0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))
    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)
    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf / (EPS + previous_worst_performance)))
    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
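# Usage sketch (illustrative arguments, not from the original test file): a quick
# CI-style run might subsample each dataset and only check the fast toy task:
# run_tabular_benchmarks(fast_benchmark=True, subsample_size=100, perf_threshold=1.1,
#                        seed_val=0, fit_args={'verbosity': 2}, dataset_indices=[0])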
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularPrediction as task # Training time: train_data = task.Dataset( file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' ) # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label_column = 'class' # specifies which column do we want to predict savedir = 'ag_models/' # where to save trained models predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME) results = predictor.fit_summary() # Inference time: test_data = task.Dataset( file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv' ) # another Pandas DataFrame y_test = test_data[label_column] test_data = test_data.drop( labels=[label_column], axis=1 ) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = task.load( savedir
X_transform = auto_ml_pipeline_feature_generator.fit_transform(X=X)
print(X_transform.head(5))
X_test_transform = auto_ml_pipeline_feature_generator.transform(X=X_test)
print(X_test_transform.head(5))

#####################################################
# Specifying custom feature generator to task.fit() #
#####################################################

example_models = {'GBM': {}, 'CAT': {}}
example_models_2 = {'RF': {}, 'KNN': {}}

# Because auto_ml_pipeline_feature_generator is already fit, it doesn't need to be fit again in predictor. Instead, train_data is simply transformed by auto_ml_pipeline_feature_generator.transform(train_data).
# This makes the feature transformation completely independent of the training data; we could have used a completely different data source to fit the generator.
predictor = task.fit(train_data=train_data, label='class', hyperparameters=example_models, feature_generator=auto_ml_pipeline_feature_generator)
X_test_transform_2 = predictor.transform_features(X_test)  # This is the same as calling auto_ml_pipeline_feature_generator.transform(X_test)
assert X_test_transform.equals(X_test_transform_2)

# The feature metadata of the feature generator is also preserved. All downstream models receive this feature metadata to decide how they use the data.
assert predictor.feature_metadata.to_dict() == auto_ml_pipeline_feature_generator.feature_metadata.to_dict()
predictor.leaderboard(test_data)

# We can train multiple predictors with the same pre-fit feature generator. This can save a lot of time during experimentation if fitting the generator is expensive.
predictor_2 = task.fit(train_data=train_data, label='class', hyperparameters=example_models_2, feature_generator=auto_ml_pipeline_feature_generator)
def fit_dataset(train_data, fit_args, sample_size=None):
    if sample_size is not None and sample_size < len(train_data):
        train_data = train_data.sample(n=sample_size, random_state=0)
    return task.fit(train_data=train_data, **fit_args)
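# Usage sketch (hypothetical fit_args): unlike the helpers above, label and any other
# fit() arguments travel inside the fit_args dict here:
# predictor = fit_dataset(train_data, fit_args={'label': 'class', 'verbosity': 2}, sample_size=1000)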
y_pred = naive_bayes_model.predict(X_test)
print(y_pred)
y_pred_orig = label_cleaner.inverse_transform(y_pred)
print(y_pred_orig)

score = naive_bayes_model.score(X_test, y_test_clean)
print(f'test score ({naive_bayes_model.eval_metric.name}) = {score}')

########################################
# Training custom model using task.fit #
########################################

custom_hyperparameters = {NaiveBayesModel: {}}
# custom_hyperparameters = {NaiveBayesModel: [{}, {'var_smoothing': 0.00001}, {'var_smoothing': 0.000002}]}  # Train 3 NaiveBayes models with different hyperparameters
predictor = task.fit(train_data=train_data, label=label_column, hyperparameters=custom_hyperparameters)  # Train a single default NaiveBayesModel
predictor.leaderboard(test_data)
y_pred = predictor.predict(test_data)
print(y_pred)

time.sleep(1)  # Ensure we don't use the same train directory

###############################################################
# Training custom model alongside other models using task.fit #
###############################################################

# Now we add the custom model to be trained alongside the default models:
custom_hyperparameters.update(get_hyperparameter_config('default'))
predictor = task.fit(train_data=train_data, label=label_column, hyperparameters=custom_hyperparameters)  # Train the default models plus a single default NaiveBayesModel
# predictor = task.fit(train_data=train_data, label=label_column, auto_stack=True, hyperparameters=custom_hyperparameters)  # We can even use the custom model in a multi-layer stack ensemble
" -O temp.zip && unzip -o temp.zip && rm temp.zip") savedir = directory + 'agModels/' label_column = dataset['label_column'] train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run print(train_data.head()) # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, problem_type='multiclass', output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits) # Distill ensemble-predictor into single model: time_limits = 60 # None # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
# shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
label_column = 'class'  # specifies which column we want to predict
train_file_path = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/train.csv'
test_file_path = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv'
train_data = task.Dataset(file_path=train_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = task.Dataset(file_path=test_file_path)
test_data = test_data.head(subsample_size)  # subsample for faster run

# Fit model ensemble:
predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits)

# Distill ensemble-predictor into a single model:
time_limits = 60  # set to None to fully train distilled models
# aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here:
aug_data = task.Dataset(file_path=train_file_path)
aug_data = aug_data.head(subsample_size)  # subsample for faster demo
distilled_model_names = predictor.distill(time_limits=time_limits, augment_args={'num_augmented_samples': 100})  # default distillation (time_limits & augment_args are also optional; here set to suboptimal values to ensure quick runtime)
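# Sketch of a natural next step (hedged; not part of the original snippet): compare the
# distilled models against the ensemble on the test data, then run inference with one
# distilled model selected by name. leaderboard() and predict(..., model=...) are the
# same legacy-API calls used elsewhere in this file.
predictor.leaderboard(test_data)
model_to_deploy = distilled_model_names[0]
y_pred = predictor.predict(test_data, model=model_to_deploy)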
train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
label = 'class'
eval_metric = 'roc_auc'
hyperparameters = {'RF': {}}
train_data = train_data.head(1000)  # subsample for faster demo

##################################
# Fitting with the old Predictor #
##################################

predictor1 = task.fit(train_data, label=label, eval_metric=eval_metric, hyperparameters=hyperparameters, num_bagging_folds=2)
predictor1.leaderboard(test_data)

##################################
# Fitting with the new Predictor #
##################################

predictor2 = TabularPredictorV2(label, eval_metric=eval_metric)
predictor2.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
predictor2.leaderboard(test_data)

####################################
# Advanced fit_extra functionality #
####################################
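# Minimal sketch of fit_extra with the new Predictor (the hyperparameter values and the
# exact fit_extra signature are assumptions; fit_extra trains additional models into an
# already-fit predictor without refitting the existing ones):
# predictor3 = TabularPredictorV2(label, eval_metric=eval_metric)
# predictor3.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
# predictor3.fit_extra(hyperparameters={'GBM': {}})  # add models on top of the existing fit
# predictor3.leaderboard(test_data)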
# num_bagging_sets =
# num_trials =  # max number of trials for each parameter combination
# search_strategy =  # "skopt", etc...
# ngpus_per_trial =  # automatically determined if unspecified
# tuning_data =  # validation data (don't use if bagging/stacking)
# holdout_frac =

# %% train model
predictor = task.fit(
    train_data=train_data.drop(labels=cols_2_drop_4_training, axis=1),
    tuning_data=valid_data.drop(labels=cols_2_drop_4_training, axis=1),
    label=target_col,
    # hyperparameter_tune=hyperparameter_tune,
    auto_stack=True,
    time_limits=time_limit,
    output_directory=output_dir,
    eval_metric=metric,
    keep_only_best=True,
    save_space=True,
    ngpus_per_trial=1,
    presets=preset)

# %% output model info
results = predictor.fit_summary()
performance = predictor.evaluate(test_data.drop(labels=cols_2_drop_4_training, axis=1))
predictor.leaderboard(test_data, silent=True)
best_model = predictor.get_model_best()  # get name of best model
model_names = predictor.get_model_names()  # get list of model names
print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url'])) os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip") train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run label_column = dataset['label_column'] # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits, eval_metric='mean_absolute_error') # Distill ensemble-predictor into single model: time_limits = 60 # set = None to fully train distilled models # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)