def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label_column': 'y',
               'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data.
    # The toyClassification dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']
    # Additional warning that would have occurred if the ValueError had not been triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them): ['distractioncolumn1', 'distractioncolumn2']
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=dataset['label_column'], output_directory=savedir, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data is missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
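# Usage sketch (hypothetical arguments, not from the original test): fit_args is
# forwarded verbatim to task.fit(), so a quick smoke test could restrict training
# to a single fast model:
# run_tabular_benchmark_toy(fit_args={'hyperparameters': {'GBM': {}}, 'verbosity': 2})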
def run(X_train, y_train, label: str, fit_args: dict = None):
    if fit_args is None:
        fit_args = {}
    X_train = X_train.copy()  # avoid mutating the caller's DataFrame when attaching the label column
    X_train[label] = y_train
    predictor = ag_task.fit(
        train_data=X_train,
        label=label,
        **fit_args,
    )
    return predictor
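# Usage sketch (hypothetical data; real training needs more rows than shown):
# any pandas DataFrame/Series pair works, with the label attached internally:
# import pandas as pd
# X = pd.DataFrame({'x1': [1, 2, 3, 4], 'x2': ['a', 'b', 'a', 'b']})
# y = pd.Series([0, 1, 0, 1])
# predictor = run(X, y, label='target', fit_args={'verbosity': 2})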
hyperparams = {
    'NN': {  # neural network search space
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {  # LightGBM search space
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}
predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=savedir,
    hyperparameter_tune=True,
    hyperparameters=hyperparams,
    num_trials=5,
    time_limits=1 * 60,
    num_bagging_folds=0,
    stack_ensemble_levels=0
)  # since tuning_data=None, the train/validation split is determined automatically
results = predictor.fit_summary()  # display detailed summary of fit() process

# Inference time:
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
print(test_data.head())
perf = predictor.evaluate(test_data)
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.keys()) == original_features
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted
    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted
    # Unknown model names should raise an exception:
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])
    assert predictor.get_model_names_persisted() == []
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []
    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(output_directory=predictor.output_directory)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(dataset=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that models are actually deleted when dry_run=False
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in a single directory with the given file names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # Information about each dataset in the benchmark is stored in a dict.
    # performance_val = expected performance on this dataset (lower = better); should be updated based on previously run benchmarks.
    binary_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
        'label_column': 'class',
        'performance_val': 0.129
    }  # Mixed types of features.
    multi_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
        'name': 'CoverTypeMulticlassClassification',
        'problem_type': MULTICLASS,
        'label_column': 'Cover_Type',
        'performance_val': 0.032
    }  # Big dataset with 7 classes, all features are numeric. Runs SLOW.
    regression_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
        'name': 'AmesHousingPriceRegression',
        'problem_type': REGRESSION,
        'label_column': 'SalePrice',
        'performance_val': 0.076
    }  # Regression with mixed feature types, skewed Y-values.
    toyregres_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
        'name': 'toyRegression',
        'problem_type': REGRESSION,
        'label_column': 'y',
        'performance_val': 0.183
    }  # 1-D toy deterministic regression task with heavy label+feature missingness and an extra distraction column in test data.

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes):
    datasets = [toyregres_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx + 1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
            predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: AutoGluon inferred problem_type = %s, but it should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = task.load(savedir)  # Test loading previously-trained predictor from file
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score']  # convert accuracy to error rate
            else:
                perf = 1.0 - perf_dict['r2_score']  # unexplained variance score
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." % (dataset['name'], performance_vals[idx] / (EPS + dataset['performance_val'])))
            if run_distill:
                predictor.distill(time_limits=60, augment_args={'size_factor': 0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))
    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)
    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf / (EPS + previous_worst_performance)))
    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
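# Usage sketch (illustrative arguments, not from the original test file): a quick
# CI-style run might subsample each dataset and only check the fast toy task:
# run_tabular_benchmarks(fast_benchmark=True, subsample_size=100, perf_threshold=1.1,
#                        seed_val=0, fit_args={'verbosity': 2}, dataset_indices=[0])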
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularPrediction as task # Training time: train_data = task.Dataset( file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' ) # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label_column = 'class' # specifies which column do we want to predict savedir = 'ag_models/' # where to save trained models predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME) results = predictor.fit_summary() # Inference time: test_data = task.Dataset( file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv' ) # another Pandas DataFrame y_test = test_data[label_column] test_data = test_data.drop( labels=[label_column], axis=1 ) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = task.load( savedir
X_transform = auto_ml_pipeline_feature_generator.fit_transform(X=X)
print(X_transform.head(5))
X_test_transform = auto_ml_pipeline_feature_generator.transform(X=X_test)
print(X_test_transform.head(5))

#####################################################
# Specifying custom feature generator to task.fit() #
#####################################################

example_models = {'GBM': {}, 'CAT': {}}
example_models_2 = {'RF': {}, 'KNN': {}}

# Because auto_ml_pipeline_feature_generator is already fit, it doesn't need to be fit again in predictor. Instead, train_data is simply transformed by auto_ml_pipeline_feature_generator.transform(train_data).
# This makes the feature transformation completely independent of the training data; we could have used a completely different data source to fit the generator.
predictor = task.fit(train_data=train_data, label='class', hyperparameters=example_models, feature_generator=auto_ml_pipeline_feature_generator)
X_test_transform_2 = predictor.transform_features(X_test)  # This is the same as calling auto_ml_pipeline_feature_generator.transform(X_test)
assert X_test_transform.equals(X_test_transform_2)

# The feature metadata of the feature generator is also preserved. All downstream models receive this feature metadata to decide how they use the data.
assert predictor.feature_metadata.to_dict() == auto_ml_pipeline_feature_generator.feature_metadata.to_dict()
predictor.leaderboard(test_data)

# We can train multiple predictors with the same pre-fit feature generator. This can save a lot of time during experimentation if fitting the generator is expensive.
predictor_2 = task.fit(train_data=train_data, label='class', hyperparameters=example_models_2, feature_generator=auto_ml_pipeline_feature_generator)
def fit_dataset(train_data, fit_args, sample_size=None):
    if sample_size is not None and sample_size < len(train_data):
        train_data = train_data.sample(n=sample_size, random_state=0)
    return task.fit(train_data=train_data, **fit_args)
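# Usage sketch (hypothetical fit_args): unlike the helpers above, label and any other
# fit() arguments travel inside the fit_args dict here:
# predictor = fit_dataset(train_data, fit_args={'label': 'class', 'verbosity': 2}, sample_size=1000)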
y_pred = naive_bayes_model.predict(X_test)
print(y_pred)
y_pred_orig = label_cleaner.inverse_transform(y_pred)
print(y_pred_orig)

score = naive_bayes_model.score(X_test, y_test_clean)
print(f'test score ({naive_bayes_model.eval_metric.name}) = {score}')

########################################
# Training custom model using task.fit #
########################################

custom_hyperparameters = {NaiveBayesModel: {}}
# custom_hyperparameters = {NaiveBayesModel: [{}, {'var_smoothing': 0.00001}, {'var_smoothing': 0.000002}]}  # Train 3 NaiveBayes models with different hyperparameters
predictor = task.fit(train_data=train_data, label=label_column, hyperparameters=custom_hyperparameters)  # Train a single default NaiveBayesModel
predictor.leaderboard(test_data)
y_pred = predictor.predict(test_data)
print(y_pred)

time.sleep(1)  # Ensure we don't use the same train directory

###############################################################
# Training custom model alongside other models using task.fit #
###############################################################

# Now we add the custom model to be trained alongside the default models:
custom_hyperparameters.update(get_hyperparameter_config('default'))
predictor = task.fit(train_data=train_data, label=label_column, hyperparameters=custom_hyperparameters)  # Train the default models plus a single default NaiveBayesModel
# predictor = task.fit(train_data=train_data, label=label_column, auto_stack=True, hyperparameters=custom_hyperparameters)  # We can even use the custom model in a multi-layer stack ensemble
" -O temp.zip && unzip -o temp.zip && rm temp.zip") savedir = directory + 'agModels/' label_column = dataset['label_column'] train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run print(train_data.head()) # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, problem_type='multiclass', output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits) # Distill ensemble-predictor into single model: time_limits = 60 # None # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
# shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
label_column = 'class'  # specifies which column we want to predict
train_file_path = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/train.csv'
test_file_path = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv'
train_data = task.Dataset(file_path=train_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = task.Dataset(file_path=test_file_path)
test_data = test_data.head(subsample_size)  # subsample for faster run

# Fit model ensemble:
predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits)

# Distill ensemble-predictor into a single model:
time_limits = 60  # set to None to fully train distilled models
# aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here:
aug_data = task.Dataset(file_path=train_file_path)
aug_data = aug_data.head(subsample_size)  # subsample for faster demo
distilled_model_names = predictor.distill(time_limits=time_limits, augment_args={'num_augmented_samples': 100})  # default distillation (time_limits & augment_args are also optional; here set to suboptimal values to ensure quick runtime)
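# Sketch of a natural next step (hedged; not part of the original snippet): compare the
# distilled models against the ensemble on the test data, then run inference with one
# distilled model selected by name. leaderboard() and predict(..., model=...) are the
# same legacy-API calls used elsewhere in this file.
predictor.leaderboard(test_data)
model_to_deploy = distilled_model_names[0]
y_pred = predictor.predict(test_data, model=model_to_deploy)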
train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
label = 'class'
eval_metric = 'roc_auc'
hyperparameters = {'RF': {}}
train_data = train_data.head(1000)  # subsample for faster demo

##################################
# Fitting with the old Predictor #
##################################

predictor1 = task.fit(train_data, label=label, eval_metric=eval_metric, hyperparameters=hyperparameters, num_bagging_folds=2)
predictor1.leaderboard(test_data)

##################################
# Fitting with the new Predictor #
##################################

predictor2 = TabularPredictorV2(label, eval_metric=eval_metric)
predictor2.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
predictor2.leaderboard(test_data)

####################################
# Advanced fit_extra functionality #
####################################
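# Minimal sketch of fit_extra with the new Predictor (the hyperparameter values and the
# exact fit_extra signature are assumptions; fit_extra trains additional models into an
# already-fit predictor without refitting the existing ones):
# predictor3 = TabularPredictorV2(label, eval_metric=eval_metric)
# predictor3.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
# predictor3.fit_extra(hyperparameters={'GBM': {}})  # add models on top of the existing fit
# predictor3.leaderboard(test_data)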
# num_bagging_sets =
# num_trials =  # max number of trials for each parameter combination
# search_strategy =  # "skopt", etc...
# ngpus_per_trial =  # automatically determined if unspecified
# tuning_data =  # validation data (don't use if bagging/stacking)
# holdout_frac =

# %% train model
predictor = task.fit(
    train_data=train_data.drop(labels=cols_2_drop_4_training, axis=1),
    tuning_data=valid_data.drop(labels=cols_2_drop_4_training, axis=1),
    label=target_col,
    # hyperparameter_tune=hyperparameter_tune,
    auto_stack=True,
    time_limits=time_limit,
    output_directory=output_dir,
    eval_metric=metric,
    keep_only_best=True,
    save_space=True,
    ngpus_per_trial=1,
    presets=preset)

# %% output model info
results = predictor.fit_summary()
performance = predictor.evaluate(test_data.drop(labels=cols_2_drop_4_training, axis=1))
predictor.leaderboard(test_data, silent=True)
best_model = predictor.get_model_best()  # get name of best model
model_names = predictor.get_model_names()  # get list of model names
print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url'])) os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip") train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run label_column = dataset['label_column'] # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits, eval_metric='mean_absolute_error') # Distill ensemble-predictor into single model: time_limits = 60 # set = None to fully train distilled models # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = task.Dataset(file_path=train_file_path) aug_data = aug_data.head(subsample_size) # subsample for faster demo distilled_model_names = predictor.distill( time_limits=time_limits, augment_args={'num_augmented_samples': 100} ) # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)