Example #1
def load_data(directory_prefix, train_file, test_file, name, url=None):
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
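# Hypothetical usage (mirroring the call in Example #2 below):
# train_data, test_data = load_data(directory_prefix='./datasets/', train_file='train_data.csv',
#                                   test_file='test_data.csv', name='toyClassification',
#                                   url='https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip')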
Example #2
def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label_column': 'y',
               'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness,
    # out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data,
    # a training column missing from test data, and extra distraction columns in test data.
    # toyclassif_dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']

    # Additional warning that would have occurred if ValueError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']

    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=dataset['label_column'], output_directory=savedir, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data has missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
Example #3
def run(X_train, y_train, label: str, fit_args: dict = None):
    if fit_args is None:
        fit_args = {}
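    # NOTE: the next line mutates the caller's X_train DataFrame in place; pass a copy if that is undesirable.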
    X_train[label] = y_train

    predictor = ag_task.fit(
        train_data=X_train,
        label=label,
        **fit_args,
    )

    return predictor
Example #4
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit().
    Note that all settings demonstrated here are just chosen for demonstration purposes (to minimize runtime), and do not represent wise choices to use in practice.
    To maximize predictive accuracy, we recommend you do NOT specify `hyperparameters` or `hyperparameter_tune`, and instead only specify the following fit() arguments: eval_metric=YOUR_METRIC, presets='best_quality'
"""

import autogluon.core as ag
from autogluon.tabular import TabularPrediction as task

# Training time:
train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(100)  # subsample for faster demo
print(train_data.head())
label_column = 'class'  # specifies which column we want to predict
savedir = 'ag_hpo_models/'  # where to save trained models

hyperparams = {
    'NN': {
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}
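# ag.Real(lower, upper) declares a continuous search space that fit() can tune over;
# log=True samples the interval on a log scale, which suits learning rates.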

predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=savedir,
    hyperparameters=hyperparams,
)  # NOTE: this call is cut off in the source; label/output_directory/hyperparameters are filled in
# from the variables defined above, and any further fit() arguments (e.g. hyperparameter_tune,
# num_trials, time_limits) are not shown.
Example #5
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data,
                         label=label,
                         output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data,
                                              extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert (set(feature_importances.keys()) == original_features)
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted

    # Persisting unknown models should raise an exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    assert predictor.get_model_names_persisted() == []

    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(output_directory=predictor.output_directory)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(dataset=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models are no longer persisted after loading the predictor

    assert (predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert (len(predictor.get_model_full_dict()) == num_models)
    assert (len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_full models aren't refit again.
    assert (len(predictor.get_model_full_dict()) == num_models)
    assert (len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that the default dry-run doesn't delete models
    assert (len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that dry_run=False actually deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:  # predict must fail because all models were deleted
        pass
    else:
        raise AssertionError('predictor.predict should raise an exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
Example #6
def run_tabular_benchmarks(fast_benchmark,
                           subsample_size,
                           perf_threshold,
                           seed_val,
                           fit_args,
                           dataset_indices=None,
                           run_distill=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better); update based on previously-run benchmarks.
    binary_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY,
        'label_column': 'class',
        'performance_val': 0.129
    }  # Mixed types of features.

    multi_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
        'name': 'CoverTypeMulticlassClassification',
        'problem_type': MULTICLASS,
        'label_column': 'Cover_Type',
        'performance_val': 0.032
    }  # Big dataset with 7 classes; all features are numeric. Runs SLOW.

    regression_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
        'name': 'AmesHousingPriceRegression',
        'problem_type': REGRESSION,
        'label_column': 'SalePrice',
        'performance_val': 0.076
    }  # Regression with mixed feature types and skewed y-values.

    toyregres_dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
        'name': 'toyRegression',
        'problem_type': REGRESSION,
        'label_column': 'y',
        'performance_val': 0.183
    }
    # 1-D toy deterministic regression task with: heavy label+feature missingness, extra distraction column in test data

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = [
        toyregres_dataset, binary_dataset, regression_dataset, multi_dataset
    ]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(
                directory_prefix=directory_prefix,
                train_file=train_file,
                test_file=test_file,
                name=dataset['name'],
                url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" %
                  (dataset['name'], idx + 1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError(
                        "fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
            predictor = task.fit(train_data=train_data,
                                 label=label_column,
                                 output_directory=savedir,
                                 **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn(
                    "For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                    % (dataset['name'], predictor.problem_type,
                       dataset['problem_type']))
            predictor = task.load(savedir)  # Test loading previously-trained predictor from file
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test,
                                                       y_pred=y_pred,
                                                       auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score']  # convert accuracy to error rate
            else:
                perf = 1.0 - perf_dict['r2_score']  # unexplained variance score
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (dataset['name'], performance_vals[idx],
                   dataset['performance_val']))
            if (not fast_benchmark) and (
                    performance_vals[idx] >
                    dataset['performance_val'] * perf_threshold):
                warnings.warn(
                    "Performance on dataset %s is %s times worse than previous performance."
                    % (dataset['name'], performance_vals[idx] /
                       (EPS + dataset['performance_val'])))
            if run_distill:
                predictor.distill(time_limits=60,
                                  augment_args={'size_factor': 0.5})
    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" %
              (datasets[idx]['name'], performance_vals[idx],
               datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn(
                "Average Performance is %s times worse than previously." %
                (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn(
                "Median Performance is %s times worse than previously." %
                (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn(
                "Worst Performance is %s times worse than previously." %
                (worst_perf / (EPS + previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
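
# A hypothetical invocation of the benchmark above (argument values are illustrative, not from the source):
# run_tabular_benchmarks(fast_benchmark=True, subsample_size=100, perf_threshold=1.1,
#                        seed_val=0, fit_args={'verbosity': 2}, dataset_indices=[0], run_distill=False)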
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularPrediction as task

# Training time:
train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label_column = 'class'  # specifies which column we want to predict
savedir = 'ag_models/'  # where to save trained models

predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=savedir)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:  predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME)
results = predictor.fit_summary()

# Inference time:
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
y_test = test_data[label_column]
test_data = test_data.drop(
    labels=[label_column], axis=1
)  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = task.load(savedir)  # load trained predictor back from file (the rest of this script is cut off in the source)
Example #8
"""
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
    1. Experiment with different preprocessing pipelines to improve model quality.
    2. Have full control over what data is being sent to downstream models.
    3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
    4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

from autogluon.tabular import TabularPrediction as task

train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)  # another Pandas DataFrame
label_column = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure not to include your label/target column in the input to the feature generators, or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label_column])
y = sample_train_data[label_column]

X_test = test_data.drop(columns=[label_column])
y_test = test_data[label_column]
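
# The generator code itself is cut off in this snippet. Below is a minimal, library-agnostic sketch
# of the fit/transform contract a custom feature generator must satisfy; the class and method names
# are illustrative, not AutoGluon's actual base-class API:
import pandas as pd

class CustomFeatureGeneratorSketch:
    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Learn state from the training data (here: the column set), then transform it.
        self.columns_ = list(X.columns)
        return self.transform(X)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Apply the identical transformation to new data such as X_test.
        X = X[self.columns_].copy()
        for col in X.select_dtypes('object').columns:
            X[col + '_len'] = X[col].astype(str).str.len()  # example engineered feature
        return X

# Usage on the data prepared above:
# X_transformed = CustomFeatureGeneratorSketch().fit_transform(X)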
Example #9
def fit_dataset(train_data, fit_args, sample_size=None):
    if sample_size is not None and sample_size < len(train_data):
        train_data = train_data.sample(n=sample_size, random_state=0)
    return task.fit(train_data=train_data, **fit_args)


# The snippet below is spliced from a custom-model example; the class header and _preprocess
# signature are reconstructed here (AbstractModel is AutoGluon's base class for custom models,
# and NaiveBayesModel is instantiated further down).
class NaiveBayesModel(AbstractModel):
    def _preprocess(self, X, **kwargs):
        # Drop categorical columns and impute missing values: GaussianNB needs numeric input.
        cat_columns = X.select_dtypes(['category', 'object']).columns
        X = X.drop(cat_columns, axis=1)
        return super()._preprocess(X, **kwargs).fillna(0)

    def _fit(self, X_train, y_train, **kwargs):
        from sklearn.naive_bayes import GaussianNB
        X_train = self.preprocess(X_train)
        self.model = GaussianNB(**self.params)
        self.model.fit(X_train, y_train)


################
# Loading Data #
################

train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label_column = 'class'  # specifies which column we want to predict
train_data = train_data.head(1000)  # subsample for faster demo

#############################################
# Training custom model outside of task.fit #
#############################################

# Separate features and labels
X_train = train_data.drop(columns=[label_column])
y_train = train_data[label_column]

problem_type = infer_problem_type(y=y_train)  # Infer problem type (or else specify directly)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)
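# The remainder of this snippet is cut off. A minimal sketch of the likely next steps, assuming
# the AbstractModel interface exposes fit() and predict() as used by the _fit method above:
# naive_bayes_model.fit(X_train=X_train, y_train=y_train)
# y_pred = naive_bayes_model.predict(X_train)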
Example #11
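# NOTE: this fragment assumes `directory`, `dataset`, `subsample_size`, and `time_limits` are defined earlier in the original script.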
train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_file_path = directory + train_file
test_file_path = directory + test_file

if (not os.path.exists(train_file_path)) or (
        not os.path.exists(test_file_path)):  # fetch files from s3:
    print("%s data not found locally, so fetching from %s" %
          (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] +
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'agModels/'

label_column = dataset['label_column']
train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
print(train_data.head())

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     problem_type='multiclass',
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits)

# Distill ensemble-predictor into single model:
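# (cut off in the source; a minimal sketch mirroring the distill call used in Example #6:)
# distilled_model_names = predictor.distill(time_limits=time_limits, augment_args={'size_factor': 0.5})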
Example #12
def load_predictor(model_directory: str) -> TabularPredictor:
    predictor = TabularPrediction.load(model_directory)
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)
    return predictor
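# Hypothetical usage: load a disk-slimmed predictor (only the best model kept) and run inference.
# predictor = load_predictor('ag_models/')
# y_pred = predictor.predict(test_data)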
Example #13
from autogluon.tabular import TabularPrediction as task
from autogluon.tabular.task.tabular_prediction.predictor_v2 import TabularPredictorV2

################
# Loading data #
################

train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
label = 'class'
eval_metric = 'roc_auc'
hyperparameters = {'RF': {}}
train_data = train_data.head(1000)  # subsample for faster demo

##################################
# Fitting with the old Predictor #
##################################

predictor1 = task.fit(train_data,
                      label=label,
                      eval_metric=eval_metric,
                      hyperparameters=hyperparameters,
                      num_bagging_folds=2)
predictor1.leaderboard(test_data)

##################################
# Fitting with the new Predictor #
##################################
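# The V2 code is cut off in the source. A minimal sketch, assuming TabularPredictorV2 takes the
# label and metric at construction and exposes fit() as a method (mirroring the old call above;
# argument names are assumptions):
# predictor2 = TabularPredictorV2(label=label, eval_metric=eval_metric)
# predictor2.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)
# predictor2.leaderboard(test_data)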
Example #14
# (method fragment from a class that loads two saved predictors; task2 is a second prediction-task
# module imported elsewhere in the original script)
def __init__(self):
    self.predictor_rank = task2.load('/content/common-alternusvera/PU/ag_predict')
    self.predictor_sts = task.load('/content/common-alternusvera/PU/saved_dir')
Example #15
from datetime import datetime
import pandas as pd

# %% define data

root_folder = "/home/lstm/Google Drive/MATLAB data files/Project__autoML/datasets for autoML/data_weekly_archive/"
data_folder = "20200213/"
data_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC20200213.csv"
data_ref = 'KOSPIb1f0bNsCFCCOFOC20200213'
target_col = "target"
most_recent_folder = "20112032/"
most_recent_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC2020112032.csv"

cols_2_drop_4_training = ["timestamp", "split_tag", "weight_vector"]

data_trainvalid = task.Dataset(file_path=root_folder + data_folder + data_file)
data_trainvalid["DoW"] = data_trainvalid["DoW"].astype('category')

train_data = data_trainvalid.loc[data_trainvalid.split_tag == 'TRAIN', :]
print(train_data.head())
print(train_data.tail())

valid_data = data_trainvalid.loc[data_trainvalid.split_tag == 'VALIDATE', :]  # do not provide if bagging/stacking
print(valid_data.head())
print(valid_data.tail())
latest_valid_date = valid_data["timestamp"][valid_data["timestamp"] == valid_data["timestamp"].max()]

## REDO TEST DATA (to be pre-processed in matlab first)
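# NOTE: this fragment assumes `regression_dataset`, `subsample_size`, `savedir`, and `time_limits`
# are defined earlier in the original script (it mirrors the flow of Example #11).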
dataset = regression_dataset
directory = dataset['name'] + "/"

train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_file_path = directory + train_file
test_file_path = directory + test_file

if (not os.path.exists(train_file_path)) or (
        not os.path.exists(test_file_path)):  # fetch files from s3:
    print("%s data not found locally, so fetching from %s" %
          (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] +
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
label_column = dataset['label_column']

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits,
                     eval_metric='mean_absolute_error')

# Distill ensemble-predictor into single model: