Example 1
def train(args):
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = list(args.hosts)  # copy so we don't mutate args.hosts in place
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    #     target = 'y'
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(train_data=train_data,
                         output_directory=args.model_dir,
                         label='y'
                         #         **args.fit_args,
                         )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                print(predictor.feature_importance(test_data))
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
Example 2
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
               'name': 'AdultIncomeBinaryClassification',
               'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert(set(feature_importances.keys()) == original_features)
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()
    assert(predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that non-dry-run actually deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
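Examples 2, 6, 21, and 26 all call a shared `load_data` helper that is not shown; a minimal sketch consistent with the download logic of Example 13 (assumed, not the original implementation):

import os


def load_data(directory_prefix, train_file, test_file, name, url=None):
    # Download and unzip the dataset on first use, then load both CSVs.
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + '/'
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # Fetch files from S3, mirroring Example 13:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)
    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data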
Example 3
    def train(self, data, params):
        self.data = data

        self.train_data = task.Dataset(data.unscaled_df)

        autogluon_dir = f'agModels-predictClass/{uuid.uuid4()}'  # specifies folder where to store trained models
        self.predictor = task.fit(train_data=self.train_data,
                                  label=self.metadata.get("output")[0],
                                  output_directory=autogluon_dir)

        self.state = "TRAINED"
Example 4
    def run(self, train_path, test_path, target, task):
        train_data = task.Dataset(file_path=train_path)

        predictor = task.fit(train_data=train_data,
                             label=target,
                             eval_metric="f1_macro",
                             num_bagging_folds=5)

        test_data = task.Dataset(file_path=test_path)
        y_test = test_data[target]

        y_pred = predictor.predict(test_data)
        return predictor.evaluate_predictions(y_true=y_test.to_numpy(),
                                              y_pred=y_pred,
                                              auxiliary_metrics=True)
Example 5
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir
    target = args.label_column

    train_file_path = get_file_path(args.train, args.train_filename)

    train_data = task.Dataset(file_path=train_file_path)
    subsample_size = int(args.train_rows)  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = task.fit(train_data=train_data, label=target, output_directory=model_dir)

    return predictor
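`get_file_path` is not shown in this snippet; presumably it just joins the SageMaker channel directory and file name (an assumption):

import os


def get_file_path(directory, filename):
    # Join the input channel directory with the training file name.
    return os.path.join(directory, filename)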
Example 6
def run_tabular_benchmark_toy(fit_args):
    dataset = {
        'url':
        'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
        'name': 'toyClassification',
        'problem_type': MULTICLASS,
        'label_column': 'y',
        'performance_val': 0.436
    }
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data
    # toyclassif_dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']

    # Additional warning that would have occurred if ValueError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']

    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(
        savedir, ignore_errors=True
    )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data,
                         label=dataset['label_column'],
                         output_directory=savedir,
                         **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except (KeyError, ValueError):  # raised because test_data is missing the required column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
Example 7
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    train_data = task.Dataset(file_path=training_dir + '/' + filename)
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir)

    return predictor
Example 8
def train_regression_autogluon(args, train_df, test_df):
    mx.npx.reset_np()
    from autogluon import TabularPrediction as task
    predictor = task.fit(train_data=task.Dataset(df=train_df),
                         output_directory=args.out_dir,
                         label='thrpt',
                         eval_metric='mean_absolute_error')
    # performance = predictor.evaluate(test_df)
    test_prediction = predictor.predict(test_df)
    ret = np.zeros((len(test_prediction), 2), dtype=np.float32)
    for i, (lhs, rhs) in enumerate(zip(test_df['thrpt'].to_numpy(), test_prediction)):
        ret[i][0] = lhs
        ret[i][1] = rhs
    df_result = pd.DataFrame(ret, columns=['gt', 'pred'])
    df_result.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
    plot_save_figure(gt_thrpt=test_df['thrpt'].to_numpy(),
                     pred_thrpt=test_prediction,
                     save_dir=args.out_dir)
    mx.npx.set_np()
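`plot_save_figure` is defined elsewhere in that benchmark script; a plausible matplotlib sketch (an assumption, not the original):

import os

import matplotlib.pyplot as plt


def plot_save_figure(gt_thrpt, pred_thrpt, save_dir):
    # Scatter ground-truth vs. predicted throughput and save the figure.
    fig, ax = plt.subplots()
    ax.scatter(gt_thrpt, pred_thrpt, s=8)
    ax.set_xlabel('ground-truth thrpt')
    ax.set_ylabel('predicted thrpt')
    fig.savefig(os.path.join(save_dir, 'throughput_pred_vs_gt.png'))
    plt.close(fig)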
Example 9
def Load_GLUON(dataDownstream, dataFeaturized):

    df = pd.DataFrame(columns=['column', 'feature_type'])
    df.to_csv('AutoGluon_predictions.csv', index=False)

    # dataDownstream
    train = copy.deepcopy(dataDownstream)

    train['label_target'] = 1
    train_data = task.Dataset(df=train)
    label_column = 'label_target'

    try:
        features = task.fit(train_data=train_data, label=label_column)
    except Exception:
        pass  # fit may fail here; only the predictions CSV written as a side effect is needed

    agl_predictions = pd.read_csv('AutoGluon_predictions.csv')
    predictions = agl_predictions['feature_type'].values.tolist()

    return predictions
Example 10
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    hyperparameters = {
        'GBM': [
            {},
            {
                'extra_trees': True,
                'AG_args': {
                    'name_suffix': 'XT'
                }
            },
        ],
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM']
    }
    presets = 'medium_quality_faster_train'
    train_data = task.Dataset(file_path=training_dir + '/' + filename)
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir,
                         presets=presets,
                         hyperparameters=hyperparameters)

    return predictor
Example 11
def frc_AutoGluon(df_train, df_test,
                  categoricalVars, responseVar='wk1_sales_all_stores'):

    import autogluon as ag
    from autogluon import TabularPrediction as task

    for varName in categoricalVars:
        df_train[varName] = df_train[varName].astype(str)
        df_test[varName] = df_test[varName].astype(str)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon", label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)
    return {'autoGluon_frc': autogluon_frc, 'autoGluon_model':model}
    print("%s data not found locally, so fetching from %s" %
          (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] +
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
label_column = dataset['label_column']

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits,
                     eval_metric='mean_absolute_error')

# Distill ensemble-predictor into single model:
time_limits = 60  # set = None to fully train distilled models

# aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here:
aug_data = task.Dataset(file_path=train_file_path)
aug_data = aug_data.head(subsample_size)  # subsample for faster demo

distilled_model_names = predictor.distill(
    time_limits=time_limits, augment_args={'num_augmented_samples': 100}
)  # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
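After distillation one would typically compare the distilled models against the original ensemble on held-out data; a short sketch using the leaderboard pattern from the other examples (test_data still contains the label column here):

# Score every model, including the freshly distilled ones, on the test data:
leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
print(leaderboard)
# Predict with a specific distilled model:
y_pred = predictor.predict(dataset=test_data, model=distilled_model_names[0])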
Example 13
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better); should update based on previously run benchmarks
    binary_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY,
                      'label_column': 'class',
                      'performance_val': 0.129} # Mixed types of features.

    multi_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
                     'name': 'CoverTypeMulticlassClassification',
                     'problem_type': MULTICLASS,
                     'label_column': 'Cover_Type',
                     'performance_val': 0.032}  # big dataset with 7 classes, all features are numeric. Runs SLOW.

    regression_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
                          'name': 'AmesHousingPriceRegression',
                          'problem_type': REGRESSION,
                          'label_column': 'SalePrice',
                          'performance_val': 0.076}  # Regression with mixed feature-types, skewed Y-values.

    toyregres_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
                         'name': 'toyRegression',
                         'problem_type': REGRESSION,
                         'label_column': 'y',
                         'performance_val': 0.183}
    # 1-D toy deterministic regression task with: heavy label+feature missingness, extra distraction column in test data

    toyclassif_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyClassification.zip',
                          'name': 'toyClassification',
                          'problem_type': MULTICLASS,
                          'label_column': 'y',
                          'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data
    # toyclassif_dataset should produce 3 warnings:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']
    # UserWarning: The columns listed below from the training data are no longer in the given dataset. (AutoGluon will proceed assuming their values are missing, but you should remove these columns from training dataset and train a new model):  ['lostcolumn']
    # UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = [toyregres_dataset, toyclassif_dataset, binary_dataset, regression_dataset, multi_dataset]
    if dataset_indices is not None: # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets) # performance obtained in this run
    directory_prefix = './datasets/'
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            dataset = datasets[idx]
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" % (dataset['name'],  dataset['url']))
                zip_name = ag.download(dataset['url'], directory_prefix)
                ag.unzip(zip_name, directory_prefix)
                os.remove(zip_name)

            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True) # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                train_data = train_data.head(subsample_size) # subsample for fast_benchmark
            predictor = None # reset from last Dataset
            predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, **fit_args)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy_score'] # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2_score'] # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." % 
                              (dataset['name'], performance_vals[idx]/(EPS+dataset['performance_val'])))

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
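For reference, the module-level imports this benchmark assumes would be roughly the following (module paths follow the legacy 0.0.x AutoGluon layout; shown as an assumption):

import os
import shutil
import warnings
from random import seed

import mxnet as mx
import numpy as np

import autogluon as ag
from autogluon import TabularPrediction as task
from autogluon.utils.tabular.ml.constants import BINARY, MULTICLASS, REGRESSION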
Example 14
import pandas as pd
import autogluon.core as ag
from autogluon import TabularPrediction as task
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# AutoGluon
label_column = 'test'
savedir = 'agModels-predictClass_jiagnwei'
train_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
test_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=savedir,
                     auto_stack=True,
                     time_limits=1800)
results = predictor.fit_summary()
print(predictor.feature_importance(dataset=test_data, subsample_size=None))

# predictor = task.load(savedir)
# print(predictor.info())
# print(predictor.feature_importance(dataset=train_data))
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon import TabularPrediction as task

# Training time:
train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label_column = 'class'  # specifies which column we want to predict
savedir = 'ag_models/'  # where to save trained models

predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:  predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME)
results = predictor.fit_summary()

# Inference time:
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column], axis=1)  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = task.load(savedir) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
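Because the labels were dropped from test_data above, per-model test scores cannot be computed at this point; the validation-score leaderboard is still available, though (the same call used in the test examples):

print(predictor.leaderboard(silent=True))  # validation scores for every trained model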
Example 16
def train():
    print('Starting the training.')
    try:
        # Read in any hyperparameters that the user passed with the training job
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)

        # Take the set of files and read them all into a single pandas dataframe
        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
        if len(input_files) == 0:
            raise ValueError(('There are no files in {}.\n' +
                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(training_path, channel_name))

        print('Found {} input files'.format(len(input_files)))
        all_model_df = pd.read_csv(os.path.join(training_path, 'all_model_df.csv'))
        y5 = np.load(os.path.join(training_path, 'y5.npy'))
        

        kf = StratifiedKFold(n_splits=5)

        # for each cancer in tcga
        for c in tqdm(np.unique(y5)):
            f = 0  # reset the fold counter per cancer type, so file names like c{c}_f{f} stay in range 0-4
            for train_index, test_index in kf.split(all_model_df, y5):
                t1 = time.time()
                print(c,"starting fold",f)
                
                # load prev.pickle with most important biomarkers
                with open(os.path.join(training_path,"c"+str(c)+"_f"+str(f)+"_5hsic5adasynlgbm100ft.b"), "rb") as fp: 
                    train_index,test_index,chsicpredictor,predy,acc = pickle.load(fp)
                
                c_idx = np.where(y5==c)[0]
                cy = np.zeros_like(y5)
                cy[c_idx] = 1
                
                # train an ensemble model with AutoML to maximize accuracy
                train_data = all_model_df.iloc[train_index].iloc[:, chsicpredictor.hsic_idx_]
                train_data["label"] = cy[train_index]
                clf = task.fit(train_data, label="label", presets='best_quality', auto_stack=True, output_directory="_autogluon_c_"+str(c)+"_f"+str(f))
                
                test_y = y5[test_index]
                c_idx = np.where(test_y==c)[0]
                test_y = np.zeros_like(test_y)
                test_y[c_idx] = 1
                
                bpredy = clf.predict(all_model_df.iloc[test_index].iloc[:, chsicpredictor.hsic_idx_])
                bacc = accuracy_score(test_y, bpredy)
                print("done in", time.time() - t1, "acc", bacc)
                
                # save the results
                model_file_name = "AutoML_c"+str(c)+"_f"+str(f)+"_5hsic5adasynlgbm100ft.b"
                acc_file_name = "AutoML_c"+str(c)+"_f"+str(f)+"_acc.b"

                with open(os.path.join(model_path,model_file_name), "wb") as fp: 
                    pickle.dump((train_index,test_index,chsicpredictor,predy,acc,bpredy,bacc,clf),fp)
                    
                with open(os.path.join(model_path,acc_file_name), "wb") as fp: 
                    pickle.dump((bacc),fp)

                f += 1

        print('Training complete.')
    except Exception as e:
        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)
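Example 16 references module-level paths (`param_path`, `training_path`, `model_path`, `output_path`, `channel_name`) defined outside the excerpt; in a bring-your-own SageMaker container they conventionally follow the /opt/ml layout, e.g.:

import os

prefix = '/opt/ml/'
input_path = os.path.join(prefix, 'input/data')
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

channel_name = 'training'
training_path = os.path.join(input_path, channel_name)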
Example 17
hyperparams = {
    'NN': {
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}

predictor = task.fit(
    train_data=train_data,
    label=label_column,
    output_directory=savedir,
    hyperparameter_tune=True,
    hyperparameters=hyperparams,
    num_trials=5,
    time_limits=1 * 60,
    num_bagging_folds=0,
    stack_ensemble_levels=0
)  # since tuning_data = None, automatically determines train/validation split

results = predictor.fit_summary()  # display detailed summary of fit() process

# Inference time:
test_data = task.Dataset(file_path='https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
print(test_data.head())
Example 18
(y_pred, y_prob,
 class_order) = autoweka_fit_predict(train_data=train_data,
                                     test_data=test_data,
                                     label_column=label_column,
                                     problem_type=problem_type,
                                     output_directory=output_directory,
                                     autoweka_path=autoweka_path,
                                     eval_metric=eval_metric,
                                     runtime_sec=runtime_sec,
                                     random_state=random_state,
                                     num_cores=num_cores)

# Can use autogluon.tabular.Predictor to evaluate predictions (assuming metric correctly specified):
ag_predictor = task.fit(task.Dataset(df=train_data),
                        label=label_column,
                        problem_type=problem_type,
                        eval_metric=eval_metric,
                        hyperparameters={'GBM': {
                            'num_boost_round': 2
                        }})
if eval_metric == 'roc_auc':
    preds_toevaluate = y_prob[:, 1]
elif eval_metric == 'log_loss':
    preds_toevaluate = y_prob
else:
    preds_toevaluate = y_pred

perf = ag_predictor.evaluate_predictions(
    test_data[label_column], preds_toevaluate
)  # use y_prob or y_prob[:,1] instead of y_pred for metrics like log_loss or roc_auc

print("Auto-WEKA test performance: %s" % perf)
Example 19
from autogluon import TabularPrediction as task
from data_config.data_config import load_data, data_config

if __name__ == '__main__':
    res = {}
    for data_name in data_config.keys():
        ylabel = data_config[data_name]['ylabel']

        X_train, X_valid = load_data(data_name, combine_y=True)
        train_data = task.Dataset(df=X_train)
        test_data = task.Dataset(df=X_valid)
        savedir = f'{data_name}/'  # where to save trained models
        predictor = task.fit(
            train_data=train_data,
            label=ylabel,
            output_directory=savedir,
            eval_metric='roc_auc',
            verbosity=2,
            visualizer='tensorboard',
            random_seed=0,
            save_space=True,
            keep_only_best=True,
        )
        auc = predictor.evaluate(X_valid)
        res[data_name] = auc

    print(res)
    import pickle
    with open('autogluon_result.pickle', 'wb') as f:
        pickle.dump(res, f)
Example 20
label_column = 'status'
savedir = 'agModels-predictClass'  # specifies folder where to store trained models
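# train_data is assumed to be loaded above this excerpt, e.g. (placeholder path,
# not from the original script):
# train_data = task.Dataset(file_path='train.csv')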

# print(train_data.head(10))
# print(train_data.info())
# print(train_data.describe())

if __name__ == '__main__':
    # predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, time_limits=100)
    # results = predictor.fit_summary()
    # print("AutoGluon infers problem type is: ", predictor.problem_type)
    # print("AutoGluon identified the following types of features:")
    # print(predictor.feature_metadata)
    # # predictor.leaderboard(train_data, silent=True)
    # # print(results)
    time_limits = 60  # for quick demonstration only, you should set this to the longest time you are willing to wait (in seconds)
    metric = 'roc_auc'  # specify your evaluation metric here
    predictor = task.fit(train_data=train_data,
                         label=label_column,
                         eval_metric=metric,
                         time_limits=time_limits)
    results = predictor.fit_summary()

    # print("AutoGluon infers problem type is: ", predictor.problem_type)
    # print("AutoGluon identified the following types of features:")
    # print(predictor.feature_metadata)

    # results.to_csv('111.csv')
    # data_utils.data_to_excel(results)
    # predictor.leaderboard(train_data, silent=True)
    # print(results)
Example 21
def run_tabular_benchmarks(fast_benchmark,
                           subsample_size,
                           perf_threshold,
                           seed_val,
                           fit_args,
                           dataset_indices=None,
                           run_distill=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # Information about each dataset in benchmark is stored in dict.
    # performance_val = expected performance on this dataset (lower = better); should update based on previously run benchmarks
    binary_dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY,
                      'label_column': 'class',
                      'performance_val': 0.129}  # Mixed types of features.

    multi_dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
                     'name': 'CoverTypeMulticlassClassification',
                     'problem_type': MULTICLASS,
                     'label_column': 'Cover_Type',
                     'performance_val': 0.032}  # big dataset with 7 classes, all features are numeric. Runs SLOW.

    regression_dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
                          'name': 'AmesHousingPriceRegression',
                          'problem_type': REGRESSION,
                          'label_column': 'SalePrice',
                          'performance_val': 0.076}  # Regression with mixed feature-types, skewed Y-values.

    toyregres_dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
                         'name': 'toyRegression',
                         'problem_type': REGRESSION,
                         'label_column': 'y',
                         'performance_val': 0.183}
    # 1-D toy deterministic regression task with: heavy label+feature missingness, extra distraction column in test data

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = [
        toyregres_dataset, binary_dataset, regression_dataset, multi_dataset
    ]
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(
        datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(
                directory_prefix=directory_prefix,
                train_file=train_file,
                test_file=test_file,
                name=dataset['name'],
                url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
                mx.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" %
                  (dataset['name'], idx + 1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(
                savedir, ignore_errors=True
            )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError(
                        "fast_benchmark specified without subsample_size")
                train_data = train_data.head(
                    subsample_size)  # subsample for fast_benchmark
            predictor = task.fit(train_data=train_data,
                                 label=label_column,
                                 output_directory=savedir,
                                 **fit_args)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn(
                    "For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                    % (dataset['name'], predictor.problem_type,
                       dataset['problem_type']))
            predictor = task.load(
                savedir)  # Test loading previously-trained predictor from file
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test,
                                                       y_pred=y_pred,
                                                       auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict[
                    'accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict[
                    'r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (dataset['name'], performance_vals[idx],
                   dataset['performance_val']))
            if (not fast_benchmark) and (
                    performance_vals[idx] >
                    dataset['performance_val'] * perf_threshold):
                warnings.warn(
                    "Performance on dataset %s is %s times worse than previous performance."
                    % (dataset['name'], performance_vals[idx] /
                       (EPS + dataset['performance_val'])))
            if run_distill:
                predictor.distill(time_limits=60,
                                  augment_args={'size_factor': 0.5})
    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" %
              (datasets[idx]['name'], performance_vals[idx],
               datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn(
                "Average Performance is %s times worse than previously." %
                (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn(
                "Median Performance is %s times worse than previously." %
                (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn(
                "Worst Performance is %s times worse than previously." %
                (worst_perf / (EPS + previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
Example 22
    else:
        excluded_model_types = []

    # Create output directory
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    (X_train, y_train), (X_valid, y_valid) = load_data(use_test=False)

    df_train = convert_to_dataframe(X_train, y_train)
    df_valid = convert_to_dataframe(X_valid, y_valid)

    predictor = task.fit(
        train_data=task.Dataset(df=df_train),
        tuning_data=task.Dataset(df=df_valid),
        label="label",
        output_directory=output_dir,
        time_limits=args.walltime,
        hyperparameter_tune=True,
        auto_stack=True,
        excluded_model_types=excluded_model_types,
    )
else:
    _, (X_test, y_test) = load_data(use_test=True)

    print("Convert arrays to DataFrame...")
    df_test = convert_to_dataframe(X_test, y_test)

    print("Loading models...")
    predictor = task.load(output_dir, verbosity=4)

    print("Predicting...")
    t1 = time.time()
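    # The excerpt ends here; a plausible continuation (assumed, not from the
    # original) would time the batch prediction:
    y_pred = predictor.predict(task.Dataset(df=df_test))
    print("Prediction took %.1f seconds" % (time.time() - t1))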
Example 23
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'agModels/'

label_column = dataset['label_column']
train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
print(train_data.head())

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     problem_type='multiclass',
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits)

# Distill ensemble-predictor into single model:
time_limits = 60  # set = None to fully train distilled models

# aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here:
aug_data = task.Dataset(file_path=train_file_path)
aug_data = aug_data.head(subsample_size)  # subsample for faster demo

distilled_model_names = predictor.distill(
    time_limits=time_limits, augment_args={'num_augmented_samples': 100}
)  # default distillation (time_limits & augment_args are also optional, here set to suboptimal values to ensure quick runtime)
Example 24
def test_tabularHPO():
    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(
        datasets)  # performance obtained in this run
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            seed(seed_val)
            np.random.seed(seed_val)
            mx.random.seed(seed_val)
            dataset = datasets[idx]
            print("Evaluating Benchmark Dataset %s (%d of %d)" %
                  (dataset['name'], idx + 1, len(datasets)))
            directory = dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (
                    not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" %
                      (dataset['name'], dataset['url']))
                os.system("wget " + dataset['url'] +
                          " -O temp.zip && unzip -o temp.zip && rm temp.zip")

            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(
                savedir, ignore_errors=True
            )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                train_data = train_data.head(
                    subsample_size)  # subsample for fast_benchmark
            predictor = None  # reset from last Dataset
            if fast_benchmark:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     hyperparameters=hyperparameters,
                                     time_limits=time_limits,
                                     num_trials=num_trials,
                                     verbosity=verbosity)
            else:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     verbosity=verbosity)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn(
                    "For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                    % (dataset['name'], predictor.problem_type,
                       dataset['problem_type']))
            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test,
                                                       y_pred=y_pred,
                                                       auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict[
                    'accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict[
                    'r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (dataset['name'], performance_vals[idx],
                   dataset['performance_val']))
            if (not fast_benchmark) and (
                    performance_vals[idx] >
                    dataset['performance_val'] * perf_threshold):
                warnings.warn(
                    "Performance on dataset %s is %s times worse than previous performance."
                    % (dataset['name'], performance_vals[idx] /
                       (EPS + dataset['performance_val'])))

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" %
              (datasets[idx]['name'], performance_vals[idx],
               datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn(
                "Average Performance is %s times worse than previously." %
                (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn(
                "Median Performance is %s times worse than previously." %
                (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn(
                "Worst Performance is %s times worse than previously." %
                (worst_perf / (EPS + previous_worst_performance)))

    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
Example 25
def processData(data,
                label_column=None,
                output_directory=None,
                ag_predictor=None,
                problem_type=None,
                eval_metric=None):
    """ Converts pandas Dataframe to matrix of entirely numerical values (stored in DataFrame).
        Performs same data preprocessing as used for AutoGluon's tabular neural network model, 
        to deal with issues such as: missing value imputation, one-hot encoding of categoricals, 
        handling of high-cardinality categoricals, handling unknown categorical feature-levels at test-time, etc.
        
        If ag_predictor is not None, uses existing autogluon predictor object to process data (must have tabularNN as first model).
        To process training data, ag_predictor should = None. For test data, should != None.
        Returns:
            Tuple (X, y, ag_predictor)
            where y may be None if labels are not present in test data.
    """

    # fit dummy neural network model just to preprocess data. Here we ensure no embedding layers are used.
    if ag_predictor is None:
        if label_column is None:
            raise ValueError(
                "when processing training data, label_column cannot be None")
        elif label_column not in data.columns:
            raise ValueError(
                "label_column cannot be missing from training data")
        ag_predictor = task.fit(train_data=task.Dataset(data),
                                tuning_data=task.Dataset(data),
                                label=label_column,
                                hyperparameter_tune=False,
                                problem_type=problem_type,
                                eval_metric=eval_metric,
                                hyperparameters={
                                    'NN': {
                                        'num_epochs': 0,
                                        'proc.embed_min_categories': np.inf
                                    }
                                },
                                num_bagging_folds=0,
                                stack_ensemble_levels=0,
                                label_count_threshold=1,
                                verbosity=2,
                                feature_generator_kwargs={
                                    'enable_nlp_vectorizer_features': False,
                                    'enable_nlp_ratio_features': False
                                })

    model = ag_predictor._trainer.load_model(
        ag_predictor._trainer.get_model_names_all()
        [0])  # This must be the neural net model which contains data processor
    if 'NeuralNet' not in model.name:
        raise ValueError(
            "Data preprocessing error. This model should be the NeuralNet, not the: %s"
            % model.name)
    bad_inds = []  # row-indices to remove from dataset
    if label_column is not None and label_column in data.columns:
        label_cleaner = ag_predictor._learner.label_cleaner
        y = data[label_column].values
        data = data.drop([label_column], axis=1, inplace=False)
        y = label_cleaner.transform(y)
        if np.sum(y.isna()) > 0:
            bad_inds = y.index[y.apply(np.isnan)].tolist(
            )  # remove these inds as label is NaN (due to very rare classes)
            warnings.warn(
                "Dropped these rows from data in preprocessing, due to missing labels: "
                + str(bad_inds))
    else:
        y = None
    data_initial_processed = ag_predictor._learner.transform_features(
        data)  # general autogluon data processing.
    # data_fg = ag_predictor._learner.general_data_processing(X=data, X_test=data, holdout_frac=0.0, num_bagging_folds=0)
    tabNN_data = model.process_data(
        data_initial_processed, is_test=True
    )  # neural net-specific autogluon data processing required to turn tabular data into numerical matrix.
    numeric_data = tabNN_data.dataset._data  # list of mxnet.NDArrays
    if len(numeric_data) != 1:
        raise ValueError("Data Preprocessing failed.")
    numpy_data = numeric_data[0].asnumpy()  # 2D Numpy array
    X = pd.DataFrame(numpy_data)
    X.columns = ['feature' + str(i) for i in range(X.shape[1])]
    if len(bad_inds) > 0:
        y.drop(index=bad_inds, inplace=True)
        X.drop(index=bad_inds, axis=0, inplace=True)
    return (X, y, ag_predictor)
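A sketch of how `processData` might be called; `train_df`, `test_df`, and the label/metric values are placeholders, not from the original:

# Fit the dummy predictor once on the training frame and process it:
X_train, y_train, ag_predictor = processData(train_df,
                                             label_column='class',
                                             problem_type='binary',
                                             eval_metric='accuracy')
# Reuse the fitted predictor so the test frame is processed identically:
X_test, y_test, _ = processData(test_df,
                                label_column='class',
                                ag_predictor=ag_predictor)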
Example 26
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {
        'url':
        'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
        'name': 'AdultIncomeBinaryClassification',
        'problem_type': BINARY
    }
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(
        f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}"
    )
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(
        savedir, ignore_errors=True
    )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data,
                         label=label,
                         output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data,
                                              extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert (set(feature_importances.keys()) == original_features)
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Repeated calls persist nothing new
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted

    # Raise exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    assert predictor.get_model_names_persisted() == []

    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: they should not be persisted when loaded back.
    predictor_loaded = TabularPredictor.load(output_directory=predictor.output_directory)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(dataset=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert (predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert (len(predictor.get_model_full_dict()) == num_models)
    assert (len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert (len(predictor.get_model_full_dict()) == num_models)
    assert (len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that a dry run (the default) doesn't delete models
    assert (len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[],
                            dry_run=False)  # Test that dry_run=False actually deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:
        pass
    else:
        raise AssertionError(
            'predictor.predict should raise exception after all models are deleted'
        )
    print('Tabular Advanced Functionality Test Succeeded.')
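A note on the final assertion: the try/except/else pattern works, but since the test already uses pytest.raises for the persist_models check, the same intent can be expressed more directly; a minimal equivalent sketch:

with pytest.raises(Exception):
    # Any exception type is acceptable; the test only requires that
    # predict() fail once every model has been deleted.
    predictor.predict(dataset=test_data)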
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir
    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    subsample_size = int(args.train_rows)  # subsample a subset of the data for a faster demo; try much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring

        # Leaderboard on test data
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
        print(format_for_print(leaderboard), end='\n\n')

        # Feature importance on test data
        # Note: Feature importance must be calculated on held-out (test) data.
        # If calculated on training data it will be biased due to overfitting.
        if args.feature_importance:
            print('Feature importance:')
            # Increase rows to print feature importance
            pd.set_option('display.max_rows', 500)
            print(predictor.feature_importance(test_data))

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")

    return predictor
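These SageMaker entry points assume an args namespace built elsewhere in the script. A minimal sketch of how the fields used above might be parsed, assuming the standard SageMaker environment variables and a fit_args hyperparameter passed as a Python-literal string (all defaults here are illustrative):

import argparse
import ast
import os

def parse_args():
    parser = argparse.ArgumentParser()
    # SageMaker exposes channel and model locations as environment variables.
    parser.add_argument('--model-dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAIN', ''))
    parser.add_argument('--test', type=str,
                        default=os.environ.get('SM_CHANNEL_TEST', ''))
    parser.add_argument('--train_rows', type=int, default=1000)
    # e.g. --fit_args "{'label': 'y'}" via the estimator's hyperparameters
    parser.add_argument('--fit_args', type=ast.literal_eval,
                        default={'label': 'y'})
    return parser.parse_args()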
for seed in seeds:
    with mlflow.start_run(run_name='autogluon'):
        # Create output directory for auto gluon
        models_dir = 'AutogluonModels'
        random_dir = ''.join(random.choices(string.ascii_uppercase +
                                            string.digits, k=12))
        output_dir = f'{models_dir}{os.sep}{random_dir}'
        os.makedirs(output_dir)  # makedirs: the AutogluonModels parent may not exist yet
        # Split data into two parts (train, valid)
        train, valid = train_test_split(data, random_state=seed)
        predictor = task.fit(train_data=train,
                             label=target_column,
                             problem_type='regression',
                             eval_metric=custom_metric,
                             stopping_metric=custom_metric,
                             hyperparameters=hyper_parameters,
                             stack_ensemble_levels=2,
                             time_limits=run_time_secs,
                             cache_data=False,
                             verbosity=2,
                             output_directory=output_dir)
        test_data = valid
        y_test = test_data[target_column]  # values to predict
        test_data_nolab = test_data.drop(labels=[target_column], axis=1)  # drop the label column so the model cannot see it
        # This is a regression task scored with RMSLE, which is computed on the
        # predicted values themselves, so predict() is the right call here
        # (predict_proba() only applies to classification).
        y_pred = predictor.predict(test_data_nolab)
        score = RMSLE(y_test, y_pred)
        mlflow.log_metric('RMSLE', score)
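RMSLE (and custom_metric) are defined outside this snippet; a minimal sketch of the scorer the call appears to expect, assuming non-negative regression targets:

import numpy as np

def RMSLE(y_true, y_pred):
    # Root mean squared logarithmic error; clip negative predictions
    # so that log1p stays defined.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), 0)
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))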
def train(args):
    model_output_dir = f'{args.output_dir}/data'
    
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    
    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)
    
    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )
    
    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)
    
    # ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:      
                print('Feature importance:')
                # Increase rows to print feature importance                
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(feature_importance, columns=['Importance score']).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)
            
            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                
                X_test = test_data.drop(args.fit_args['label'], axis=1)
                y_test_true = test_data[args.fit_args['label']]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)
                
                report_dict = classification_report(y_test_true, y_test_pred, output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)
                
                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')  # save before show(), otherwise the canvas may already be cleared
                plt.show()
                
                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels, predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
Example no. 30
# data_config and load_data are defined elsewhere in the original file.
res = {}  # AUC per dataset; the original fragment used res without initializing it
for data_name in data_config.keys():

    hyperparameters = {'NN': {}}
    ylabel = data_config[data_name]['ylabel']

    X_train, X_valid = load_data(data_name, combine_y=True)
    train_data = task.Dataset(df=X_train)
    test_data = task.Dataset(df=X_valid)
    savedir = f'{data_name}/'  # where to save trained models
    predictor = task.fit(
        train_data=train_data,
        label=ylabel,
        output_directory=savedir,
        eval_metric='roc_auc',
        stack_ensemble_levels=0,
        # auto_stack=True,
        num_bagging_folds=5,
        verbosity=2,
        visualizer='tensorboard',
        random_seed=0,
        save_space=True,
        keep_only_best=True,
        hyperparameters=hyperparameters)
    auc = predictor.evaluate(X_valid)
    res[data_name] = auc

print(res)

import pickle
with open('WideDeep_result.pickle', 'wb') as f:
    pickle.dump(res, f)
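The pickled results can then be read back in a later session:

import pickle

with open('WideDeep_result.pickle', 'rb') as f:
    res = pickle.load(f)
print(res)  # {'<data_name>': <auc>, ...}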