Exemple #1
0
def train(args):
    """Train an AutoGluon ``TabularPredictor`` and export evaluation artifacts.

    The function reads the following attributes from ``args``:

    * ``output_dir`` -- artifacts are written below ``<output_dir>/data``.
    * ``hosts`` / ``current_host`` -- host list and the name of this host.
    * ``train`` -- directory holding the training files.
    * ``test`` -- directory holding optional test files (falsy to skip).
    * ``init_args`` -- keyword arguments for ``TabularPredictor``; must
      contain ``'label'``. Its ``'path'`` entry is overwritten with
      ``args.model_dir``.
    * ``fit_args`` -- keyword arguments forwarded to ``fit``.
    * ``model_dir`` -- directory the predictor is saved to.
    * ``feature_importance`` -- when truthy (and labeled test data exists),
      feature importance is computed and exported.

    Side effects: writes ``columns.pkl`` to the current working directory,
    writes summary/plot/CSV files below ``<output_dir>/data`` and prints a
    listing of ``/opt/ml/model/``.

    Helpers such as ``__load_input_data``, ``format_for_print``,
    ``get_roc_auc`` and ``du`` are expected to be defined elsewhere in the
    module.
    """
    model_output_dir = f'{args.output_dir}/data'

    # NOTE: the three values below are computed but not used further down in
    # this function.
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    # Build a new list instead of popping from ``args.hosts`` so that the
    # caller's host list is left untouched.
    dist_ip_addrs = [
        host for rank, host in enumerate(args.hosts) if rank != host_rank
    ]

    # Load training data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Persist the training column names
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models; the predictor is saved into the model directory
    args.init_args['path'] = args.model_dir
    predictor = TabularPredictor(**args.init_args).fit(train_data,
                                                       **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(args.model_dir,
                                           'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir,
                                           'SummaryOfModels.html')

    # The HTML summary is copied only if it was actually produced.
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # Ensemble visualization: drop isolated nodes, then render with Graphviz
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        # 'rectangle' is the valid Graphviz shape name (was misspelled).
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'),
           format='png',
           prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv',
                               index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)

                print(feature_importance_df)
                feature_importance_df.to_csv(
                    f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test,
                                                           as_multiclass=True)

                report_dict = classification_report(
                    y_test_true,
                    y_test_pred,
                    output_dict=True,
                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(
                    f'{model_output_dir}/classification_report.csv',
                    index=True)

                cm = confusion_matrix(y_test_true,
                                      y_test_pred,
                                      labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels,
                                     predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save before showing: on interactive backends the figure may
                # be released once show() returns, which would leave an empty
                # image on disk.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob,
                            predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
Exemple #2
0
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False, crash_in_oof=False):
    """Fit a TabularPredictor on each benchmark dataset and sanity-check the outcome.

    For every dataset the predictor is trained, reloaded from disk, scored on
    the test split and, depending on the trainer's bagged mode, its
    out-of-fold (OOF) prediction API is verified. Scores are stored as
    "lower is better" values (``1 - accuracy`` or ``1 - r2``) and compared
    against the ``performance_val`` recorded in each dataset definition.

    Args:
        fast_benchmark: When truthy, training data is subsampled and the
            performance-regression warnings are skipped.
        subsample_size: Number of training rows to sample in fast mode;
            must not be None when ``fast_benchmark`` is truthy.
        perf_threshold: Multiplier applied to a previous score; exceeding
            the product triggers a warning.
        seed_val: Seed passed to ``seed``, ``np.random.seed`` and the
            subsampling step; ``None`` skips the explicit seeding.
        fit_args: Keyword arguments forwarded to ``TabularPredictor.fit``.
        dataset_indices: Optional indices selecting a subset of datasets.
        run_distill: When truthy, ``predictor.distill`` is run per dataset.
        crash_in_oof: When truthy, the OOF getters are expected to raise
            ``AssertionError`` even if the trainer is in bagged mode.

    Raises:
        ValueError: If ``fast_benchmark`` is set without ``subsample_size``.
    """
    print("Running fit with args:")
    print(fit_args)
    # Every dataset lives in its own directory and uses these two file names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10  # added to denominators of the "times worse" ratios

    # One dict per benchmark dataset; optionally narrowed to the requested subset.
    datasets = get_benchmark_sets()
    if dataset_indices is not None:
        datasets = [datasets[i] for i in dataset_indices]
    num_datasets = len(datasets)

    # Reference statistics taken from the recorded scores of an earlier run.
    prev_perf_vals = [entry['performance_val'] for entry in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Scores obtained in this run, filled in dataset order.
    performance_vals = [0.0] * num_datasets
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx, dataset in enumerate(datasets):
            name = dataset['name']
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=name, url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (name, idx + 1, num_datasets))
            savedir = directory_prefix + name + "/" + 'AutogluonOutput/'
            # Start from a clean output directory so nothing from earlier runs leaks in.
            shutil.rmtree(savedir, ignore_errors=True)
            label = dataset['label']
            y_test = test_data[label]
            test_data = test_data.drop(labels=[label], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                if subsample_size < len(train_data):
                    # Random sampling (rather than taking the head) adds diversity and
                    # exercises a training index that is not monotonically increasing.
                    train_data = train_data.sample(n=subsample_size, random_state=seed_val)
            predictor = TabularPredictor(label=label, path=savedir).fit(train_data, **fit_args)
            predictor.fit_summary(verbosity=4)
            expected_problem_type = dataset['problem_type']
            if predictor.problem_type != expected_problem_type:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (name, predictor.problem_type, expected_problem_type))
            # Reload from disk to verify that a saved predictor can be restored.
            predictor = TabularPredictor.load(savedir)
            # Predicting on zero rows must yield zero predictions.
            y_pred_empty = predictor.predict(test_data[0:0])
            assert len(y_pred_empty) == 0
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            # Error rate for classification, unexplained variance for regression.
            metric_key = 'r2' if expected_problem_type == REGRESSION else 'accuracy'
            perf = 1.0 - perf_dict[metric_key]
            performance_vals[idx] = perf
            previous_perf = dataset['performance_val']
            print("Performance on dataset %s: %s   (previous perf=%s)" % (name, perf, previous_perf))
            if (not fast_benchmark) and (perf > previous_perf * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." %
                              (name, perf / (EPS + previous_perf)))
            if not predictor._trainer.bagged_mode or crash_in_oof:
                # The OOF getters are expected to fail with an AssertionError here.
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred()
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred_proba()
            else:
                # TODO: Test index alignment with original training data (first handle duplicated rows / dropped rows edge cases)
                y_pred_oof = predictor.get_oof_pred()
                y_pred_proba_oof = predictor.get_oof_pred_proba(as_multiclass=False)
                y_pred_oof_transformed = predictor.get_oof_pred(transformed=True)
                y_pred_proba_oof_transformed = predictor.get_oof_pred_proba(as_multiclass=False, transformed=True)

                # Check the container types returned by the OOF getters.
                assert isinstance(y_pred_oof, pd.Series)
                assert isinstance(y_pred_oof_transformed, pd.Series)
                if predictor.problem_type == MULTICLASS:
                    expected_proba_type = pd.DataFrame
                else:
                    if predictor.problem_type == BINARY:
                        assert isinstance(predictor.get_oof_pred_proba(), pd.DataFrame)
                    expected_proba_type = pd.Series
                assert isinstance(y_pred_proba_oof, expected_proba_type)
                assert isinstance(y_pred_proba_oof_transformed, expected_proba_type)

                assert y_pred_oof_transformed.equals(predictor.transform_labels(y_pred_oof, proba=False))

                # transform_labels must round-trip, and the getters' 'transformed'
                # flag must agree with transforming the untransformed output.
                y_pred_proba_oof_inverse = predictor.transform_labels(y_pred_proba_oof, proba=True)
                y_pred_proba_oof_inverse_inverse = predictor.transform_labels(y_pred_proba_oof_inverse, proba=True, inverse=True)
                y_pred_oof_inverse = predictor.transform_labels(y_pred_oof)
                y_pred_oof_inverse_inverse = predictor.transform_labels(y_pred_oof_inverse, inverse=True)

                if isinstance(y_pred_proba_oof_transformed, pd.DataFrame):
                    assert_proba_equal = pd.testing.assert_frame_equal
                else:
                    assert_proba_equal = pd.testing.assert_series_equal
                assert_proba_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                assert_proba_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                pd.testing.assert_series_equal(y_pred_oof_transformed, y_pred_oof_inverse)
                pd.testing.assert_series_equal(y_pred_oof, y_pred_oof_inverse_inverse)

                # Internal training data and every OOF output must share one index.
                X_internal, y_internal = predictor.load_data_internal()
                y_internal_index = list(y_internal.index)
                indexed_outputs = (
                    X_internal,
                    y_pred_oof,
                    y_pred_proba_oof,
                    y_pred_oof_transformed,
                    y_pred_proba_oof_transformed,
                )
                for indexed in indexed_outputs:
                    assert list(indexed.index) == y_internal_index
            if run_distill:
                predictor.distill(time_limit=60, augment_args={'size_factor':0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for dataset, perf in zip(datasets, performance_vals):
        print("Performance on dataset %s: %s   (previous perf=%s)" % (dataset['name'], perf, dataset['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    # Aggregate regression checks only apply to full (non-fast) benchmark runs.
    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf / (EPS + previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # Re-issue every warning recorded during the run so that none goes unnoticed.
    print("\n\n WARNINGS:")
    for recorded in caught_warnings:
        warnings.warn(recorded.message)
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# ---- Training ----
label = 'class'  # column to predict
save_path = 'ag_models/'  # directory where the trained models are stored

# The source may also be a local CSV file; only the first 500 rows are kept so the demo runs quickly.
train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv').head(500)
print(train_data.head())

predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data)
# NOTE: The default settings favour a reasonable runtime over accuracy. To maximize predictive accuracy, use instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary()

# ---- Inference ----
test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]
# Remove the label column from the test table: it would not be available in practice.
test_data = test_data.drop(columns=[label])
print(test_data.head())

# Reloading is not required here; it only demonstrates restoring a previously trained predictor from disk.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
def autogluon(df, task, timelife):
    """Train an AutoGluon TabularPredictor on a 70/30 split of ``df`` and score it.

    Args:
        df: Input table; it is copied before being split into features and
            label by ``return_X_y``.
        task: ``'classification'`` selects a binary/multiclass problem
            (multiclass when the label has more than two unique values);
            any other value is handled as regression.
        timelife: Training budget; multiplied by 60 before being passed as
            ``time_limit``.

    Returns:
        ``(accuracy, f1, leaderboard)`` for classification, otherwise
        ``(root_mean_squared_error, r2, leaderboard)``.

    Side effects: disables pandas' chained-assignment warning globally and
    deletes the ``./AutogluonModels`` directory after evaluation.
    """
    pd.options.mode.chained_assignment = None

    def _as_frame(obj):
        # Promote a Series to a one-column DataFrame; pass anything else through.
        return obj.to_frame() if isinstance(obj, pd.Series) else obj

    X, y, _ = return_X_y(copy.copy(df))
    y = _as_frame(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)
    y_train = _as_frame(y_train)
    y_test = _as_frame(y_test)
    target = y_train.columns[0]

    # The label is attached to the training features as an extra column.
    X_train[target] = y_train

    is_classification = task == 'classification'
    if not is_classification:
        problem_type = 'regression'
    elif len(y[y.columns[0]].unique()) > 2:
        problem_type = 'multiclass'

        def f1(truth, predicted):
            return f1_score(truth, predicted, average='weighted')
    else:
        problem_type = 'binary'

        def f1(truth, predicted):
            return f1_score(truth, predicted)

    # TEMPORARY: no explicit `path` is passed (an earlier variant saved into a
    # trash folder); the models directory is removed again further below.
    predictor = TabularPredictor(label=target, problem_type=problem_type)
    predictor = predictor.fit(train_data=X_train,
                              time_limit=timelife * 60,
                              presets=['optimize_for_deployment'])
    predictor.fit_summary()

    y_pred = predictor.predict(X_test)

    # These are the pipelines reported back to the caller; note that the
    # leaderboard is computed on the full input `df`, not only the held-out part.
    pipelines = predictor.leaderboard(df, silent=True)

    res = predictor.evaluate_predictions(y_true=y_test.squeeze(),
                                         y_pred=y_pred,
                                         auxiliary_metrics=True)

    shutil.rmtree('./AutogluonModels')

    if is_classification:
        return (res['accuracy'], f1(y_test, y_pred), pipelines)
    return (res['root_mean_squared_error'], res['r2'], pipelines)
Exemple #5
0
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    },
    'XGB': {
        'n_estimators': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}

predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs='auto',
    time_limit=60)

results = predictor.fit_summary()  # display detailed summary of fit() process
print(results)

# Inference time:
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
print(test_data.head())

perf = predictor.evaluate(
    test_data
)  # shorthand way to evaluate our predictor if test-labels are available

# Otherwise we make predictions and can evaluate them later:
y_pred = predictor.predict_proba(test_data)
perf = predictor.evaluate_predictions(y_true=test_data[label],
Exemple #6
0
    # Enter text for testing
    s = 'pd.DataFrame'
    sample_dtypes = {
        'list': [1, 'a', [2, 'c'], {
            'b': 2
        }],
        'str': 'Hello Streamlit!',
        'int': 17,
        'float': 17.0,
        'dict': {
            1: 'a',
            'x': [2, 'c'],
            2: {
                'b': 2
            }
        },
        'bool': True,
        'pd.DataFrame': y_predproba
    }
    sample_dtypes = sample_dtypes
    # Download predictor
    st.write("予測・分類結果をダウンロードします。※分類問題においては各ラベルに対する確率で表示されています。")
    download_button_str = download_button(
        sample_dtypes[s], "prediction.csv",
        'Click here to download prediction.csv')
    st.markdown(download_button_str, unsafe_allow_html=True)
    st.write(predictor.fit_summary())
else:
    st.write("チェックを入れると教師データによる学習モデルの作成とテストデータに対する計算が行われます")
Exemple #7
0
from autogluon.tabular import TabularDataset, TabularPredictor

# Training phase
# The CSV may equally be a local file; only the first 500 rows are used to keep the demo fast.
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv').head(500)
print(train_data.head())
label = 'class'  # name of the column to predict
save_path = 'ag_models/'  # output directory for the trained models

predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data)
# NOTE: These defaults aim for a reasonable runtime at the expense of accuracy. For maximum predictive accuracy use:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary(show_plot=True)

# Inference phase
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]
# Strip the labels from the test table, since they would be unknown in practice.
test_data = test_data.drop(labels=[label], axis=1)
print(test_data.head())

# Not strictly needed: the reload merely shows how a previously trained predictor is restored from disk.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data)