Example no. 1
def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label': 'y',
               'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with:
    #   feature missingness, out-of-vocabulary feature categories in test data,
    #   out-of-vocabulary labels in test data, a training column missing from test data,
    #   and extra distraction columns in test data
    # toyclassif_dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # KeyError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']

    # Additional warning that would have occurred if the KeyError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']

    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=dataset['label'], path=savedir).fit(train_data, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data has missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
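For readers who want to reproduce the missing-column failure without downloading the toy dataset, here is a self-contained sketch (synthetic data; the column names, hyperparameters and time_limit are illustrative, and the exact exception class raised for a missing required column can vary across AutoGluon versions, hence the broad except clause):

import pandas as pd
from autogluon.tabular import TabularPredictor

# Small synthetic 4-class dataset; 'lostcolumn' varies so it is kept as a required feature.
toy_train = pd.DataFrame({'feat': [1, 2, 3, 4] * 25,
                          'lostcolumn': list(range(100)),
                          'y': ['a', 'b', 'c', 'd'] * 25})
toy_test = toy_train.drop(columns=['lostcolumn', 'y'])  # 'lostcolumn' is absent at inference time
toy_predictor = TabularPredictor(label='y').fit(toy_train, hyperparameters={'GBM': {}}, time_limit=60)
try:
    toy_predictor.predict(toy_test)
except (KeyError, ValueError) as e:
    print(f'Missing-column error raised as expected: {e}')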
Example no. 2
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # The two sentence columns are unordered, so augment manually by swapping them.
        train_df_other_part = pd.DataFrame({feature_columns[0]: train_df[feature_columns[1]],
                                            feature_columns[1]: train_df[feature_columns[0]],
                                            label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal',
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # In 'single' mode, we just use TextPredictor, which trains a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
Example no. 3
    class AGLearner(object):
        def __init__(self, path=None):
            self.path = path

        def fit(self, x, y):
            '''Fit an AutoGluon TabularPredictor on array-like features x and targets y.'''
            x = x if len(x.shape) > 1 else x[:, None]
            y = y if len(y.shape) > 1 else y[:, None]
            x_columns = ['x_%d' % i for i in range(x.shape[1])]
            self.x_columns = x_columns
            y_column = 'target'
            columns = x_columns + [y_column]

            train_data = pd.DataFrame(np.concatenate([x, y], axis=1),
                                      columns=columns)
            # problem_type, eval_metric, verbosity, sample_weight, weight_evaluation,
            # groups, kwargs and fit_kwargs are expected to come from the enclosing
            # scope of this fragment.
            self._model = TabularPredictor(
                y_column, problem_type=problem_type, eval_metric=eval_metric,
                path=self.path, verbosity=verbosity, sample_weight=sample_weight,
                weight_evaluation=weight_evaluation, groups=groups,
                **kwargs).fit(train_data, **fit_kwargs)

        def predict(self, x):
            '''Predict with the fitted TabularPredictor on array-like features x.'''
            assert hasattr(self, '_model'), 'The model has not been fitted yet'
            x = x if len(x.shape) > 1 else x[:, None]
            if not hasattr(self, 'x_columns'):
                self.x_columns = ['x_%d' % i for i in range(x.shape[1])]
            assert x.shape[1] == len(
                self.x_columns
            ), 'x has a shape incompatible with training data'
            data = pd.DataFrame(x, columns=self.x_columns)
            y_pred = self._model.predict(data, as_pandas=False)
            return y_pred

        @property
        def feature_importances_(self):
            try:
                importance_df = self._model.feature_importance()
                importances = [
                    importance_df.at[col, 'importance']
                    for col in self.x_columns
                ]
                return importances
            except Exception:
                return []

        def save(self, path):
            # NOTE: `path` is ignored here; TabularPredictor.save() writes to the
            # path the predictor was created with.
            self._model.save()

        @classmethod
        def load(cls, path):
            learner = AGLearner(path=path)
            learner._model = TabularPredictor.load(path)
            return learner
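A minimal usage sketch for the sklearn-style wrapper above (hypothetical: it assumes AGLearner is reachable at module scope and that problem_type, eval_metric, verbosity, sample_weight, weight_evaluation, groups, kwargs and fit_kwargs are defined in the enclosing scope, as the fragment implies):

import numpy as np

X = np.random.rand(200, 4)            # 200 rows, 4 numeric features
y = (X[:, 0] > 0.5).astype(int)       # toy binary target
learner = AGLearner(path='ag_learner_output')
learner.fit(X, y)                     # builds a DataFrame with columns x_0..x_3/target and fits TabularPredictor
preds = learner.predict(X)            # numpy array (predict is called with as_pandas=False)
print(learner.feature_importances_)   # importances aligned with x_0..x_3 (empty list on failure)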
Example no. 4
def train(args):
    model_output_dir = f'{args.output_dir}/data'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models

    args.init_args['path'] = args.model_dir
    #args.fit_args.pop('label', None)
    predictor = TabularPredictor(**args.init_args).fit(train_data,
                                                       **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    #model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_src = os.path.join(args.model_dir,
                                           'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir,
                                           'SummaryOfModels.html')

    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'),
           format='png',
           prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv',
                               index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)

                print(feature_importance_df)
                feature_importance_df.to_csv(
                    f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test,
                                                           as_multiclass=True)

                report_dict = classification_report(
                    y_test_true,
                    y_test_pred,
                    output_dict=True,
                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(
                    f'{model_output_dir}/classification_report.csv',
                    index=True)

                cm = confusion_matrix(y_test_true,
                                      y_test_pred,
                                      labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels,
                                     predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save the figure before calling show(); on interactive backends,
                # show() can leave an empty canvas for a subsequent savefig().
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob,
                            predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
Example no. 5
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(args.train_file,
                                                                              args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(args.train_file,
                                                                        args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(args.train_file, args.test_file)
    else:
        raise NotImplementedError

    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # In 'single' mode, we just use TextPredictor, which trains a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      presets=args.preset,
                      seed=args.seed)
    else:
        raise NotImplementedError
    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    else:
        raise NotImplementedError
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be a local CSV file as well; returns a pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label = 'class'  # specifies which column we want to predict
save_path = 'ag_models/'  # where to save trained models

predictor = TabularPredictor(label=label, path=save_path).fit(train_data)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary()

# Inference time:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another pandas DataFrame
y_test = test_data[label]
test_data = test_data.drop(labels=[label], axis=1)  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = TabularPredictor.load(save_path)  # unnecessary; we reload the predictor just to demonstrate how to load a previously-trained predictor from file
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
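As an optional follow-up to the quickstart above, the per-model test scores can be inspected with the standard leaderboard method (a sketch; test_data_with_label is a name introduced here, and the exact leaderboard columns depend on the AutoGluon version):

# Re-attach the labels so leaderboard() can score every trained model on the test split:
test_data_with_label = test_data.copy()
test_data_with_label[label] = y_test
leaderboard = predictor.leaderboard(test_data_with_label, silent=True)
print(leaderboard)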
Example no. 7
def autogluon(df, task, timelife):
    pd.options.mode.chained_assignment = None
    df_new = copy.copy(df)
    X, y, _ = return_X_y(df_new)

    if isinstance(y, pd.Series):
        y = y.to_frame()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)

    if isinstance(y_train, pd.Series):
        y_train = y_train.to_frame()
    target = y_train.columns[0]

    if isinstance(y_test, pd.Series):
        y_test = y_test.to_frame()

    X_train[target] = y_train
    train = X_train

    test = X_test

    if task == 'classification':
        if len(y[y.columns[0]].unique()) > 2:
            pt = 'multiclass'
            f1 = lambda y_test, y_pred: f1_score(
                y_test, y_pred, average='weighted')
        else:
            pt = 'binary'
            f1 = lambda y_test, y_pred: f1_score(y_test, y_pred)
    else:
        pt = 'regression'
        #, path='/home/riccardo/.local/share/Trash'
    predictor = TabularPredictor(label=target, problem_type=pt).fit(
        train_data=train,
        time_limit=timelife * 60,
        presets=['optimize_for_deployment'])  # TEMPORARY -> careful, this saves to the trash folder
    results = predictor.fit_summary()

    y_pred = predictor.predict(test)

    pipelines = predictor.leaderboard(df, silent=True)  # these are the pipelines

    res = predictor.evaluate_predictions(y_true=y_test.squeeze(),
                                         y_pred=y_pred,
                                         auxiliary_metrics=True)

    shutil.rmtree('./AutogluonModels')

    if (task == 'classification'):
        '''y_test = le.fit_transform(y_test)
    y_pred = le.fit_transform(y_pred)
    if len(np.unique(y_pred)) > 2:
      f1 = f1_score(y_test, y_pred, average='weighted')s
    else:
      f1 = f1_score(y_test, y_pred)
    return (res['accuracy'], f1)'''
        return (res['accuracy'], f1(y_test, y_pred), pipelines)
    else:
        return (res['root_mean_squared_error'], res['r2'], pipelines)
Example no. 8
    predictor = TabularPredictor(path=os.path.join(args.save_dir,
                                                   args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5,
                      num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type,
                                                time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                  presets='electra_base_late_fusion_concate_e10_avg3')
    predictor.save(
        os.path.join(args.save_dir, args.model_type, time_str,
                     'text_prediction'))
predictions = predictor.predict(competition_df, as_pandas=True)
predictions.to_csv(
    os.path.join(args.save_dir, args.model_type, time_str, 'pred.csv'))
Example no. 9
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")
    save_metadata(config, version=__version__)

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    label = dataset.target.name
    problem_type = dataset.problem_type

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    load_raw = config.framework_params.get('_load_raw', False)
    if load_raw:
        train, test = load_data_raw(dataset=dataset)
    else:
        column_names, _ = zip(*dataset.columns)
        column_types = dict(dataset.columns)
        train = pd.DataFrame(dataset.train.data, columns=column_names).astype(column_types, copy=False)
        print(f"Columns dtypes:\n{train.dtypes}")
        test = pd.DataFrame(dataset.test.data, columns=column_names).astype(column_types, copy=False)

    del dataset
    gc.collect()

    output_dir = output_subdir("models", config)
    with utils.Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=output_dir,
            problem_type=problem_type,
        ).fit(
            train_data=train,
            time_limit=config.max_runtime_seconds,
            **training_params
        )

    del train

    y_test = test[label]
    test = test.drop(columns=label)

    if is_classification:
        with utils.Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with utils.Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.tolist() if probabilities is not None else None

    leaderboard = predictor.leaderboard(silent=True)  # Test data input removed to avoid long-running computation; remove the 7200s timeout limitation to re-enable it
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(leaderboard)

    save_artifacts(predictor, leaderboard, config)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(predictor._trainer.get_minimum_model_set(predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example no. 10
    ag_predictor_args["path"] = args.model_dir
    ag_fit_args = config["ag_fit_args"]

    predictor = TabularPredictor(**ag_predictor_args).fit(
        train_data, **ag_fit_args)
    logger.info("Best model: %s", predictor.get_model_best())

    # Leaderboard
    lb = predictor.leaderboard()
    lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False)
    logger.info("Saved leaderboard to output.")

    # Feature importance
    feature_importance = predictor.feature_importance(test_data)
    feature_importance.to_csv(f'{args.output_data_dir}/feature_importance.csv')
    logger.info("Saved feature importance to output.")

    # Evaluation
    evaluation = predictor.evaluate(test_data)
    with open(f'{args.output_data_dir}/evaluation.json', 'w') as f:
        json.dump(evaluation, f)
    logger.info("Saved evaluation to output.")

    predictor.save_space()

    # ---------------------------- Inference -----------------------------------

    test_data_nolabel = test_data.drop(labels=ag_predictor_args['label'],
                                       axis=1)
    y_pred = predictor.predict(test_data_nolabel)
    y_pred.to_csv(f'{args.output_data_dir}/predictions.csv', index=False)
Example no. 11
    data_adjust = data_adjust.reset_index(drop=True)
    '''
    #train
    train_data = TabularDataset(
        datasettemp.drop(
            columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR']).iloc[:-11])

    #predictor
    predictor = TabularPredictor(label='close').fit(
        train_data.drop(columns=['date', 'stock_id']))
    # , num_stack_levels=1,num_bag_folds=2)

    #test
    test_data = datasettemp.iloc[-11:len(datasettemp)]
    preds = predictor.predict(
        test_data.drop(columns=['date', 'stock_id', 'close']))
    test_hat = pd.DataFrame({
        'date': test_data['date'],
        'stock_id': test_data['stock_id'],
        'close': preds
    })
    test_hat

    prediction_data = []
    for k in range(0, 10):  # grab the data for the last 10 days
        g = []
        g.append(test_hat.iloc[k, 0])
        g.append(test_hat.iloc[k, 2])
        g.append(datasettemp.iloc[k - 10, 2])
        g.append(Quote_change(test_hat.iloc[k + 1, 2], test_hat.iloc[k, 2]))
        prediction_data.append(g)
Example no. 12
# %%
# Train AutoGLUON
train = False
if train:
    df_train = df[df["vad"] & (df[COL_LABEL] != LABEL_NONE_ID)]
    predictor = TabularPredictor(label=COL_LABEL).fit(train_data=df_train)
    print("[green]Finished training[/green]")
else:
    predictor = TabularPredictor.load("AutogluonModels/ag-20211002_203405/")
    print("[green]Loaded pre-trained model[/green]")

# predictions = predictor.predict(TEST_DATA.csv)

# %%
predictions = predictor.predict(df)

# %%
# TODO
# Set label = none where VAD=0
# Create training set, removing "none"
# Predict the whole audio and check results

# %%
# Plot waveform, features and labels
fig, axes = get_figure(n_axes=6)
(wv_points, ) = axes[0].plot(t, y)
axes[0].set_ylabel("Waveform")
axes[0].set_xlabel("time (s)")
axes[1].plot(t, is_voice, "r")
axes[1].set_ylabel("VAD")
Example no. 13
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR,
                                            'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError

    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type in (MULTICLASS, BINARY):
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
Example no. 14
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    train, test = dataset.train.path, dataset.test.path
    label = dataset.target.name
    problem_type = dataset.problem_type

    models_dir = tempfile.mkdtemp() + os.sep  # passed to AG

    with Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=models_dir,
            problem_type=problem_type,
        ).fit(train_data=train,
              time_limit=config.max_runtime_seconds,
              **training_params)

    del train

    if is_classification:
        with Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.astype(
        str).tolist() if probabilities is not None else None

    _leaderboard_extra_info = config.framework_params.get(
        '_leaderboard_extra_info',
        False)  # whether to get extra model info (very verbose)
    _leaderboard_test = config.framework_params.get(
        '_leaderboard_test',
        False)  # whether to compute test scores in leaderboard (expensive)
    leaderboard_kwargs = dict(silent=True, extra_info=_leaderboard_extra_info)
    # Leaderboard test data input is disabled by default to avoid long-running computation; remove the 7200s timeout limitation to re-enable it
    if _leaderboard_test:
        leaderboard_kwargs['data'] = test

    leaderboard = predictor.leaderboard(**leaderboard_kwargs)
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None, 'display.width', 1000):
        log.info(leaderboard)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(
            predictor._trainer.get_minimum_model_set(
                predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    save_artifacts(predictor, leaderboard, config)
    shutil.rmtree(predictor.path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example no. 15
train_data = TabularDataset(
    "../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
# train_data = train_data.drop(["Age","Room_Rate","Discount_Rate"],axis="columns")
save_path = "models_oversample_valid"
predictor = TabularPredictor(label="Reservation_Status",
                             path=save_path,
                             eval_metric="f1_macro").fit(
                                 train_data,
                                 time_limit=7200,
                                 presets="best_quality")

valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data.loc[:, "Reservation_Status"]
valid_data = valid_data.drop(["Reservation_Status"], axis="columns")

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(y_true=y_test,
                                      y_pred=y_pred,
                                      auxiliary_metrics=True)
print(perf)

test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)

test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
test_df = test_df.replace(
    {"Reservation_Status": {
        "Check-In": 1,
        "Canceled": 2,
        "No-Show": 3
    }})
Example no. 16
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False, crash_in_oof=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = get_benchmark_sets()
    if dataset_indices is not None: # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets) # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label = dataset['label']
            y_test = test_data[label]
            test_data = test_data.drop(labels=[label], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                if subsample_size < len(train_data):
                    # .sample instead of .head to increase diversity and test cases where data index is not monotonically increasing.
                    train_data = train_data.sample(n=subsample_size, random_state=seed_val)  # subsample for fast_benchmark
            predictor = TabularPredictor(label=label, path=savedir).fit(train_data, **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = TabularPredictor.load(savedir)  # Test loading previously-trained predictor from file
            y_pred_empty = predictor.predict(test_data[0:0])
            assert len(y_pred_empty) == 0
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." %
                              (dataset['name'], performance_vals[idx]/(EPS+dataset['performance_val'])))
            if predictor._trainer.bagged_mode and not crash_in_oof:
                # TODO: Test index alignment with original training data (first handle duplicated rows / dropped rows edge cases)
                y_pred_oof = predictor.get_oof_pred()
                y_pred_proba_oof = predictor.get_oof_pred_proba(as_multiclass=False)
                y_pred_oof_transformed = predictor.get_oof_pred(transformed=True)
                y_pred_proba_oof_transformed = predictor.get_oof_pred_proba(as_multiclass=False, transformed=True)

                # Assert expected type output
                assert isinstance(y_pred_oof, pd.Series)
                assert isinstance(y_pred_oof_transformed, pd.Series)
                if predictor.problem_type == MULTICLASS:
                    assert isinstance(y_pred_proba_oof, pd.DataFrame)
                    assert isinstance(y_pred_proba_oof_transformed, pd.DataFrame)
                else:
                    if predictor.problem_type == BINARY:
                        assert isinstance(predictor.get_oof_pred_proba(), pd.DataFrame)
                    assert isinstance(y_pred_proba_oof, pd.Series)
                    assert isinstance(y_pred_proba_oof_transformed, pd.Series)

                assert y_pred_oof_transformed.equals(predictor.transform_labels(y_pred_oof, proba=False))

                # Test that the transform_labels method is capable of reproducing the same output when converting back and forth, and test that oof 'transform' parameter works properly.
                y_pred_proba_oof_inverse = predictor.transform_labels(y_pred_proba_oof, proba=True)
                y_pred_proba_oof_inverse_inverse = predictor.transform_labels(y_pred_proba_oof_inverse, proba=True, inverse=True)
                y_pred_oof_inverse = predictor.transform_labels(y_pred_oof)
                y_pred_oof_inverse_inverse = predictor.transform_labels(y_pred_oof_inverse, inverse=True)

                if isinstance(y_pred_proba_oof_transformed, pd.DataFrame):
                    pd.testing.assert_frame_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_frame_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                else:
                    pd.testing.assert_series_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_series_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                pd.testing.assert_series_equal(y_pred_oof_transformed, y_pred_oof_inverse)
                pd.testing.assert_series_equal(y_pred_oof, y_pred_oof_inverse_inverse)

                # Test that index of both the internal training data and the oof outputs are consistent in their index values.
                X_internal, y_internal = predictor.load_data_internal()
                y_internal_index = list(y_internal.index)
                assert list(X_internal.index) == y_internal_index
                assert list(y_pred_oof.index) == y_internal_index
                assert list(y_pred_proba_oof.index) == y_internal_index
                assert list(y_pred_oof_transformed.index) == y_internal_index
                assert list(y_pred_proba_oof_transformed.index) == y_internal_index
            else:
                # Raise exception
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred()
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred_proba()
            if run_distill:
                predictor.distill(time_limit=60, augment_args={'size_factor':0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
Example no. 17
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=label, path=savedir).fit(train_data)
    leaderboard = predictor.leaderboard(data=test_data)
    extra_metrics = ['accuracy', 'roc_auc', 'log_loss']
    leaderboard_extra = predictor.leaderboard(data=test_data, extra_info=True, extra_metrics=extra_metrics)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    assert set(leaderboard_extra.columns).issuperset(set(extra_metrics))  # Assert that extra_metrics are present in output
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(data=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.index) == original_features
    assert set(feature_importances.columns) == {'importance', 'stddev', 'p_value', 'n', 'p99_high', 'p99_low'}
    predictor.transform_features()
    predictor.transform_features(data=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted

    # Raise exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    assert predictor.get_model_names_persisted() == []

    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: Intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(predictor.path)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(data=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert(predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(data=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_full models aren't refit again.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(data=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that dry_run=False deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(data=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
Example no. 18
from autogluon.tabular import TabularDataset, TabularPredictor
# Train
train_data = TabularDataset('train.csv')
id, label = 'PassengerId', 'Survived'
save_path = 'model'

time_limit = 300

predictor = TabularPredictor(label=label,
                             path=save_path).fit(train_data.drop(columns=[id]),
                                                 time_limit=time_limit,
                                                 presets='best_quality')
# Test
import pandas as pd

test_data = TabularDataset('test.csv')

# predictor = TabularPredictor.load(
#     save_path
# )  # unnecessary, just demonstrates how to load previously-trained predictor from file

preds = predictor.predict(test_data.drop(columns=[id]))
submission = pd.DataFrame({id: test_data[id], label: preds})
submission.to_csv('submission.csv', index=False)