Example 1
def train_model(df_train: pd.DataFrame,
                df_test: pd.DataFrame,
                label: str,
                verbosity: int = 0,
                random_state: int = 0) -> TabularPredictor:
    """
    Train an AutoGluon model on df_train, tuning on df_test. Specify the
    label column. Optionally, set verbosity to control how much output
    AutoGluon produces during training.

    The function caches models that have been trained on the same data by
    computing the hash of df_train and comparing that to existing models.

    Returns the predictor object.

    TODO: Optimize this bad boy for experiments: k-fold cross-validation
    instead of the train-test split, an AG preset that opts for the
    highest-quality model, and no (or a very high) time_limit.
    """
    logger = logging.getLogger('pfd')
    d = 'agModels'  # folder to store trained models
    checksum = calculate_model_hash(df_train, label, random_state)
    model_path = f'{d}/{checksum}'
    logger.info(f'Calculated a checksum of {checksum}.')
    try:
        predictor = TabularPredictor.load(model_path)
    except FileNotFoundError:
        logger.info("Didn't find a model to load from the cache.")
        p = TabularPredictor(label=label, path=model_path)
        predictor = p.fit(train_data=df_train,
                          tuning_data=df_test,
                          time_limit=20,
                          verbosity=verbosity,
                          presets='medium_quality_faster_train')
    return predictor
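The calculate_model_hash helper used above is not shown. A minimal sketch,
assuming all it needs is a deterministic fingerprint of the training data,
label, and seed (the exact scheme is an assumption):

import hashlib

import pandas as pd


def calculate_model_hash(df_train: pd.DataFrame, label: str,
                         random_state: int) -> str:
    """Deterministic fingerprint of the training data and run settings."""
    # hash_pandas_object yields a stable uint64 hash per row; fold in the
    # label name and the seed so different targets/seeds never share a
    # cache directory.
    h = hashlib.sha256(
        pd.util.hash_pandas_object(df_train, index=True).values.tobytes())
    h.update(label.encode('utf-8'))
    h.update(str(random_state).encode('utf-8'))
    return h.hexdigest()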
Example 2
    def train(self,
              train_data,
              eval_metric=EVAL_METRIC,
              quality=QUALITY,
              time_limit=TIME_LIMIT,
              verbosity=VERBOSITY):
        """Train prospective models."""
        # predictor gives us default access to the *best* predictor that
        # was trained on the task (otherwise we're just wrapping AutoGluon)

        # create a custom feature generator to force AutoGluon to use our
        # features as they are
        fg = AutoMLPipelineFeatureGenerator(enable_categorical_features=False,
                                            enable_datetime_features=False,
                                            enable_text_special_features=False,
                                            enable_text_ngram_features=False)
        # create our own feature metadata object, since we know the type of
        # every feature we have; skip the label column when doing so
        fmd = FeatureMetadata(dict.fromkeys(train_data.columns[:-1], 'int'))

        task = TabularPredictor(
            label='label',
            eval_metric=eval_metric,
            path=self.outpath,
            verbosity=verbosity,
        )
        return task.fit(train_data=train_data,
                        time_limit=time_limit,
                        presets=self.QUALITY_PRESETS[quality],
                        feature_generator=fg,
                        feature_metadata=fmd)
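For reference, a hedged sketch of the module-level defaults and the
QUALITY_PRESETS mapping this method relies on; the names come from the
signature above, but the concrete values are assumptions:

EVAL_METRIC = 'roc_auc'
QUALITY = 'medium'
TIME_LIMIT = 3600
VERBOSITY = 2

# assumed class attribute mapping quality labels to AutoGluon presets
QUALITY_PRESETS = {
    'medium': 'medium_quality_faster_train',
    'best': 'best_quality',
}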
Example 3
    def _fit(self, X: List[Config[ModelConfig]],
             y: npt.NDArray[np.float32]) -> None:
        X_numpy = self.config_transformer.fit_transform(X)

        # We need to train one predictor per output feature
        self.predictors = []
        for i in range(y.shape[1]):
            df = pd.DataFrame(np.concatenate([X_numpy, y[:, i:i + 1]],
                                             axis=-1))
            predictor = TabularPredictor(
                label=df.shape[1] - 1,  # the label is the last (integer-named) column
                problem_type="regression",
                eval_metric="root_mean_squared_error",
            )
            predictor.fit(df, time_limit=self.time_limit, verbosity=0)
            self.predictors.append(predictor)
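The matching inference path is not shown; a plausible counterpart (the
method name and output stacking are assumptions), reusing the fitted
transformer and stacking one prediction column per predictor:

    def _predict(self, X: List[Config[ModelConfig]]) -> npt.NDArray[np.float32]:
        X_numpy = self.config_transformer.transform(X)
        df = pd.DataFrame(X_numpy)  # same integer column names as in _fit
        # one predictor was fit per output column; stack their predictions
        # back into an (n_samples, n_outputs) array
        preds = [p.predict(df).to_numpy() for p in self.predictors]
        return np.stack(preds, axis=1).astype(np.float32)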
Example 4
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # Augmenting the un-ordered set manually.
        train_df_other_part = pd.DataFrame({feature_columns[0]: train_df[feature_columns[1]],
                                            feature_columns[1]: train_df[feature_columns[0]],
                                            label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal',
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # When no embedding is used, we will just use TextPredictor, which
        # will train a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=real_train_df,
                      tuning_data=real_dev_df,
                      seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
Example 5
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(args.train_file,
                                                                              args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(args.train_file,
                                                                        args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(args.train_file, args.test_file)
    else:
        raise NotImplementedError

    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # When no embedding is used, we will just use TextPredictor, which
        # will train a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      presets=args.preset,
                      seed=args.seed)
    else:
        raise NotImplementedError
    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    else:
        raise NotImplementedError
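The inverse maps above (np.power(10, predictions) - 1 and
np.exp(test_predictions) - 1) imply that the dataset loaders log-compress
the heavy-tailed price targets before training; a sketch of the assumed
forward transform inside the loaders:

# assumed inside load_price_of_books / load_mercari_price_prediction
train_df[label_column] = np.log10(train_df[label_column] + 1)  # price_of_books
train_df[label_column] = np.log(train_df[label_column] + 1)    # mercari_price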
Example 6
if args.model_type == 'base':
    tabular_hparams = get_tabular_hparams(
        electra_base_late_fusion_concate_e10_avg3())
elif args.model_type == 'large':
    tabular_hparams = get_tabular_hparams(
        electra_large_late_fusion_concate_e10_avg3())
else:
    raise NotImplementedError

time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
if args.ensemble_type == 'weighted' or args.ensemble_type == 'stack':
    predictor = TabularPredictor(path=os.path.join(args.save_dir,
                                                   args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5,
                      num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type,
                                                time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    predictor.fit(concat_df[feature_columns + [label_columns[0]]])
Example 7
all_train_data = all_train_data[keep_ind]

train_data, test_data = train_test_split(all_train_data,
                                         test_size=0.2,
                                         random_state=np.random.RandomState(seed))

train_data.to_csv(os.path.join(directory, output_subdir, train_name), index=False)
test_data.to_csv(os.path.join(directory, output_subdir, test_name), index=False)
print(f'#Train={len(train_data)}, #Dev={len(test_data)}')


# Test run autogluon:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.feature_extraction.text import CountVectorizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

MAX_NGRAM = 300
time_limit = 300
feature_generator = AutoMLPipelineFeatureGenerator(
    vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                               max_features=MAX_NGRAM, dtype=np.uint8))

predictor = TabularPredictor(label=label, path=directory + output_subdir,
                             problem_type=problem_type)
predictor.fit(train_data, time_limit=time_limit,
              feature_generator=feature_generator, hyperparameters={'GBM': {}})
predictor.evaluate(test_data)


# Compute checksum:
from auto_mm_bench.utils import sha1sum
print("Train hash:\n", sha1sum(os.path.join(directory, output_subdir, train_name)))
print("Test hash:\n", sha1sum(os.path.join(directory, output_subdir, test_name)))
Example 8
def inner_test_tabular(testname):

    # Find the named test
    test = None
    for t in tests:
        if t['name'] == testname:
            test = t
    assert test is not None, f"Could not find test {testname}"

    # Build the dataset
    (dftrain, dftest) = make_dataset(request=test, seed=0)

    # Check the synthetic dataset itself hasn't changed. We round it to 3 dp,
    # otherwise tiny floating-point differences between platforms can give a
    # different hash that still yields the same prediction scores.
    # Ultimately it doesn't matter how we do this, as long as the same dataset
    # gives the same hash on different Python versions and architectures.
    current_hash = hashlib.sha256(
        dftrain.round(decimals=3).values.tobytes()).hexdigest()[0:10]
    proposedconfig = "Proposed new config:\n"
    proposedconfig += f"'dataset_hash' : '{current_hash}',"
    assert current_hash == test[
        'dataset_hash'], f"Test '{testname}' input dataset has changed.  All scores will change.\n" + proposedconfig

    # Now run the Predictor 1 or more times with various parameters, and make sure we get
    # back the expected results.

    # Params can either be omitted, or be a single run, or a list of runs.
    if 'params' not in test:
        test['params'] = {'predict': {}, 'fit': {}}
    if not isinstance(test['params'], list):
        test['params'] = [test['params']]
    for params in test['params']:

        # Run this model and set of params
        predictor = TabularPredictor(label='label', **params['predict'])
        predictor.fit(dftrain, **params['fit'])
        leaderboard = predictor.leaderboard(dftest, silent=True)
        leaderboard = leaderboard.sort_values(
            by='model'
        )  # So we can pre-generate sample config in alphabetical order

        # Store the proposed new config based on the current run, in case the developer wants to keep these results (just cut and paste).
        proposedconfig = "Proposed new config:\n"
        proposedconfig += "'expected_score_range' : {\n"
        for model in leaderboard['model']:
            midx_in_leaderboard = leaderboard.index.values[leaderboard['model']
                                                           == model][0]
            if np.isnan(leaderboard['score_test'][midx_in_leaderboard]):
                values = "np.nan, np.nan"
            else:
                if model in test['expected_score_range'] and not np.isnan(
                        test['expected_score_range'][model][1]):
                    currentprecision = test['expected_score_range'][model][1]
                else:
                    currentprecision = 0.01
                values = "{}, {}".format(
                    myfloor(leaderboard['score_test'][midx_in_leaderboard],
                            currentprecision), currentprecision)
            proposedconfig += f"    '{model}': ({values}),\n"
        proposedconfig += "},\n"

        # First validate the model list was as expected.
        assert set(leaderboard['model']) == set(
            test['expected_score_range'].keys()
        ), (f"Test '{testname}' params {params} got unexpected model list.\n" +
            proposedconfig)

        # Now validate the scores for each model were as expected.
        all_assertions_met = True
        currentconfig = "Existing config:\n"
        currentconfig += "'expected_score_range' : {\n"
        for model in sorted(test['expected_score_range']):
            midx_in_leaderboard = leaderboard.index.values[leaderboard['model']
                                                           == model][0]
            assert leaderboard['model'][midx_in_leaderboard] == model
            expectedrange = test['expected_score_range'][model][1]
            expectedmin = test['expected_score_range'][model][0]
            expectedmax = expectedmin + expectedrange

            if np.isnan(expectedmin):
                values = "np.nan, np.nan"
            else:
                values = "{}, {}".format(expectedmin, expectedrange)

            score = leaderboard['score_test'][midx_in_leaderboard]
            if ((expectedmin <= score <= expectedmax)
                    or (np.isnan(score) and np.isnan(expectedmin))):
                currentconfig += f"    '{model}': ({values}),\n"
            else:
                currentconfig += f"    '{model}': ({values}), # <--- not met, got {score}\n"
                all_assertions_met = False
        currentconfig += "},\n"

        assert all_assertions_met, f"Test '{testname}', params {params} had unexpected scores:\n" + currentconfig + proposedconfig

        # Clean up this model created with specific params.
        predictor.delete_models(models_to_keep=[], dry_run=False)
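myfloor is an undefined helper in this snippet; a minimal sketch consistent
with how it is used above (rounding a score down onto a grid of the given
precision; the exact behavior is an assumption):

import math


def myfloor(x: float, precision: float = 0.01) -> float:
    """Round x down to the nearest multiple of precision."""
    return math.floor(x / precision) * precision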
Example 9
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    if get_competition_results:
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] != _TEXT
        ]
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal',
                   'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR,
                                            'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError

    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
Example 10
label_columns = train_dataset.label_columns

train_data = train_dataset.data
test_data = test_dataset.data
concat_df = pd.concat([train_data, test_data])
concat_df.reset_index(drop=True, inplace=True)

competition_df = competition_dataset.data[feature_columns]

if args.model_type == 'base':
    tabular_hparams = get_tabular_hparams(electra_base_late_fusion_concate_e10_avg3())
elif args.model_type == 'large':
    tabular_hparams = get_tabular_hparams(electra_large_late_fusion_concate_e10_avg3())
else:
    raise NotImplementedError

time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
predictor = TabularPredictor(
    path=os.path.join(args.save_dir, args.model_type, time_str),
    problem_type=train_dataset.problem_type,
    eval_metric='log_loss',
    label=label_columns[0])
predictor.fit(concat_df[feature_columns + [label_columns[0]]],
              feature_generator=feature_generator,
              num_bag_folds=5,
              num_stack_levels=1,
              hyperparameters=tabular_hparams)
predictor.save()
predictions = predictor.predict_proba(competition_df, as_pandas=True)
predictions.to_csv(os.path.join(args.save_dir, args.model_type, time_str, 'pred_probabilities.csv'))
Example 11
""" Example script for quantile regression with tabular data, demonstrating simple use-case """
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

# Training time:
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(1000)  # subsample for faster demo
print(train_data.head())

label = 'age'  # which column we want to predict
save_path = 'ag_models/'  # where to save trained models
quantile_levels = [0.1, 0.5, 0.9]  # which quantiles of numeric label-variable we want to predict

predictor = TabularPredictor(label=label, path=save_path, problem_type='quantile', quantile_levels=quantile_levels)
predictor.fit(train_data, calibrate=True, num_bag_folds=5)  # here we fit with 5-fold bagging and calibrate quantile estimates via the conformal method

# Inference time:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
test_data = test_data.head(1000)  # subsample for faster demo
predictor = TabularPredictor.load(save_path)  # unnecessary here; we reload just to demonstrate how to load a previously-trained predictor from file
y_pred = predictor.predict(test_data)
print(y_pred)  # each column contains estimates of a particular quantile-level of the label variable

# Check coverage of prediction intervals (i.e. how often they contain the observed Y value):
num_quantiles = len(quantile_levels)
y_pred = y_pred.to_numpy()
y_target = test_data[label].to_numpy()
for i in range(num_quantiles // 2):
    low_idx = i
    high_idx = num_quantiles - i - 1
    low_quantile = quantile_levels[low_idx]  # which quantile to use for lower end of prediction interval
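    # assumed continuation (the source snippet ends above): empirical coverage
    # of the [low_quantile, high_quantile] interval, assuming y_pred columns
    # are ordered like quantile_levels
    high_quantile = quantile_levels[high_idx]
    pred_coverage = np.mean((y_target >= y_pred[:, low_idx]) &
                            (y_target <= y_pred[:, high_idx]))
    print(f'Interval [{low_quantile}, {high_quantile}]: '
          f'{pred_coverage:.1%} of observed values covered')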
Example 12
    def fit_static(X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        from autogluon.tabular import TabularDataset, TabularPredictor
        from autogluon.tabular.models.knn.knn_rapids_model import KNNRapidsModel
        from autogluon.tabular.models.lr.lr_rapids_model import LinearRapidsModel

        num_classes = kwargs['num_classes']
        if kwargs['verbose'] is True:
            verbosity = 2
        else:
            verbosity = 0
        labels = kwargs['labels']
        num_gpus = kwargs['n_gpus']
        accuracy = kwargs.get('accuracy', 10)
        interpretability = kwargs.get('interpretability', 1)
        is_acceptance = kwargs.get('IS_ACCEPTANCE', False)
        is_backend_tuning = kwargs.get('IS_BACKEND_TUNING', False)

        lb = None
        if num_classes >= 2:
            from sklearn.preprocessing import LabelEncoder
            lb = LabelEncoder()
            lb.fit(labels)
            y = lb.transform(y)

        label = '____TARGET_____'
        import datatable as dt
        y_dt = dt.Frame(y, names=[label])

        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            if num_classes >= 2:
                valid_y = lb.transform(valid_y)
            valid_y_dt = dt.Frame(valid_y, names=[label])

            assert X.shape[1] == valid_X.shape[1], "Bad shape to rbind: %s %s : %s %s" % (
                X.shape, X.names, valid_X.shape, valid_X.names)
            X = dt.rbind([X, valid_X])
            y_dt = dt.rbind([y_dt, valid_y_dt])

        sw = None
        if sample_weight is not None:
            sw = '____SAMPLE_WEIGHT_____'
            sw_dt = dt.Frame(sample_weight, names=[sw])
            if sample_weight_eval_set is not None:
                swes_dt = dt.Frame(sample_weight_eval_set[0], names=[sw])
                sw_dt = dt.rbind([sw_dt, swes_dt])
            X = dt.cbind([X, y_dt, sw_dt])
        else:
            X = dt.cbind([X, y_dt])

        X = X.to_pandas()  # AutoGluon needs pandas, not numpy

        eval_metric = AutoGluonModel.get_eval_metric(**kwargs)
        time_limit = AutoGluonModel.get_time_limit(accuracy)
        presets = AutoGluonModel.get_presets(accuracy, interpretability, is_acceptance, is_backend_tuning)

        model = TabularPredictor(
            label=label,
            sample_weight=sw,
            eval_metric=eval_metric,
            verbosity=verbosity,
            # learner_kwargs={'ignored_columns': ['id']}
        )
        hyperparameters = {
            KNNRapidsModel: {},
            LinearRapidsModel: {},
            'RF': {},
            'XGB': {'ag_args_fit': {'num_gpus': num_gpus}},
            'CAT': {'ag_args_fit': {'num_gpus': num_gpus}},
            'GBM': [{}, {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, 'GBMLarge'],
            'NN': {'ag_args_fit': {'num_gpus': num_gpus}},
            'FASTAI': {'ag_args_fit': {'num_gpus': num_gpus}},
        }
        kwargs_fit = dict(hyperparameters=hyperparameters)
        if accuracy >= 5:
            kwargs_fit.update(dict(presets=presets, time_limit=time_limit))
        model.fit(X, **kwargs_fit)

        print(model.leaderboard(silent=True))

        return model
Example 13
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())

label = 'age'  # specifies which column we want to predict
save_path = 'ag_models/'  # where to save trained models
quantiles_topredict = [
    0.1, 0.5, 0.9
]  # which quantiles of numeric label-variable we want to predict

predictor = TabularPredictor(label=label,
                             path=save_path,
                             problem_type='quantile',
                             quantile_levels=quantiles_topredict)
predictor.fit(
    train_data, time_limit=30
)  # time_limit is optional, you should increase it for real applications

# Inference time:
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
predictor = TabularPredictor.load(
    save_path
)  # unnecessary; we reload just to demonstrate how to load a previously-trained predictor from file
y_pred = predictor.predict(test_data)
print(y_pred)  # each column contains estimates for one target quantile-level

ldr = predictor.leaderboard(
    test_data)  # evaluate performance of every trained model
print(f"Quantile-regression evaluated using metric = {predictor.eval_metric}")
Example 14
train_data, test_data = train_test_split(all_train_data,
                                         test_size=0.2,
                                         stratify=all_train_data[label],
                                         random_state=np.random.RandomState(seed))

train_data.to_csv(os.path.join(directory, output_subdir, train_name), index=False)
test_data.to_csv(os.path.join(directory, output_subdir, test_name), index=False)
print(f'#Train={len(train_data)}, #Dev={len(test_data)}')


# Test run autogluon:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.feature_extraction.text import CountVectorizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

MAX_NGRAM = 300
time_limit = 30

feature_generator = AutoMLPipelineFeatureGenerator(
    vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                               max_features=MAX_NGRAM, dtype=np.uint8))

predictor = TabularPredictor(label=label, path=directory + output_subdir,
                             problem_type=problem_type)
predictor.fit(train_data, time_limit=time_limit,
              feature_generator=feature_generator)
predictor.evaluate(test_data)


# Compute checksum:
from auto_mm_bench.utils import sha1sum
print("Train hash:\n", sha1sum(os.path.join(directory, output_subdir, train_name)))
print("Test hash:\n", sha1sum(os.path.join(directory, output_subdir, test_name)))