Example 1
def test_sample_weight():
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
               'name': 'toyRegression',
               'problem_type': REGRESSION,
               'label': 'y',
               'performance_val': 0.183}
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    sample_weight = 'sample_weights'
    weights = np.abs(np.random.rand(len(train_data),))
    test_weights = np.abs(np.random.rand(len(test_data),))
    train_data[sample_weight] = weights
    test_data_weighted = test_data.copy()
    test_data_weighted[sample_weight] = test_weights
    fit_args = {'time_limit': 20}
    predictor = TabularPredictor(label=dataset['label'], path=savedir, problem_type=dataset['problem_type'], sample_weight=sample_weight).fit(train_data, **fit_args)
    ldr = predictor.leaderboard(test_data)
    perf = predictor.evaluate(test_data)
    # Run again with weight_evaluation:
    predictor = TabularPredictor(label=dataset['label'], path=savedir, problem_type=dataset['problem_type'], sample_weight=sample_weight, weight_evaluation=True).fit(train_data, **fit_args)
    perf = predictor.evaluate(test_data_weighted)
    predictor.distill(time_limit=10)
    ldr = predictor.leaderboard(test_data_weighted)
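With weight_evaluation=True, evaluate() and leaderboard() apply the weights stored in the sample_weights column when scoring. A rough manual equivalent for a single metric might look like the sketch below (r2 is used purely for illustration; the predictor's configured evaluation metric may differ):

from sklearn.metrics import r2_score

y_true = test_data_weighted[dataset['label']]
y_pred = predictor.predict(test_data_weighted)
weighted_r2 = r2_score(y_true, y_pred, sample_weight=test_data_weighted[sample_weight])
print(weighted_r2)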
Example 2
def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label': 'y',
               'performance_val': 0.436}
    # Noisy, imbalanced 2-D toy 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data
    # toyclassif_dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']

    # Additional warning that would have occurred if ValueError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']

    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=dataset['label'], path=savedir).fit(train_data, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data has missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
Example 3
    def train(self,
              train_data,
              eval_metric=EVAL_METRIC,
              quality=QUALITY,
              time_limit=TIME_LIMIT,
              verbosity=VERBOSITY):
        """Train prospective models."""
        # predictor gives us default access to the *best* predictor that
        # was trained on the task (otherwise we're just wrapping AutoGluon)

        # create custom feature generator to force autogluon to use our features
        # as they are
        fg = AutoMLPipelineFeatureGenerator(enable_categorical_features=False,
                                            enable_datetime_features=False,
                                            enable_text_special_features=False,
                                            enable_text_ngram_features=False)
        # create our own feature metadata object since we know the type of every
        # feature we have; skip the label column in the training data when doing so
        fmd = FeatureMetadata(dict.fromkeys(train_data.columns[:-1], 'int'))

        task = TabularPredictor(
            label='label',
            eval_metric=eval_metric,
            path=self.outpath,
            verbosity=verbosity,
        )
        return task.fit(train_data=train_data,
                        time_limit=time_limit,
                        presets=self.QUALITY_PRESETS[quality],
                        feature_generator=fg,
                        feature_metadata=fmd)
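Once fit() returns, one way to sanity-check that the features passed through unmodified is to inspect the predictor's feature metadata. This is only a hedged usage note; 'obj' stands for whatever instance owns the method, and the module-level default constants are assumed to be defined:

predictor = obj.train(train_data)
print(predictor.feature_metadata_in)   # raw dtypes AutoGluon saw on input
print(predictor.feature_metadata)      # dtypes after the pass-through feature generator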
Example 4
def test_quantile():
    quantile_levels = [0.01, 0.02, 0.05, 0.98, 0.99]
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
        'name': 'toyRegression',
        'problem_type': QUANTILE,
        'label': 'y'
    }
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(
        savedir, ignore_errors=True
    )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    fit_args = {'time_limit': 20}
    predictor = TabularPredictor(label=dataset['label'],
                                 path=savedir,
                                 problem_type=dataset['problem_type'],
                                 quantile_levels=quantile_levels).fit(
                                     train_data, **fit_args)
    ldr = predictor.leaderboard(test_data)
    perf = predictor.evaluate(test_data)
Example 5
    def serialize(self, path: Path) -> None:
        # call Predictor.serialize() in order to serialize the class name

        super().serialize(path)

        # serialize self.ag_model
        # move autogluon model to where we want to do the serialization
        ag_path = self.ag_model.path
        shutil.move(ag_path, path)
        ag_path = Path(ag_path)
        print(f"Autogluon files moved from {ag_path} to {path}.")
        # reset the path stored in tabular model.
        AutogluonTabularPredictor.load(path / Path(ag_path.name))
        # serialize all remaining constructor parameters
        with (path / "parameters.json").open("w") as fp:
            parameters = dict(
                batch_size=self.batch_size,
                prediction_length=self.prediction_length,
                freq=self.freq,
                dtype=self.dtype,
                time_features=self.time_features,
                lag_indices=self.lag_indices,
                ag_path=path / Path(ag_path.name),
            )
            print(dump_json(parameters), file=fp)
Example 6
def train_model(df_train: pd.DataFrame,
                df_test: pd.DataFrame,
                label: str,
                verbosity: int = 0,
                random_state: int = 0) -> TabularPredictor:
    """
    Train an autogluon model for df_train, df_test. Specify the label column.
    Optionally, you can set verbosity to control how much output AutoGluon
    produces during training.

    The function caches models that have been trained on the same data by
    computing the hash of df_train and comparing that to existing models.

    Returns the predictor object.

    TODO: Optimize this bad boy for experiments. That would mean k-fold
    cross-validation instead of a train-test split and an AG preset that opts
    for the highest-quality model. Also no time_limit, or a very high one.
    """
    logger = logging.getLogger('pfd')
    d = 'agModels'  # folder to store trained models
    checksum = calculate_model_hash(df_train, label, random_state)
    model_path = f'{d}/{checksum}'
    logger.info(f'Calculated a checksum of {checksum}.')
    try:
        predictor = TabularPredictor.load(model_path)
    except FileNotFoundError:
        logger.info("Didn't find a model to load from the cache.")
        p = TabularPredictor(label=label, path=model_path)
        predictor = p.fit(train_data=df_train,
                          tuning_data=df_test,
                          time_limit=20,
                          verbosity=verbosity,
                          presets='medium_quality_faster_train')
    return predictor
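The cache lookup above keys on a hash of the training data. calculate_model_hash is not shown in the snippet, so the following is only a hedged sketch of what such a helper might look like:

import hashlib

import pandas as pd


def calculate_model_hash(df_train: pd.DataFrame, label: str, random_state: int) -> str:
    """Hypothetical helper: derive a cache key from the training data, label and seed."""
    h = hashlib.md5()
    h.update(pd.util.hash_pandas_object(df_train, index=True).values.tobytes())
    h.update(label.encode())
    h.update(str(random_state).encode())
    return h.hexdigest()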
Example 7
def estimate_importance(dataset, model_name):
    if os.path.exists(
            os.path.join('feature_importance', dataset, model_name,
                         'importance.csv')):
        print(f'Found {dataset}, {model_name}')
        return
    model_remote_path = stat_df.loc[model_name, dataset]
    postfix = '/test_score.json'

    remote_dir_name = model_remote_path[:-len(postfix)]

    def downloadDirectoryFroms3(bucketName, remoteDirectoryName,
                                local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            download_path = os.path.join(local_dir_path, obj.key)
            if not os.path.exists(os.path.dirname(download_path)):
                os.makedirs(os.path.dirname(download_path), exist_ok=True)
            bucket.download_file(obj.key, download_path)

    local_dir_name = os.path.join(download_path, remote_dir_name)
    if os.path.exists(local_dir_name):
        pass
    else:
        downloadDirectoryFroms3('automl-mm-bench', remote_dir_name,
                                download_path)
    test_dataset = dataset_registry.create(dataset, 'test')
    if model_name == MULTIMODAL_TEXT_MODEL_NAME:
        predictor = MultiModalTextModel.load(
            os.path.join(local_dir_name, 'saved_model'))
    elif model_name == TABULAR_MODEL_NAME:
        predictor = TabularPredictor.load(os.path.join(local_dir_name))
    elif model_name == STACK_ENSEMBLE_MODEL_NAME:
        predictor = TabularPredictor.load(os.path.join(local_dir_name))
    else:
        raise NotImplementedError
    sample_size = min(len(test_dataset.data), 1000)
    if model_name == TABULAR_MODEL_NAME:
        importance_df = predictor.feature_importance(
            test_dataset.data[test_dataset.feature_columns +
                              test_dataset.label_columns],
            subsample_size=sample_size)
    else:
        importance_df = compute_permutation_feature_importance(
            test_dataset.data[test_dataset.feature_columns],
            test_dataset.data[test_dataset.label_columns[0]],
            predict_func=predictor.predict,
            eval_metric=get_metric(test_dataset.metric),
            subsample_size=sample_size,
            num_shuffle_sets=3)
    os.makedirs(os.path.join('feature_importance', dataset, model_name),
                exist_ok=True)
    importance_df.to_csv(
        os.path.join('feature_importance', dataset, model_name,
                     'importance.csv'))
    print(importance_df)
Example 8
def test_image_predictor(fit_helper):
    from autogluon.vision import ImageDataset
    train_data, _, test_data = ImageDataset.from_folders('https://autogluon.s3.amazonaws.com/datasets/shopee-iet.zip')
    feature_metadata = FeatureMetadata.from_df(train_data).add_special_types({'image': ['image_path']})
    predictor = TabularPredictor(label='label').fit(
        train_data=train_data,
        hyperparameters={'AG_IMAGE_NN': {'epochs': 2, 'model': 'resnet18_v1b'}},
        feature_metadata=feature_metadata
    )
    leaderboard = predictor.leaderboard(test_data)
    assert len(leaderboard) > 0
Example 9
    class AGLearner(object):
        def __init__(self, path=None):
            self.path = path

        def fit(self, x, y):
            ''' '''
            x = x if len(x.shape) > 1 else x[:, None]
            y = y if len(y.shape) > 1 else y[:, None]
            x_columns = ['x_%d' % i for i in range(x.shape[1])]
            self.x_columns = x_columns
            y_column = 'target'
            columns = x_columns + [y_column]

            train_data = pd.DataFrame(np.concatenate([x, y], axis=1),
                                      columns=columns)
            self._model = TabularPredictor(y_column, problem_type=problem_type, eval_metric=eval_metric,
                                           path=self.path, verbosity=verbosity, sample_weight=sample_weight,
                                           weight_evaluation=weight_evaluation, groups=groups,
                                           **kwargs).fit(train_data, **fit_kwargs)

        def predict(self, x):
            ''' '''
            assert hasattr(self, '_model'), 'The model has not been fitted yet'
            x = x if len(x.shape) > 1 else x[:, None]
            if not hasattr(self, 'x_columns'):
                self.x_columns = ['x_%d' % i for i in range(x.shape[1])]
            assert x.shape[1] == len(
                self.x_columns
            ), 'x has a shape incompatible with training data'
            data = pd.DataFrame(x, columns=self.x_columns)
            y_pred = self._model.predict(data, as_pandas=False)
            return y_pred

        @property
        def feature_importances_(self):
            try:
                importance_df = self._model.feature_importance()
                importances = [
                    importance_df.at[col, 'importance']
                    for col in self.x_columns
                ]
                return importances
            except:
                return []

        def save(self, path):
            self._model.save()

        @classmethod
        def load(cls, path):
            learner = AGLearner(path=path)
            learner._model = TabularPredictor.load(path)
            return learner
Example 10
        def fit(self, x, y):
            ''' '''
            x = x if len(x.shape) > 1 else x[:, None]
            y = y if len(y.shape) > 1 else y[:, None]
            x_columns = ['x_%d' % i for i in range(x.shape[1])]
            self.x_columns = x_columns
            y_column = 'target'
            columns = x_columns + [y_column]

            train_data = pd.DataFrame(np.concatenate([x, y], axis=1),
                                      columns=columns)
            self._model = TabularPredictor(y_column, problem_type=problem_type, eval_metric=eval_metric,
                                           path=self.path, verbosity=verbosity, sample_weight=sample_weight,
                                           weight_evaluation=weight_evaluation, groups=groups,
                                           **kwargs).fit(train_data, **fit_kwargs)
Example 11
def train(args):
    
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir    
    train_dir = args.train_dir
    filename = args.filename
    target = args.target    
    debug = args.debug
    eval_metric = args.eval_metric   
    presets = args.presets    
    
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    time_limit = int(args.training_minutes) * 60
     
    logging.info(train_dir)
    
    train_data = TabularDataset(os.path.join(train_dir, filename))
    if debug:
        subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
        train_data = train_data.sample(n=subsample_size, random_state=0)
        
    predictor = TabularPredictor(label=target, path=model_dir, eval_metric=eval_metric).fit(
        train_data=train_data,
        excluded_model_types=['KNN','RF','NN'],
        time_limit=time_limit, presets=[presets, 'optimize_for_deployment'])

    return predictor
Example 12
def predict_chains(chains: Iterable[List[Method]], sources: Iterable[Method],
                   method_feats: Dict[Method, MethodFeature],
                   proj_feat: ProjectFeature, d2v_model: Doc2Vec,
                   predictor: TabularPredictor) -> List[List[ChainEntry]]:
    df_list: List[pd.DataFrame] = []
    for chain, source in zip(chains, sources):
        if len(chain) == 0:
            continue
        df = chain_to_df(chain=chain,
                         source=source,
                         method_features=method_feats,
                         project_feature=proj_feat,
                         d2v_model=d2v_model)
        df_list.append(df)
    large_df = pd.concat(df_list)
    prob: np.ndarray = predictor.predict_proba(large_df)
    results: List[List[ChainEntry]] = []
    cur = 0  # row cursor of large df
    for chain in chains:
        chain_prob: List[ChainEntry] = []
        for method in chain:
            chain_prob.append(ChainEntry(method, prob[cur]))
            cur += 1
        results.append(chain_prob)
    assert cur == len(large_df)
    return results
Example 13
    def _fit(self, X: List[Config[ModelConfig]],
             y: npt.NDArray[np.float32]) -> None:
        X_numpy = self.config_transformer.fit_transform(X)

        # We need to train one predictor per output feature
        self.predictors = []
        for i in range(y.shape[1]):
            df = pd.DataFrame(np.concatenate([X_numpy, y[:, i:i + 1]],
                                             axis=-1))
            predictor = TabularPredictor(
                df.shape[1] - 1,
                problem_type="regression",
                eval_metric="root_mean_squared_error",
            )
            predictor.fit(df, time_limit=self.time_limit, verbosity=0)
            self.predictors.append(predictor)
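_fit trains one predictor per output column. The matching prediction method is not part of the snippet; assuming config_transformer also exposes transform(), it might look roughly like this:

    def _predict(self, X: List[Config[ModelConfig]]) -> npt.NDArray[np.float32]:
        # Hypothetical counterpart to _fit: one prediction column per fitted predictor
        X_numpy = self.config_transformer.transform(X)
        df = pd.DataFrame(X_numpy)
        columns = [predictor.predict(df).to_numpy() for predictor in self.predictors]
        return np.stack(columns, axis=-1).astype(np.float32)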
Example 14
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.
    :param: model_dir The directory where model files are stored.
    :return: a model (in this case an AutoGluon network)
    """
    net = TabularPredictor.load(model_dir)
    return net
Example 15
def predict(args):
    if args.use_tabular:
        predictor = TabularPredictor.load(args.model_dir)
    else:
        predictor = TextPredictor.load(args.model_dir)
    test_prediction = predictor.predict(args.test_file, as_pandas=True)
    if args.exp_dir is None:
        args.exp_dir = '.'
    test_prediction.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
Example 16
def df_to_ag_style(df: pd.DataFrame) -> TabularPredictor.Dataset:
    """
    Define a standardised way of passing DataFrames to AutoGluon by first
    casting the DataFrame to TabularPredictor.Dataset, then overwriting
    column-names with the string of the column's index.
    """
    ag_df = TabularPredictor.Dataset(df)
    ag_df.columns = [str(i) for i in df.columns]
    return ag_df
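A hedged usage sketch of the helper above (df_train, df_test and the 'label' column are placeholders, not part of the snippet):

train_ag = df_to_ag_style(df_train)   # column names become plain strings, e.g. '0', '1', ..., 'label'
test_ag = df_to_ag_style(df_test)
predictor = TabularPredictor(label='label').fit(train_ag)
preds = predictor.predict(test_ag)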
Example 17
def model_fn(model_dir):
    """Load the AutoGluon model. Called when the hosting service starts.

    :param model_dir: The directory where model files are stored.

    :return: AutoGluon model.
    """
    model = TabularPredictor.load(model_dir)
    globals()["column_names"] = model.feature_metadata_in.get_features()
    return model
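The column_names global set above is typically consumed by a companion request handler, which is not part of the snippet. The transform_fn below is only a hedged sketch following the common SageMaker convention of header-less CSV payloads:

import io

import pandas as pd


def transform_fn(model, request_body, input_content_type, output_content_type):
    # Hypothetical handler: rebuild the feature frame using the names captured in model_fn
    data = pd.read_csv(io.StringIO(request_body), header=None, names=column_names)
    prediction = model.predict(data)
    return prediction.to_csv(index=False, header=False), output_content_type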
Example 18
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.
    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network) and the column info.
    """
    print(f'Loading model from {model_dir} with contents {os.listdir(model_dir)}')

    net = TabularPredictor.load(model_dir, verbosity=True)
    with open(f'{model_dir}/code/columns.pkl', 'rb') as f:
        column_dict = pickle.load(f)
    return net, column_dict
Example 19
def train(args):
    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating submission file
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))
    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)
    label_column = 'Sold Price'
    eval_metric = 'r2'

    automm_hyperparameters = get_automm_hyperparameters(args.automm_mode, args.text_backbone, args.cat_as_text)

    tabular_hyperparameters = {
        'GBM': [
            {},
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        ],
        'CAT': {},
        'AG_AUTOMM': automm_hyperparameters,
    }
    if args.mode == 'single':
        predictor = MultiModalPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df, hyperparameters=automm_hyperparameters, seed=args.seed)
    elif args.mode == 'weighted' or args.mode == 'stack5' or args.mode == 'single_bag5' or args.mode == 'single_bag4':
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)

        if args.mode == 'single_bag5':
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == 'weighted':
            num_bag_folds, num_stack_levels = None, None
        elif args.mode == 'stack5':
            num_bag_folds, num_stack_levels = 5, 1
        else:
            raise NotImplementedError
        predictor.fit(train_df,
                      hyperparameters=tabular_hyperparameters,
                      num_bag_folds=num_bag_folds,
                      num_stack_levels=num_stack_levels)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    else:
        raise NotImplementedError
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=None)
Example 20
    def fit_model(self, df_train, lab_train):
        if self.learner is not None:
            grid_search = GridSearchCV(self.pipeline,
                                       self.param_grid,
                                       scoring='roc_auc',
                                       cv=5,
                                       verbose=1,
                                       n_jobs=-1)
            model = grid_search.fit(df_train, lab_train)
        else:
            df_train["class"] = lab_train
            model = TabularPredictor(label="class").fit(df_train)

        return model
Example 21
def run_feature_permutation(predictor: TabularPredictor,
                            df_train: pd.DataFrame,
                            model_name: Union[str, None],
                            **kwargs) -> pd.DataFrame:
    """
    Use feature permutation to derive feature importances from an AutoGluon
    model. The AG documentation refers to this website to explain feature
    permutation: https://explained.ai/rf-importance/
    """
    df_importance = predictor.feature_importance(df_train,
                                                 model=model_name,
                                                 num_shuffle_sets=kwargs['num_shuffle_sets'],
                                                 subsample_size=kwargs['subsample_size'])
                                                 #**kwargs)
    return df_importance
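A hedged call example (values are illustrative; the kwargs must include both num_shuffle_sets and subsample_size because the function indexes them directly, and model_name=None lets AutoGluon use its best model):

importances = run_feature_permutation(predictor, df_train, model_name=None,
                                      num_shuffle_sets=5, subsample_size=1000)
print(importances.head())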
Example 22
def predict_chain(chain: List[Method], source: Method,
                  method_features: Dict[Method, MethodFeature],
                  project_feature: ProjectFeature, d2v_model: Doc2Vec,
                  predictor: TabularPredictor) -> List[ChainEntry]:
    if len(chain) == 0:
        return []
    df = chain_to_df(chain=chain,
                     source=source,
                     method_features=method_features,
                     project_feature=project_feature,
                     d2v_model=d2v_model)
    probabilities: np.ndarray = predictor.predict_proba(df)
    result = [
        ChainEntry(method, probabilities[i]) for i, method in enumerate(chain)
    ]
    result.append(ChainEntry(None, 0.5))
    return result
Example 23
    def deserialize(
        cls,
        path: Path,
        # TODO this is temporary, we should make the callable object serializable in the first place
        scaling: Callable[[pd.Series], Tuple[pd.Series,
                                             float]] = mean_abs_scaling,
        **kwargs,
    ) -> "Predictor":
        # deserialize constructor parameters
        with (path / "parameters.json").open("r") as fp:
            parameters = load_json(fp.read())
        loaded_ag_path = parameters["ag_path"]
        del parameters["ag_path"]
        # load tabular model
        ag_model = AutogluonTabularPredictor.load(loaded_ag_path)

        return TabularPredictor(ag_model=ag_model,
                                scaling=scaling,
                                **parameters)
Example 24
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    #   train_data = task.Dataset(file_path=training_dir + '/' + filename)
    train_data = TabularDataset(data=training_dir + '/' + filename)

    #   predictor = task.fit(train_data = train_data, label=target, output_directory=model_dir)
    predictor = TabularPredictor(label=target, path=model_dir).fit(train_data)

    return predictor
Example 25
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                      'name': 'AdultIncomeBinaryClassification',
                      'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=label, path=savedir).fit(train_data)
    leaderboard = predictor.leaderboard(data=test_data)
    extra_metrics = ['accuracy', 'roc_auc', 'log_loss']
    leaderboard_extra = predictor.leaderboard(data=test_data, extra_info=True, extra_metrics=extra_metrics)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    assert set(leaderboard_extra.columns).issuperset(set(extra_metrics))  # Assert that extra_metrics are present in output
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(data=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.index) == original_features
    assert set(feature_importances.columns) == {'importance', 'stddev', 'p_value', 'n', 'p99_high', 'p99_low'}
    predictor.transform_features()
    predictor.transform_features(data=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted

    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted

    # Raise exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])

    assert predictor.get_model_names_persisted() == []

    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []

    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: Intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(predictor.path)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(data=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert(predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(data=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(data=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that dry_run=False actually deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(data=test_data)
    except:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
Example 26
def test_pseudolabeling():
    datasets = get_benchmark_sets()
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    directory_prefix = './datasets/'
    hyperparam_setting = {
        'GBM': {'num_boost_round': 10},
        'XGB': {'n_estimators': 10},
    }

    fit_args = dict(
        hyperparameters=hyperparam_setting,
        time_limit=20,
    )

    fit_args_best = dict(
        presets='best_quality',
        num_bag_folds=2,
        num_bag_sets=1,
        ag_args_ensemble=dict(fold_fitting_strategy='sequential_local'),
    )
    for idx in range(len(datasets)):
        dataset = datasets[idx]
        label = dataset['label']
        problem_type = dataset['problem_type']
        name = dataset['name']
        train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file,
                                          name=dataset['name'], url=dataset['url'])

        print(f"Testing dataset with name: {name}, problem type: {problem_type}")

        train_data = train_data.sample(50, random_state=1)
        test_data = test_data[test_data[label].notna()]

        if problem_type in PROBLEM_TYPES_CLASSIFICATION:
            valid_class_idxes = test_data[label].isin(train_data[label].unique())
            test_data = test_data[valid_class_idxes]

        test_data = test_data.sample(50, random_state=1)

        error_msg_og = f'pseudolabel threw an exception during fit; it should have ' \
                       f'succeeded on problem_type: {problem_type} with dataset name: {name}. ' \
                       'Under settings: '

        # Test with pseudo data that is already labeled. If labels are already given, the pseudo-labeling filter is not used.
        try:
            print("Pseudolabel Testing: Pre-labeled data 'fit_pseudolabel'")
            _, y_pred_proba = TabularPredictor(label=label, problem_type=problem_type).fit_pseudolabel(
                pseudo_data=test_data,
                return_pred_prob=True,
                train_data=train_data,
                **fit_args,
            )
        except Exception as e:
            assert False, error_msg_og + 'labeled test data'

        try:
            print("Pseudolabel Testing: Pre-labeled data, best quality 'fit_pseudolabel'")
            _, y_pred_proba = TabularPredictor(label=label, problem_type=problem_type).fit_pseudolabel(
                pseudo_data=test_data,
                return_pred_prob=True,
                train_data=train_data,
                **fit_args_best,
                **fit_args,
            )
        except Exception as e:
            assert False, error_msg_og + 'labeled test data, best quality'

        # Test unlabeled pseudo data
        unlabeled_test_data = test_data.drop(columns=label)
        for flag_ensemble in [True, False]:
            error_prefix = 'ensemble ' if flag_ensemble else ''
            error_msg = error_prefix + error_msg_og
            for is_weighted_ensemble in [True, False]:
                error_suffix = ' with pseudo label model weighted ensembling' if is_weighted_ensemble else ''

                try:
                    print("Pseudolabel Testing: Unlabeled data 'fit_pseudolabel'")
                    _, y_pred_proba = TabularPredictor(label=label, problem_type=problem_type).fit_pseudolabel(
                        pseudo_data=unlabeled_test_data,
                        return_pred_prob=True,
                        train_data=train_data,
                        use_ensemble=flag_ensemble,
                        fit_ensemble=is_weighted_ensemble,
                        **fit_args,
                    )
                except Exception as e:
                    assert False, error_msg + 'unlabeled test data' + error_suffix

                try:
                    print("Pseudolabel Testing: Unlabeled data, best quality 'fit_pseudolabel'")
                    _, y_pred_proba = TabularPredictor(label=label, problem_type=problem_type).fit_pseudolabel(
                        pseudo_data=unlabeled_test_data,
                        return_pred_prob=True,
                        train_data=train_data,
                        use_ensemble=flag_ensemble,
                        fit_ensemble=is_weighted_ensemble,
                        **fit_args_best,
                        **fit_args,
                    )
                except Exception as e:
                    assert False, error_msg + 'unlabeled test data, best quality' + error_suffix
Example 27
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False, crash_in_oof=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = get_benchmark_sets()
    if dataset_indices is not None: # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets) # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label = dataset['label']
            y_test = test_data[label]
            test_data = test_data.drop(labels=[label], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                if subsample_size < len(train_data):
                    # .sample instead of .head to increase diversity and test cases where data index is not monotonically increasing.
                    train_data = train_data.sample(n=subsample_size, random_state=seed_val)  # subsample for fast_benchmark
            predictor = TabularPredictor(label=label, path=savedir).fit(train_data, **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = TabularPredictor.load(savedir)  # Test loading previously-trained predictor from file
            y_pred_empty = predictor.predict(test_data[0:0])
            assert len(y_pred_empty) == 0
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." %
                              (dataset['name'], performance_vals[idx]/(EPS+dataset['performance_val'])))
            if predictor._trainer.bagged_mode and not crash_in_oof:
                # TODO: Test index alignment with original training data (first handle duplicated rows / dropped rows edge cases)
                y_pred_oof = predictor.get_oof_pred()
                y_pred_proba_oof = predictor.get_oof_pred_proba(as_multiclass=False)
                y_pred_oof_transformed = predictor.get_oof_pred(transformed=True)
                y_pred_proba_oof_transformed = predictor.get_oof_pred_proba(as_multiclass=False, transformed=True)

                # Assert expected type output
                assert isinstance(y_pred_oof, pd.Series)
                assert isinstance(y_pred_oof_transformed, pd.Series)
                if predictor.problem_type == MULTICLASS:
                    assert isinstance(y_pred_proba_oof, pd.DataFrame)
                    assert isinstance(y_pred_proba_oof_transformed, pd.DataFrame)
                else:
                    if predictor.problem_type == BINARY:
                        assert isinstance(predictor.get_oof_pred_proba(), pd.DataFrame)
                    assert isinstance(y_pred_proba_oof, pd.Series)
                    assert isinstance(y_pred_proba_oof_transformed, pd.Series)

                assert y_pred_oof_transformed.equals(predictor.transform_labels(y_pred_oof, proba=False))

                # Test that the transform_labels method is capable of reproducing the same output when converting back and forth, and test that oof 'transform' parameter works properly.
                y_pred_proba_oof_inverse = predictor.transform_labels(y_pred_proba_oof, proba=True)
                y_pred_proba_oof_inverse_inverse = predictor.transform_labels(y_pred_proba_oof_inverse, proba=True, inverse=True)
                y_pred_oof_inverse = predictor.transform_labels(y_pred_oof)
                y_pred_oof_inverse_inverse = predictor.transform_labels(y_pred_oof_inverse, inverse=True)

                if isinstance(y_pred_proba_oof_transformed, pd.DataFrame):
                    pd.testing.assert_frame_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_frame_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                else:
                    pd.testing.assert_series_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_series_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                pd.testing.assert_series_equal(y_pred_oof_transformed, y_pred_oof_inverse)
                pd.testing.assert_series_equal(y_pred_oof, y_pred_oof_inverse_inverse)

                # Test that index of both the internal training data and the oof outputs are consistent in their index values.
                X_internal, y_internal = predictor.load_data_internal()
                y_internal_index = list(y_internal.index)
                assert list(X_internal.index) == y_internal_index
                assert list(y_pred_oof.index) == y_internal_index
                assert list(y_pred_proba_oof.index) == y_internal_index
                assert list(y_pred_oof_transformed.index) == y_internal_index
                assert list(y_pred_proba_oof_transformed.index) == y_internal_index
            else:
                # Raise exception
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred()
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred_proba()
            if run_distill:
                predictor.distill(time_limit=60, augment_args={'size_factor':0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
Example 28
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(args.train_file,
                                                                              args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(args.train_file,
                                                                        args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(args.train_file, args.test_file)
    else:
        raise NotImplementedError

    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # When no embedding is used,
        # we will just use TextPredictor that will train a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      presets=args.preset,
                      seed=args.seed)
    else:
        raise NotImplementedError
    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    else:
        raise NotImplementedError
Example 29
def model_fn(model_dir):
    """loads model from previously saved artifact"""
    model = TabularPredictor.load(model_dir)
    globals()["column_names"] = model.feature_metadata_in.get_features()

    return model
Example 30
def train(args):
    model_output_dir = f'{args.output_dir}/data'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models

    args.init_args['path'] = args.model_dir
    #args.fit_args.pop('label', None)
    predictor = TabularPredictor(**args.init_args).fit(train_data,
                                                       **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    #model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_src = os.path.join(args.model_dir,
                                           'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir,
                                           'SummaryOfModels.html')

    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'),
           format='png',
           prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv',
                               index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)

                print(feature_importance_df)
                feature_importance_df.to_csv(
                    f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test,
                                                           as_multiclass=True)

                report_dict = classification_report(
                    y_test_true,
                    y_test_pred,
                    output_dict=True,
                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(
                    f'{model_output_dir}/classification_report.csv',
                    index=True)

                cm = confusion_matrix(y_test_true,
                                      y_test_pred,
                                      labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels,
                                     predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')  # save before show so the figure is not blank
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob,
                            predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")