Example #1
0
def test_trivialaugment():
    """Smoke-test fitting with mixup plus TrivialAugment on text and image."""
    dataset = ALL_DATASETS["petfinder"]()

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=dataset.metric,
    )
    config = {
        MODEL: "fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }
    # Enable mixup and trivial-augment for both the text and image towers.
    hyperparameters = {
        "optimization.max_epochs": 1,
        "optimization.top_k_average_method": BEST,
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
        "data.categorical.convert_to_text": False,
        "data.numerical.convert_to_text": False,
        "data.mixup.turn_on": True,
        "model.hf_text.text_trivial_aug_maxscale": 0.1,
        "model.hf_text.text_aug_detect_length": 10,
        "model.timm_image.train_transform_types": [
            "resize_shorter_side", "center_crop", "trivial_augment"
        ],
    }

    with tempfile.TemporaryDirectory() as save_path:
        predictor.fit(
            train_data=dataset.train_df,
            config=config,
            time_limit=30,
            save_path=save_path,
            hyperparameters=hyperparameters,
        )

        predictor.evaluate(dataset.test_df)
        verify_predictor_save_load(predictor, dataset.test_df)
Example #2
0
def test_hpo(searcher, scheduler):
    """Run a two-trial HPO fit, then verify continuous training still works."""
    dataset = PetFinderDataset()

    hyperparameters = {
        "optimization.learning_rate": tune.uniform(0.0001, 0.01),
        "optimization.max_epochs": 1,
        "model.names": ["numerical_mlp", "categorical_mlp", "fusion_mlp"],
        "data.categorical.convert_to_text": False,
        "data.numerical.convert_to_text": False,
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }
    hpo_kwargs = {
        "searcher": searcher,
        "scheduler": scheduler,
        "num_trials": 2,
    }

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=dataset.metric,
    )

    # Start each searcher/scheduler combination from a clean directory.
    save_path = os.path.join(get_home_dir(), "hpo", f"_{searcher}",
                             f"_{scheduler}")
    if os.path.exists(save_path):
        shutil.rmtree(save_path)

    predictor = predictor.fit(
        train_data=dataset.train_df,
        hyperparameters=hyperparameters,
        time_limit=60,
        save_path=save_path,
        hyperparameter_tune_kwargs=hpo_kwargs,
    )

    predictor.evaluate(dataset.test_df)
    verify_predictor_save_load(predictor, dataset.test_df)

    # Continuous training: calling fit again on the tuned predictor must work.
    predictor = predictor.fit(
        train_data=dataset.train_df,
        hyperparameters=hyperparameters,
        time_limit=60,
        hyperparameter_tune_kwargs=hpo_kwargs,
    )
Example #3
0
def train(args):
    """Train a predictor per ``args.mode`` and write a submission CSV.

    Reads ``train.csv``/``test.csv``/``sample_submission.csv`` from
    ``args.data_path``, preprocesses them, fits either a lone
    MultiModalPredictor (``single``) or a TabularPredictor ensemble
    (``weighted`` / ``stack5`` / ``single_bag5``), then writes
    ``submission.csv`` (and ``leaderboard.csv`` for ensembles) to
    ``args.exp_path``. ``np.exp`` is applied to predictions — the label
    appears to be log-transformed in ``preprocess``; confirm if changed.
    """
    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating submission file
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))
    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)
    label_column = 'Sold Price'
    eval_metric = 'r2'

    automm_hyperparameters = get_automm_hyperparameters(args.automm_mode, args.text_backbone, args.cat_as_text)

    tabular_hyperparameters = {
        'GBM': [
            {},
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        ],
        'CAT': {},
        'AG_AUTOMM': automm_hyperparameters,
    }
    if args.mode == 'single':
        predictor = MultiModalPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df, hyperparameters=automm_hyperparameters, seed=args.seed)
    elif args.mode in ('weighted', 'stack5', 'single_bag5'):
        # BUGFIX: 'single_bag4' used to be accepted here but always hit the
        # inner NotImplementedError; it is now rejected up-front in the final
        # else, raising the same exception type.
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)

        if args.mode == 'single_bag5':
            # Bag the AutoMM model alone, no stacking.
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == 'weighted':
            num_bag_folds, num_stack_levels = None, None
        else:  # 'stack5'
            num_bag_folds, num_stack_levels = 5, 1
        predictor.fit(train_df,
                      hyperparameters=tabular_hyperparameters,
                      num_bag_folds=num_bag_folds,
                      num_stack_levels=num_stack_levels)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    else:
        raise NotImplementedError
    # Invert the (apparent) log transform on the label before submitting.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    # index=False (was index=None, which pandas treats the same but is
    # non-idiomatic): keep the submission file free of a row-index column.
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=False)
Example #4
0
def test_modifying_duplicate_model_names():
    """modify_duplicate_model_names must remove teacher/student name clashes."""
    dataset = ALL_DATASETS["petfinder"]()

    def make_predictor():
        # Teacher and student deliberately share an identical setup so their
        # model names collide before the rename.
        return MultiModalPredictor(
            label=dataset.label_columns[0],
            problem_type=dataset.problem_type,
            eval_metric=dataset.metric,
        )

    config = {
        MODEL: "fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }

    teacher_predictor = make_predictor()
    teacher_predictor.fit(
        train_data=dataset.train_df,
        config=config,
        time_limit=1,
    )
    student_predictor = make_predictor()
    student_predictor.fit(
        train_data=dataset.train_df,
        config=config,
        time_limit=0,
    )

    teacher_predictor = modify_duplicate_model_names(
        predictor=teacher_predictor,
        postfix="teacher",
        blacklist=student_predictor._config.model.names,
    )

    teacher_names = teacher_predictor._config.model.names
    student_names = student_predictor._config.model.names

    # No overlap may remain between teacher and student model names.
    assert not any(n in teacher_names for n in student_names), (
        f"teacher model names {teacher_names} and"
        f" student model names {student_names} have duplicates."
    )

    # Every model prefix must still be registered in the renamed config.
    assert teacher_predictor._model.prefix in teacher_names
    if isinstance(teacher_predictor._model.model, nn.ModuleList):
        for per_model in teacher_predictor._model.model:
            assert per_model.prefix in teacher_names

    # Every data processor's prefix must still be registered as well.
    for per_modality_processors in teacher_predictor._data_processors.values():
        for per_processor in per_modality_processors:
            assert per_processor.prefix in teacher_names
Example #5
0
 def load(cls, path: str, reset_paths=True, verbose=True):
     # NOTE(review): takes `cls` — presumably decorated with @classmethod just
     # above this view; confirm against the full file.
     """Load the wrapper model from disk, restoring the inner predictor.

     Delegates to the parent loader, then — if the saved `_load_model` flag
     is truthy — loads the nested MultiModalPredictor from its subdirectory.

     Parameters
     ----------
     path : str
         Directory the model was previously saved to.
     reset_paths : bool, default True
         Forwarded to the parent loader.
     verbose : bool, default True
         Forwarded to the parent loader.
     """
     model = super().load(path=path,
                          reset_paths=reset_paths,
                          verbose=verbose)
     if model._load_model:
         try_import_autogluon_text()
         from autogluon.multimodal import MultiModalPredictor
         # The inner predictor lives in its own subdirectory under `path`.
         model.model = MultiModalPredictor.load(
             os.path.join(path, cls._NN_MODEL_NAME))
     # Clear the flag so a subsequent load does not re-load the inner model.
     model._load_model = None
     return model
Example #6
0
    def load(
        path: str,
        verbosity: int = None,
        backend: str = PYTORCH,
        resume: bool = False,
    ):
        """Load a TextPredictor previously produced by ``fit()``.

        It is highly recommended to load with the exact AutoGluon version
        the predictor was fit with.

        Parameters
        ----------
        path : str
            The path to directory in which this Predictor was previously saved.
        verbosity : int, default = None
            Verbosity level (0 least verbose .. 4 most verbose) applied after
            loading; ``None`` leaves the existing level untouched. Only
            forwarded to the MXNet backend's loader.
        backend : pytorch / mxnet
            Which backend implementation to load.
        resume : bool
            Whether to resume training from a saved checkpoint (PyTorch only).
        """
        if backend == PYTORCH:
            inner = MultiModalPredictor.load(
                path=path,
                resume=resume,
            )
        elif backend == MXNET:
            from .mx_predictor import MXTextPredictor
            inner = MXTextPredictor.load(
                path=path,
                verbosity=verbosity,
            )
        else:
            raise ValueError(f"Unknown backend: {backend}")

        # Wrap the backend predictor in a fresh TextPredictor facade.
        predictor = TextPredictor(label=inner.label)
        predictor._backend = backend
        predictor._predictor = inner
        return predictor
Example #7
0
def test_distillation():
    """Distill a student from a teacher, passed as an object and as a path."""
    dataset = PetFinderDataset()

    hyperparameters = {
        "optimization.max_epochs": 1,
        "model.names": ["hf_text", "timm_image", "fusion_mlp"],
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "model.timm_image.checkpoint_name": "swin_tiny_patch4_window7_224",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }

    def fresh_predictor():
        return MultiModalPredictor(
            label=dataset.label_columns[0],
            problem_type=dataset.problem_type,
            eval_metric=dataset.metric,
        )

    def clean_save_dir(name):
        # Remove any leftovers from a previous run of this test.
        target = os.path.join(get_home_dir(), "petfinder", name)
        if os.path.exists(target):
            shutil.rmtree(target)
        return target

    teacher_predictor = fresh_predictor()
    teacher_save_path = clean_save_dir("teacher")
    teacher_predictor = teacher_predictor.fit(
        train_data=dataset.train_df,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=teacher_save_path,
    )

    # Distillation with the in-memory teacher predictor.
    predictor = fresh_predictor()
    student_save_path = clean_save_dir("student")
    predictor = predictor.fit(
        train_data=dataset.train_df,
        teacher_predictor=teacher_predictor,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, dataset.test_df)

    # Distillation again, this time passing the teacher by its saved path.
    predictor = fresh_predictor()
    student_save_path = clean_save_dir("student")
    predictor = predictor.fit(
        train_data=dataset.train_df,
        teacher_predictor=teacher_predictor.path,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=student_save_path,
    )
    verify_predictor_save_load(predictor, dataset.test_df)
Example #8
0
    data_path = args.data_path  # The path of the training and testing data.
    save_path = args.save_path  # The path of saving the model.
    save_standalone_path = save_path + "_standalone"  # The path of saving the standalone model which includes downloaded model.

    N_FOLDS = args.folds  # The number of folds.

    all_score = []  # The result of folds.
    train_df, train_df_fold, _ = load_data(data_path)

    for i in range(N_FOLDS):
        # The predictor in use.
        predictor = MultiModalPredictor(
            label=args.label_column,  # label indicates the target value
            problem_type=args.
            problem_type,  # problem_type indicates the type of the problem. It can choose "multiclass", # "binary" or "regression"
            eval_metric=args.
            eval_metric,  # eval_metric indicates the evaluation index of the model
            path=save_path,
            verbosity=4,  # verbosity controls how much information is printed.
        )

        # Training process.
        training_df = train_df[train_df_fold != i]
        valid_df = train_df[train_df_fold == i]
        predictor.fit(
            train_data=training_df,
            tuning_data=valid_df,
            save_path=save_path + f"_fold{i}",
            hyperparameters={
                "model.names": args.model_names,
                "model.timm_image.checkpoint_name":
Example #9
0
def main(args):
    """Train and evaluate on a tabular dataset according to ``args.mode``.

    ``single`` fits a lone MultiModalPredictor; ``weighted`` /
    ``single_bag5`` / ``stack5`` fit a TabularPredictor ensemble that
    includes AutoMM. Scores and predictions are written under
    ``args.exp_dir``.
    """
    if args.gpu_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    # Validate with an explicit raise rather than `assert`, which is
    # stripped when Python runs with -O.
    if args.dataset_name not in TABULAR_DATASETS:
        raise ValueError("Unsupported dataset name.")

    ### Dataset loading
    train_data = TABULAR_DATASETS[args.dataset_name]("train", args.dataset_dir)
    val_data = TABULAR_DATASETS[args.dataset_name]("val", args.dataset_dir)
    test_data = TABULAR_DATASETS[args.dataset_name]("test", args.dataset_dir)

    automm_hyperparameters["optimization.learning_rate"] = args.lr
    automm_hyperparameters["optimization.end_lr"] = args.end_lr

    if args.embedding_arch is not None:
        automm_hyperparameters[
            "model.numerical_transformer.embedding_arch"] = args.embedding_arch

    tabular_hyperparameters = {
        "GBM": [
            {},
            {
                "extra_trees": True,
                "ag_args": {
                    "name_suffix": "XT"
                }
            },
        ],
        "CAT": {},
        "XGB": {},
        "AG_AUTOMM": automm_hyperparameters,
    }

    if args.mode == "single":
        ### model initialization
        predictor = MultiModalPredictor(
            label=train_data.label_column,
            problem_type=train_data.problem_type,
            eval_metric=train_data.metric,
            path=args.exp_dir,
            verbosity=4,
        )

        ### model training
        predictor.fit(
            train_data=train_data.data,
            tuning_data=val_data.data,
            seed=args.seed,
            hyperparameters=automm_hyperparameters,
        )

        ### model inference on the dataset's own metric
        scores = predictor.evaluate(data=test_data.data,
                                    metrics=[test_data.metric])
        with open(os.path.join(args.exp_dir, "scores.json"), "w") as f:
            json.dump(scores, f)
        print(scores)
    elif args.mode in ("weighted", "single_bag5", "stack5"):
        if args.mode == "single_bag5":
            # Bag the AutoMM model alone, no stacking.
            tabular_hyperparameters = {
                "AG_AUTOMM": automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == "weighted":
            num_bag_folds, num_stack_levels = None, None
        else:  # "stack5"
            num_bag_folds, num_stack_levels = 5, 1
        from autogluon.tabular import TabularPredictor

        predictor = TabularPredictor(eval_metric=train_data.metric,
                                     label=train_data.label_column,
                                     path=args.exp_dir)
        predictor.fit(
            train_data=train_data.data,
            # Bagging builds its own folds; only pass tuning data otherwise.
            tuning_data=val_data.data if num_bag_folds is None else None,
            hyperparameters=tabular_hyperparameters,
            num_bag_folds=num_bag_folds,
            num_stack_levels=num_stack_levels,
        )
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_dir, "leaderboard.csv"))
    else:
        raise NotImplementedError
    # NOTE(review): this evaluate runs for every mode and, in "single" mode,
    # overwrites the scores.json written above with default-metric scores —
    # preserved as-is since downstream tooling may rely on it.
    scores = predictor.evaluate(data=test_data.data)
    with open(os.path.join(args.exp_dir, "scores.json"), "w") as f:
        json.dump(scores, f)
    print(scores)

    predictions = predictor.predict(data=test_data.data)
    predictions.to_csv(os.path.join(args.exp_dir, "predictions.csv"))
Example #10
0
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function: trains the inner MultiModalPredictor.

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        sample_weight
            The weights of the samples (currently ignored with a log message)
        kwargs
            Other keyword arguments; `verbosity` and `num_gpus` are read here

        """
        try_import_autogluon_text()
        from autogluon.multimodal import MultiModalPredictor

        # Decide name of the label column: use 'label' unless a feature
        # already has that name, in which case pick the first free
        # 'label0', 'label1', ... so the appended labels never collide.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for MultiModalPredictorModel, "
                "this model will ignore them in training.")

        # MultiModalPredictor expects labels inside the training frame, so
        # append them as the last column of each split.
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        # The inner predictor logs one level quieter than this wrapper;
        # remember the root log level so fit() side effects can be undone.
        verbosity_text = max(0, verbosity - 1)
        root_logger = logging.getLogger('autogluon')
        root_log_level = root_logger.level
        self.model = MultiModalPredictor(label=self._label_column_name,
                                         problem_type=self.problem_type,
                                         path=self.path,
                                         eval_metric=self.eval_metric,
                                         verbosity=verbosity_text)
        params = self._get_model_params()

        if num_gpus is not None:
            params['env.num_gpus'] = num_gpus
        # 'presets' and 'seed' are fit() arguments, not hyperparameters —
        # pop them out before forwarding the remainder.
        presets = params.pop('presets', None)
        seed = params.pop('seed', 0)

        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       presets=presets,
                       hyperparameters=params,
                       seed=seed)
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level
Example #11
0
def test_standalone():
    """standalone=True saves must load and predict with no network access."""
    from unittest import mock
    import torch

    # Any HTTP request made while this patch is active raises immediately,
    # emulating a machine without internet access.
    requests_gag = mock.patch(
        'requests.Session.request',
        mock.Mock(side_effect=RuntimeError(
            'Please use the `responses` library to mock HTTP in your tests.')))

    dataset = PetFinderDataset()

    config = {
        MODEL: "fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }

    hyperparameters = {
        "optimization.max_epochs": 1,
        "model.names": [
            "numerical_mlp", "categorical_mlp", "timm_image", "hf_text",
            "clip", "fusion_mlp"
        ],
        "model.hf_text.checkpoint_name": "prajjwal1/bert-tiny",
        "model.timm_image.checkpoint_name": "swin_tiny_patch4_window7_224",
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    }

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=dataset.metric,
    )

    save_path = os.path.join(get_home_dir(), "standalone", "false")
    if os.path.exists(save_path):
        shutil.rmtree(save_path)

    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=save_path,
    )

    save_path_standalone = os.path.join(get_home_dir(), "standalone", "true")
    predictor.save(
        path=save_path_standalone,
        standalone=True,
    )

    del predictor
    torch.cuda.empty_cache()

    # Online load: may still fetch checkpoints from the internet.
    loaded_online_predictor = MultiModalPredictor.load(path=save_path)
    online_predictions = loaded_online_predictor.predict(
        dataset.test_df, as_pandas=False)
    del loaded_online_predictor

    # Offline load: must succeed with networking blocked.
    with requests_gag:
        # No internet connection here; anything needing one raises RuntimeError.
        with tempfile.TemporaryDirectory() as tmpdirname:
            torch.hub.set_dir(tmpdirname)  # block reading files in `.cache`
            loaded_offline_predictor = MultiModalPredictor.load(
                path=save_path_standalone)

    offline_predictions = loaded_offline_predictor.predict(
        dataset.test_df, as_pandas=False)
    del loaded_offline_predictor

    # Standalone and regular saves must produce identical predictions.
    npt.assert_equal(online_predictions, offline_predictions)
Example #12
0
def test_textagumentor_deepcopy():
    """The text augmenter must survive deepcopy and pickle round-trips."""
    dataset = ALL_DATASETS["ae"]()

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=dataset.metric,
    )
    config = {
        MODEL: "fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }
    # Text augmentation is enabled so the augmenter object gets created
    # inside the data preprocessor.
    hyperparameters = {
        "optimization.max_epochs": 1,
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
        "data.categorical.convert_to_text": False,
        "data.numerical.convert_to_text": False,
        "model.hf_text.text_trivial_aug_maxscale": 0.05,
        "model.hf_text.text_train_augment_types": [
            "synonym_replacement(0.05)", "random_swap(0.05)"
        ],
        "optimization.top_k_average_method": "uniform_soup",
    }

    with tempfile.TemporaryDirectory() as save_path:
        predictor.fit(
            train_data=dataset.train_df,
            config=config,
            time_limit=10,
            save_path=save_path,
            hyperparameters=hyperparameters,
        )

    # Refit with a deep-copied preprocessor.
    predictor._df_preprocessor = copy.deepcopy(predictor._df_preprocessor)
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=10,
    )

    # Refit with a preprocessor round-tripped through pickle.
    predictor._df_preprocessor = pickle.loads(
        pickle.dumps(predictor._df_preprocessor))
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=10,
    )
Example #13
0
def test_model_configs():
    """Fit with a fully spelled-out per-model config dict for MODEL.

    Fixes a config-key typo: 'test_train_augment_types' →
    'text_train_augment_types' (the spelling used elsewhere in this file,
    e.g. "model.hf_text.text_train_augment_types"), in both the hf_text
    and clip sub-configs.
    """
    dataset = ALL_DATASETS["petfinder"]()
    metric_name = dataset.metric

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=metric_name,
    )

    model_config = {
        'model': {
            'names': [
                'hf_text', 'timm_image', 'clip', 'categorical_transformer',
                'numerical_transformer', 'fusion_transformer'
            ],
            'categorical_transformer': {
                'out_features': 192,
                'd_token': 192,
                'num_trans_blocks': 0,
                'num_attn_heads': 4,
                'residual_dropout': 0.0,
                'attention_dropout': 0.2,
                'ffn_dropout': 0.1,
                'normalization': 'layer_norm',
                'ffn_activation': 'reglu',
                'head_activation': 'relu',
                'data_types': ['categorical']
            },
            'numerical_transformer': {
                'out_features': 192,
                'd_token': 192,
                'num_trans_blocks': 0,
                'num_attn_heads': 4,
                'residual_dropout': 0.0,
                'attention_dropout': 0.2,
                'ffn_dropout': 0.1,
                'normalization': 'layer_norm',
                'ffn_activation': 'reglu',
                'head_activation': 'relu',
                'data_types': ['numerical'],
                'embedding_arch': ['linear', 'relu'],
                'merge': 'concat'
            },
            'hf_text': {
                'checkpoint_name': 'google/electra-base-discriminator',
                'data_types': ['text'],
                'tokenizer_name': 'hf_auto',
                'max_text_len': 512,
                'insert_sep': True,
                'text_segment_num': 2,
                'stochastic_chunk': False,
                'text_aug_detect_length': 10,
                'text_trivial_aug_maxscale': 0.05,
                # was the typo 'test_train_augment_types'
                'text_train_augment_types': ["synonym_replacement(0.1)"],
            },
            'timm_image': {
                'checkpoint_name': 'swin_base_patch4_window7_224',
                'mix_choice': 'all_logits',
                'data_types': ['image'],
                'train_transform_types':
                ['resize_shorter_side', 'center_crop'],
                'val_transform_types': ['resize_shorter_side', 'center_crop'],
                'image_norm': 'imagenet',
                'image_size': 224,
                'max_img_num_per_col': 2,
            },
            'clip': {
                'checkpoint_name': 'openai/clip-vit-base-patch32',
                'data_types': ['image', 'text'],
                'train_transform_types':
                ['resize_shorter_side', 'center_crop'],
                'val_transform_types': ['resize_shorter_side', 'center_crop'],
                'image_norm': 'clip',
                'image_size': 224,
                'max_img_num_per_col': 2,
                'tokenizer_name': 'clip',
                'max_text_len': 77,
                'insert_sep': False,
                'text_segment_num': 1,
                'stochastic_chunk': False,
                'text_aug_detect_length': 10,
                'text_trivial_aug_maxscale': 0.05,
                # was the typo 'test_train_augment_types'
                'text_train_augment_types': ["synonym_replacement(0.1)"],
            },
            'fusion_transformer': {
                'hidden_size': 192,
                'n_blocks': 2,
                'attention_n_heads': 4,
                'adapt_in_features': 'max',
                'attention_dropout': 0.2,
                'residual_dropout': 0.0,
                'ffn_dropout': 0.1,
                'ffn_d_hidden': 192,
                'normalization': 'layer_norm',
                'ffn_activation': 'geglu',
                'head_activation': 'relu',
                'data_types': None
            },
        }
    }

    hyperparameters = {
        "optimization.max_epochs": 1,
        "optimization.top_k_average_method": BEST,
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
        "data.categorical.convert_to_text": False,
        "data.numerical.convert_to_text": False,
    }

    config = {
        MODEL: model_config,
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }

    with tempfile.TemporaryDirectory() as save_path:
        predictor.fit(
            train_data=dataset.train_df,
            config=config,
            time_limit=30,
            save_path=save_path,
            hyperparameters=hyperparameters,
        )

        score = predictor.evaluate(dataset.test_df)
        verify_predictor_save_load(predictor, dataset.test_df)
Example #14
0
def test_customizing_model_names(hyperparameters):
    """Fitting with custom model names keeps the config in sync with them."""
    dataset = ALL_DATASETS["petfinder"]()

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=dataset.metric,
    )
    config = {
        MODEL: "fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }
    hyperparameters.update({
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
    })
    hyperparameters_gt = copy.deepcopy(hyperparameters)
    if isinstance(hyperparameters_gt["model.names"], str):
        # A dotlist string like "[a,b]" becomes a real list via OmegaConf.
        hyperparameters_gt["model.names"] = OmegaConf.from_dotlist(
            [f'names={hyperparameters["model.names"]}']).names

    expected_names = hyperparameters_gt["model.names"]

    def check_names():
        # The fitted config must list exactly the expected model names and
        # carry a per-model sub-config for each of them.
        assert sorted(predictor._config.model.names) == sorted(expected_names)
        for per_name in expected_names:
            assert hasattr(predictor._config.model, per_name)

    save_path = os.path.join(get_home_dir(), "outputs", "petfinder")
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=10,
        save_path=save_path,
    )
    check_names()

    predictor.evaluate(dataset.test_df)
    verify_predictor_save_load(predictor, dataset.test_df)

    # Continuous fit on the same predictor object.
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=10,
    )
    check_names()
    verify_predictor_save_load(predictor, dataset.test_df)

    # Save to a folder, reload, and continue fitting.
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = MultiModalPredictor.load(root)
        predictor.fit(
            train_data=dataset.train_df,
            config=config,
            hyperparameters=hyperparameters,
            time_limit=10,
        )
        check_names()
Example #15
0
class MultiModalPredictorModel(AbstractModel):
    # Subdirectory (inside this model's save path) that holds the inner
    # AutoMM network weights, saved separately from the AbstractModel pickle.
    _NN_MODEL_NAME = 'automm_model'

    def __init__(self, **kwargs):
        """Wrapper of autogluon.multimodal.MultiModalPredictor.

        The features can be a mix of
        - image column
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        """
        super().__init__(**kwargs)
        # Name of the synthetic label column appended to X during fit;
        # chosen in _fit so it never collides with a feature column.
        self._label_column_name = None
        self._load_model = None  # Whether to load inner model when loading.

    def _get_default_auxiliary_params(self) -> dict:
        """Restrict accepted raw feature types and drop n-gram/text-special
        feature groups, which the inner AutoMM predictor handles itself."""
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
            ignored_type_group_special=[
                S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
            ],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Mark this model as invalid for use as a stacker."""
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        # Fail fast with a helpful message if the multimodal extra is missing.
        super()._set_default_params()
        try_import_autogluon_text()

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        sample_weight
            The weights of the samples
        kwargs
            Other keyword arguments

        """
        try_import_autogluon_text()
        from autogluon.multimodal import MultiModalPredictor

        # Decide name of the label column: use 'label' unless a feature
        # already has that name, in which case probe 'label0', 'label1', ...
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for MultiModalPredictorModel, "
                "this model will ignore them in training.")

        # Append the label column so AutoMM receives a single combined frame.
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        # The inner predictor logs one level quieter than this wrapper.
        verbosity_text = max(0, verbosity - 1)
        root_logger = logging.getLogger('autogluon')
        root_log_level = root_logger.level
        self.model = MultiModalPredictor(label=self._label_column_name,
                                         problem_type=self.problem_type,
                                         path=self.path,
                                         eval_metric=self.eval_metric,
                                         verbosity=verbosity_text)
        params = self._get_model_params()

        if num_gpus is not None:
            params['env.num_gpus'] = num_gpus
        # 'presets' and 'seed' are wrapper-level options, not AutoMM
        # hyperparameters, so pop them before passing the rest through.
        presets = params.pop('presets', None)
        seed = params.pop('seed', 0)

        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       presets=presets,
                       hyperparameters=params,
                       seed=seed)
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level

    def _predict_proba(self, X, **kwargs):
        """Predict probabilities (or raw values for regression) for X."""
        X = self.preprocess(X, **kwargs)

        # Regression has no probabilities; return the raw predictions.
        if self.problem_type == REGRESSION:
            return self.model.predict(X, as_pandas=False)

        y_pred_proba = self.model.predict_proba(X, as_pandas=False)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def save(self, path: str = None, verbose=True) -> str:
        """Save this wrapper and (separately) the inner AutoMM network.

        The AbstractModel pickle must not contain the NN weights, so the
        inner model is detached during super().save() and re-attached after.
        """
        self._load_model = self.model is not None
        inner_model = self.model
        self.model = None
        # save this AbstractModel object without NN weights
        path = super().save(path=path, verbose=verbose)
        self.model = inner_model

        if self._load_model:
            automm_nn_path = os.path.join(path, self._NN_MODEL_NAME)
            self.model.save(automm_nn_path)
            logger.log(
                15,
                f"\tSaved AutoMM model weights and model hyperparameters to '{automm_nn_path}'."
            )
        self._load_model = None
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Load the wrapper pickle and, if one was saved, the inner network."""
        model = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
        if model._load_model:
            try_import_autogluon_text()
            from autogluon.multimodal import MultiModalPredictor
            model.model = MultiModalPredictor.load(
                os.path.join(path, cls._NN_MODEL_NAME))
        model._load_model = None
        return model

    def get_memory_size(self) -> int:
        """Return the approximate memory footprint of the network weights.

        Computed as the sum over all parameters of element count times
        per-element size, i.e. the number of bytes occupied by the weights.
        (The previous implementation summed only ``numel()``, returning a
        parameter *count* while documenting bytes — under-reporting by the
        element size, typically 4x for float32.)
        Buffers and activation memory are not included.

        Returns
        -------
        memory_size
            The total parameter memory size in bytes.
        """
        total_size = sum(param.numel() * param.element_size()
                         for param in self.model._model.parameters())

        return total_size

    def _get_default_resources(self):
        """Default to all CPUs and at most one GPU."""
        num_cpus = get_cpu_count()
        num_gpus = min(
            get_gpu_count_torch(), 1
        )  # Use single gpu training by default. Consider to revise it later.
        return num_cpus, num_gpus

    def get_minimum_resources(self) -> Dict[str, int]:
        # NOTE(review): requiring a GPU as a hard minimum — confirm this is
        # intended, since it prevents CPU-only fitting.
        return {
            'num_cpus': 1,
            'num_gpus': 1,
        }

    def _more_tags(self):
        # `can_refit_full=False` because MultiModalPredictor does not communicate how to train until the best epoch in refit_full.
        return {'can_refit_full': False}
Example #16
0
train_df, test_df = load_data(data_path)

if __name__ == "__main__":
    submission = pd.read_csv("../input/petfinder-pawpularity-score/sample_submission.csv")

    configs = [config_6, config_7, config_26]
    # Per-model ensemble weights; aligned with `configs` and summing to 1.
    # (Previously hard-coded inline as 0.25/0.5/0.25 in the blend expression.)
    ensemble_weights = [0.25, 0.5, 0.25]
    model_preds = np.empty(shape=[0, submission.shape[0]])
    for perconfig in configs:
        print(perconfig)
        save_standalone_path = perconfig["save_path"] + '_standalone'
        all_preds = []
        for fold in range(perconfig["N_fold"]):
            # Load each fold's standalone checkpoint directly via the
            # classmethod; the previous code built a throwaway
            # MultiModalPredictor(...) instance just to call .load() on it,
            # then `del`-ed that unused instance instead of the loaded model.
            pretrained_model = MultiModalPredictor.load(
                path=save_standalone_path + f'_fold{fold}/')
            pretrained_model._config.env.per_gpu_batch_size_evaluation = perconfig["per_gpu_batch_size_evaluation"]
            all_preds.append(pretrained_model.predict(test_df))
            # Drop the loaded network before loading the next fold so the
            # cache-empty call can actually reclaim its GPU memory.
            del pretrained_model
            torch.cuda.empty_cache()
        # Average the per-fold predictions for this model.
        model_preds = np.append(model_preds, [np.mean(np.stack(all_preds), axis=0)], axis=0)

    # Weighted model ensemble (same 0.25 / 0.5 / 0.25 blend as before).
    submission["Pawpularity"] = np.average(model_preds, axis=0, weights=ensemble_weights)
    submission.to_csv("submission.csv", index=False)

    print(submission)
Example #17
0
def test_predictor(
    dataset_name,
    model_names,
    text_backbone,
    image_backbone,
    top_k_average_method,
    efficient_finetune,
    loss_function,
):
    dataset = ALL_DATASETS[dataset_name]()
    metric_name = dataset.metric

    predictor = MultiModalPredictor(
        label=dataset.label_columns[0],
        problem_type=dataset.problem_type,
        eval_metric=metric_name,
    )
    config = {
        MODEL: f"fusion_mlp_image_text_tabular",
        DATA: "default",
        OPTIMIZATION: "adamw",
        ENVIRONMENT: "default",
    }
    hyperparameters = {
        "optimization.max_epochs": 1,
        "model.names": model_names,
        "env.num_workers": 0,
        "env.num_workers_evaluation": 0,
        "optimization.top_k_average_method": top_k_average_method,
        "optimization.efficient_finetune": efficient_finetune,
        "optimization.loss_function": loss_function,
    }
    if text_backbone is not None:
        hyperparameters.update({
            "model.hf_text.checkpoint_name": text_backbone,
        })
    if image_backbone is not None:
        hyperparameters.update({
            "model.timm_image.checkpoint_name":
            image_backbone,
        })
    save_path = os.path.join(get_home_dir(), "outputs", dataset_name)
    if text_backbone is not None:
        save_path = os.path.join(save_path, text_backbone)
    if image_backbone is not None:
        save_path = os.path.join(save_path, image_backbone)

    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=30,
        save_path=save_path,
    )

    score = predictor.evaluate(dataset.test_df)
    verify_predictor_save_load(predictor, dataset.test_df)

    # Test for continuous fit
    predictor.fit(
        train_data=dataset.train_df,
        config=config,
        hyperparameters=hyperparameters,
        time_limit=30,
    )
    verify_predictor_save_load(predictor, dataset.test_df)

    # Saving to folder, loading the saved model and call fit again (continuous fit)
    with tempfile.TemporaryDirectory() as root:
        predictor.save(root)
        predictor = MultiModalPredictor.load(root)
        predictor.fit(
            train_data=dataset.train_df,
            config=config,
            hyperparameters=hyperparameters,
            time_limit=30,
        )