# Example 1
def train(args):
    """Train a house-price predictor and write a Kaggle-style submission file.

    ``args.mode`` selects the model:
      - 'single':      a standalone MultiModalPredictor.
      - 'weighted':    TabularPredictor weighted ensemble (no bagging).
      - 'single_bag4': TabularPredictor bagging only the AutoMM model, 4 folds.
      - 'single_bag5': TabularPredictor bagging only the AutoMM model, 5 folds.
      - 'stack5':      TabularPredictor with 5-fold bagging, 1 stacking level.

    Parameters
    ----------
    args
        Parsed CLI arguments; must provide ``seed``, ``data_path``,
        ``exp_path``, ``mode``, ``automm_mode``, ``text_backbone``,
        ``cat_as_text`` and ``with_tax_values``.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of the modes listed above.
    """
    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating the submission file.
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))
    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)
    label_column = 'Sold Price'
    eval_metric = 'r2'

    automm_hyperparameters = get_automm_hyperparameters(args.automm_mode, args.text_backbone, args.cat_as_text)

    tabular_hyperparameters = {
        'GBM': [
            {},
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        ],
        'CAT': {},
        'AG_AUTOMM': automm_hyperparameters,
    }
    if args.mode == 'single':
        predictor = MultiModalPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df, hyperparameters=automm_hyperparameters, seed=args.seed)
    elif args.mode in ('weighted', 'stack5', 'single_bag5', 'single_bag4'):
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)

        if args.mode in ('single_bag4', 'single_bag5'):
            # Bag only the AutoMM model; the fold count comes from the mode
            # name. ('single_bag4' previously fell through to
            # NotImplementedError even though the outer branch accepted it.)
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
            num_bag_folds = 4 if args.mode == 'single_bag4' else 5
            num_stack_levels = 0
        elif args.mode == 'weighted':
            num_bag_folds, num_stack_levels = None, None
        else:  # 'stack5'
            num_bag_folds, num_stack_levels = 5, 1
        predictor.fit(train_df,
                      hyperparameters=tabular_hyperparameters,
                      num_bag_folds=num_bag_folds,
                      num_stack_levels=num_stack_levels)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    else:
        raise NotImplementedError
    # NOTE(review): np.exp implies the target was log-transformed, presumably
    # inside preprocess() — confirm against that helper.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=None)
# Example 2
def main(args):
    """Train a predictor on a tabular benchmark dataset and score it on test.

    ``args.mode`` selects the model:
      - 'single':      standalone MultiModalPredictor.
      - 'weighted':    TabularPredictor weighted ensemble (no bagging).
      - 'single_bag5': TabularPredictor bagging only the AutoMM model, 5 folds.
      - 'stack5':      TabularPredictor with 5-fold bagging, 1 stacking level.

    Writes ``scores.json`` and ``predictions.csv`` (plus ``leaderboard.csv``
    for the tabular modes) into ``args.exp_dir``.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of the modes listed above.
    """
    if args.gpu_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    assert args.dataset_name in TABULAR_DATASETS.keys(
    ), "Unsupported dataset name."

    ### Dataset loading
    train_data = TABULAR_DATASETS[args.dataset_name]("train", args.dataset_dir)
    val_data = TABULAR_DATASETS[args.dataset_name]("val", args.dataset_dir)
    test_data = TABULAR_DATASETS[args.dataset_name]("test", args.dataset_dir)

    # NOTE: mutates the module-level hyperparameter dict in place.
    automm_hyperparameters["optimization.learning_rate"] = args.lr
    automm_hyperparameters["optimization.end_lr"] = args.end_lr

    if args.embedding_arch is not None:
        automm_hyperparameters[
            "model.numerical_transformer.embedding_arch"] = args.embedding_arch

    tabular_hyperparameters = {
        "GBM": [
            {},
            {
                "extra_trees": True,
                "ag_args": {
                    "name_suffix": "XT"
                }
            },
        ],
        "CAT": {},
        "XGB": {},
        "AG_AUTOMM": automm_hyperparameters,
    }

    if args.mode == "single":
        ### model initialization
        predictor = MultiModalPredictor(
            label=train_data.label_column,
            problem_type=train_data.problem_type,
            eval_metric=train_data.metric,
            path=args.exp_dir,
            verbosity=4,
        )

        ### model training
        predictor.fit(
            train_data=train_data.data,
            tuning_data=val_data.data,
            seed=args.seed,
            hyperparameters=automm_hyperparameters,
        )
        # Evaluation and scores.json dump happen once, below, for all modes.
        # (A previous in-branch evaluation here was redundant: its
        # scores.json was immediately overwritten by the shared code path.)
    elif args.mode in ("weighted", "single_bag5", "stack5"):
        if args.mode == "single_bag5":
            # Bag only the AutoMM model with 5 folds.
            tabular_hyperparameters = {
                "AG_AUTOMM": automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == "weighted":
            num_bag_folds, num_stack_levels = None, None
        else:  # "stack5"
            num_bag_folds, num_stack_levels = 5, 1
        from autogluon.tabular import TabularPredictor

        predictor = TabularPredictor(eval_metric=train_data.metric,
                                     label=train_data.label_column,
                                     path=args.exp_dir)
        predictor.fit(
            train_data=train_data.data,
            # Bagged fits create their own validation splits, so only pass
            # tuning data when bagging is disabled.
            tuning_data=val_data.data if num_bag_folds is None else None,
            hyperparameters=tabular_hyperparameters,
            num_bag_folds=num_bag_folds,
            num_stack_levels=num_stack_levels,
        )
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_dir, "leaderboard.csv"))
    else:
        raise NotImplementedError

    ### model inference (shared by all modes)
    scores = predictor.evaluate(data=test_data.data)
    with open(os.path.join(args.exp_dir, "scores.json"), "w") as f:
        json.dump(scores, f)
    print(scores)

    predictions = predictor.predict(data=test_data.data)
    predictions.to_csv(os.path.join(args.exp_dir, "predictions.csv"))
# Example 3
                # Hyperparameter overrides forwarded to predictor.fit (the
                # call opens above this chunk); keys use AutoMM's dotted
                # config namespace (data.*, env.*, optimization.*).
                "data.categorical.convert_to_text":
                args.categorical_convert_to_text,
                "env.per_gpu_batch_size": args.per_gpu_batch_size,
                "env.precision": args.precision,
                "optimization.learning_rate": args.learning_rate,
                "optimization.weight_decay": args.weight_decay,
                "optimization.lr_decay": args.learning_rate_decay,
                "optimization.max_epochs": args.max_epochs,
                "optimization.warmup_steps": args.warmup_steps,
                "optimization.loss_function": args.loss_function,
            },
            seed=args.seed,
        )

        # Manual validation: score this fold on its held-out split with RMSE
        # (squared=False turns mean_squared_error into root MSE).
        valid_pred = predictor.predict(data=valid_df)
        score = mean_squared_error(valid_df["Pawpularity"].values,
                                   valid_pred,
                                   squared=False)
        print(f"Fold {i} | Score: {score}")
        # Save a standalone (weights-bundled) copy of this fold's predictor.
        predictor.save(
            path=save_standalone_path + f"_fold{i}",
            standalone=True,
        )
        all_score.append(score)

        # Release the model and GPU memory before training the next fold.
        del predictor
        torch.cuda.empty_cache()

    print(f"all-scores: {all_score}")
    print(f"mean_rmse: {np.mean(all_score)}")
# Example 4
class MultiModalPredictorModel(AbstractModel):
    # Subdirectory (inside this model's save path) holding the AutoMM weights,
    # which are saved/loaded separately from the pickled wrapper (see save/load).
    _NN_MODEL_NAME = 'automm_model'

    def __init__(self, **kwargs):
        """Wrapper of autogluon.multimodal.MultiModalPredictor.

        The features can be a mix of
        - image column
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of subdirectory inside path where model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        """
        super().__init__(**kwargs)
        # Chosen lazily in _fit so it cannot collide with a feature column.
        self._label_column_name = None
        self._load_model = None  # Whether to load inner model when loading.

    def _get_default_auxiliary_params(self) -> dict:
        """Restrict input columns to raw types AutoMM handles natively.

        Text n-gram / text-as-category / text-special columns are engineered
        features meant for tabular models; AutoMM consumes the raw text, so
        those special groups are excluded here.
        """
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
            ignored_type_group_special=[
                S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL
            ],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        """Mark this model as invalid for use as a stacker model."""
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        """Set defaults; also fail fast if the AutoMM package is missing."""
        super()._set_default_params()
        try_import_autogluon_text()

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        sample_weight
            The weights of the samples
        kwargs
            Other keyword arguments

        """
        try_import_autogluon_text()
        from autogluon.multimodal import MultiModalPredictor

        # Decide name of the label column: AutoMM needs label and features in
        # one DataFrame, so pick 'label' or the first 'label{N}' that does not
        # collide with an existing feature column.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for MultiModalPredictorModel, "
                "this model will ignore them in training.")

        # Append the labels as the final column of the (preprocessed) data.
        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        # Run the inner predictor one verbosity level quieter than this wrapper.
        verbosity_text = max(0, verbosity - 1)
        # AutoMM fitting may change the 'autogluon' root logger level; capture
        # it so it can be restored after fit.
        root_logger = logging.getLogger('autogluon')
        root_log_level = root_logger.level
        self.model = MultiModalPredictor(label=self._label_column_name,
                                         problem_type=self.problem_type,
                                         path=self.path,
                                         eval_metric=self.eval_metric,
                                         verbosity=verbosity_text)
        params = self._get_model_params()

        if num_gpus is not None:
            params['env.num_gpus'] = num_gpus
        # 'presets' and 'seed' are fit() arguments, not hyperparameters; pop
        # them so only real hyperparameters are forwarded.
        presets = params.pop('presets', None)
        seed = params.pop('seed', 0)

        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       presets=presets,
                       hyperparameters=params,
                       seed=seed)
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level

    def _predict_proba(self, X, **kwargs):
        """Predict class probabilities (or raw predictions for regression)."""
        X = self.preprocess(X, **kwargs)

        # Regression has no probabilities; return the predictions directly.
        if self.problem_type == REGRESSION:
            return self.model.predict(X, as_pandas=False)

        y_pred_proba = self.model.predict_proba(X, as_pandas=False)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def save(self, path: str = None, verbose=True) -> str:
        """Save the wrapper and (separately) the AutoMM weights.

        The inner predictor is detached before pickling the wrapper so the
        NN weights are not serialized twice; it is then saved on its own
        under ``_NN_MODEL_NAME`` and re-attached.
        """
        self._load_model = self.model is not None
        __model = self.model
        self.model = None
        # save this AbstractModel object without NN weights
        path = super().save(path=path, verbose=verbose)
        self.model = __model

        if self._load_model:
            automm_nn_path = os.path.join(path, self._NN_MODEL_NAME)
            self.model.save(automm_nn_path)
            logger.log(
                15,
                f"\tSaved AutoMM model weights and model hyperparameters to '{automm_nn_path}'."
            )
        # _load_model is only meaningful inside the pickle; clear it here.
        self._load_model = None
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        """Load the pickled wrapper, then restore the AutoMM weights if the
        pickle recorded that an inner model had been saved."""
        model = super().load(path=path,
                             reset_paths=reset_paths,
                             verbose=verbose)
        if model._load_model:
            try_import_autogluon_text()
            from autogluon.multimodal import MultiModalPredictor
            model.model = MultiModalPredictor.load(
                os.path.join(path, cls._NN_MODEL_NAME))
        model._load_model = None
        return model

    def get_memory_size(self) -> int:
        """Return the memory size by calculating the total number of parameters.

        NOTE(review): this sums ``numel()`` only, i.e. the parameter *count*,
        not count * element size — "bytes" below is accurate only if one byte
        per parameter is intended; confirm against callers.

        Returns
        -------
        memory_size
            The total memory size in bytes.
        """
        total_size = sum(param.numel()
                         for param in self.model._model.parameters())

        return total_size

    def _get_default_resources(self):
        """Return default (num_cpus, num_gpus) for fitting this model."""
        num_cpus = get_cpu_count()
        num_gpus = min(
            get_gpu_count_torch(), 1
        )  # Use single gpu training by default. Consider to revise it later.
        return num_cpus, num_gpus

    def get_minimum_resources(self) -> Dict[str, int]:
        """Minimum resources required to fit: one CPU and one GPU."""
        return {
            'num_cpus': 1,
            'num_gpus': 1,
        }

    def _more_tags(self):
        # `can_refit_full=False` because MultiModalPredictor does not communicate how to train until the best epoch in refit_full.
        return {'can_refit_full': False}