def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """Train the wrapped text prediction model on the given data.

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset. If None, a holdout split is
            taken from the training data.
        y_val
            Labels of the validation dataset
        time_limit
            The time limit for the fit function. The time already spent inside
            this method before training starts is subtracted from it.
        sample_weight
            Not supported yet: a message is logged and the weights are ignored.
        kwargs
            Other keyword arguments. The keys read here are `verbosity`
            (default 2), `num_cpus` and `num_gpus` (both default None).

        Raises
        ------
        ImportError
            If mxnet or the autogluon.text helpers cannot be imported.
        NoGPUError
            If the recommended resource reports zero GPUs.
        NoValidFeatures
            If the feature metadata has no features of raw type 'object'.

        """
        try:
            import mxnet as mx
            from autogluon.text.text_prediction.dataset import TabularDataset, random_split_train_val
            from autogluon.text.text_prediction.text_prediction import get_recommended_resource
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        time_start = time.time()

        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictionV1Model, this model will ignore them in training."
            )

        # Infer resource
        resource = get_recommended_resource(nthreads_per_trial=num_cpus,
                                            ngpus_per_trial=num_gpus)

        if resource['num_gpus'] == 0:
            raise NoGPUError(
                f'\tNo GPUs available to train {self.name}. Resources: {resource}'
            )

        # Set seed
        seed = self.params.get('seed')
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            mx.random.seed(seed)

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        if not self.feature_metadata.get_features(valid_raw_types=['object']):
            raise NoValidFeatures(f'No text features to train {self.name}.')

        column_properties = self._build_model(X=X,
                                              y=y,
                                              X_val=X_val,
                                              y_val=y_val,
                                              hyperparameters=self.params)
        # Append the label as the last column; the datasets below locate it by name.
        X.insert(len(X.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        hpo_params = self.params['hpo_params']
        scheduler_options = hpo_params['scheduler_options']
        search_strategy = hpo_params['search_strategy']
        # Work on a copy so the hyperband defaults below never leak back into
        # the hyperparameters stored on the model.
        scheduler_options = {} if scheduler_options is None else dict(scheduler_options)
        if search_strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling
            scheduler_options.setdefault('reduction_factor', 4)
            scheduler_options.setdefault('grace_period', 10)
            scheduler_options.setdefault('max_t', 50)

        if X_val is None:
            # FIXME: v0.1 Update TextPrediction to use all training data in refit_full
            holdout_frac = default_holdout_frac(len(X), True)
            X, X_val = random_split_train_val(X, valid_ratio=holdout_frac)
        train_data = TabularDataset(X,
                                    column_properties=column_properties,
                                    label_columns=self._label_column_name)
        logger.log(15, 'Train Dataset:')
        logger.log(15, train_data)
        tuning_data = TabularDataset(X_val,
                                     column_properties=column_properties,
                                     label_columns=self._label_column_name)
        logger.log(15, 'Tuning Dataset:')
        logger.log(15, tuning_data)

        if time_limit is not None:
            # Charge the setup work done above against the caller's budget.
            time_limit = time_limit - (time.time() - time_start)

        # FIXME: Inner error message if no text features is not helpful
        self.model.train(
            train_data=train_data,
            tuning_data=tuning_data,
            resource=resource,
            time_limits=time_limit,
            search_strategy=search_strategy,
            search_options=hpo_params['search_options'],
            scheduler_options=scheduler_options,
            num_trials=hpo_params['num_trials'],
            console_log=verbosity >= 3,
            ignore_warning=verbosity < 3,
            verbosity=verbosity - 1)
Example #2
0
    def fit(cls,
            train_data,
            label,
            tuning_data=None,
            time_limits=None,
            output_directory='./ag_text',
            feature_columns=None,
            holdout_frac=None,
            eval_metric=None,
            stopping_metric=None,
            nthreads_per_trial=None,
            ngpus_per_trial=None,
            dist_ip_addrs=None,
            num_trials=None,
            search_strategy=None,
            search_options=None,
            scheduler_options=None,
            hyperparameters=None,
            plot_results=None,
            seed=None,
            verbosity=2):
        """Fit models to make predictions based on text inputs.

        Parameters
        ----------
        train_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`
            Training dataset where rows = individual training examples, columns = features.
            Anything that is not a `pandas.DataFrame` is loaded via `load_pd.load`.
        label : str, int or list
            Name of the label column. An integer is interpreted as a column index
            into `train_data`, and a list of names/indices may be given as well.
        tuning_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`, default = None
            Another dataset containing validation data reserved for hyperparameter tuning (in same format as training data).
            If `tuning_data = None`, `fit()` will automatically hold out random examples from `train_data` for validation.
        time_limits : int or str, default = None
            Approximately how long `fit()` should run for (wallclock time in seconds if int).
            String values may instead be used to specify time in different units such as: '1min' or '1hour'.
            Longer `time_limits` will usually improve predictive accuracy.
            If not specified, the value from `hyperparameters['hpo_params']['time_limits']` is used.
        output_directory : str, default = './ag_text'
            Path to directory where models and intermediate outputs should be saved.
        feature_columns : List[str], default = None
            Which columns of table to consider as predictive features (other columns will be ignored, except for label-column).
            If None (by default), all columns of table are considered predictive features.
        holdout_frac : float, default = None
            Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
            If None, default value is selected based on the number of training examples.
        eval_metric : str, default = None
            The evaluation metric that will be used to evaluate the model's predictive performance.
            If None, an appropriate default metric will be selected (accuracy for classification, mean-squared-error for regression).
            Options for classification include: 'acc' (accuracy), 'nll' (negative log-likelihood).
            Additional options for binary classification include: 'f1' (F1 score), 'mcc' (Matthews coefficient), 'auc' (area under ROC curve).
            Options for regression include: 'mse' (mean squared error), 'rmse' (root mean squared error), 'mae' (mean absolute error).
        stopping_metric, default = None
            Metric which iteratively-trained models use to early stop to avoid overfitting.
            Defaults to `eval_metric` value (if None).
            Options are identical to options for `eval_metric`.
        nthreads_per_trial, default = None
            The number of threads per individual model training run. By default, all available CPUs are used.
        ngpus_per_trial, default = None
            The number of GPUs to use per individual model training run. If unspecified, a default value is chosen based on total number of GPUs available.
        dist_ip_addrs, default = None
            List of IP addresses corresponding to remote workers, in order to leverage distributed computation.
            Must currently be None: remote training is not supported.
        num_trials : int, default = None
            The number of trials in the HPO search
        search_strategy : str, default = None
            Which hyperparameter search algorithm to use. Options include:
            'random' (random search), 'bayesopt' (Gaussian process Bayesian optimization),
            'skopt' (SKopt Bayesian optimization), 'grid' (grid search),
            'hyperband' (Hyperband scheduling with random search), 'bayesopt-hyperband'
            (Hyperband scheduling with GP-BO search).
            If unspecified, the value from `hyperparameters['hpo_params']['search_strategy']` is used.
        search_options : dict, default = None
            Options passed to searcher.
        scheduler_options : dict, default = None
            Additional kwargs passed to scheduler __init__. The dict is copied, so the caller's
            object is never modified.
        hyperparameters : dict or str, default = None
            Determines the hyperparameters used by the models. Each hyperparameter may be either fixed value or search space of many values.
            A str is treated as the name of a registered configuration; a dict is merged on top of the 'default' configuration.
            For example of default hyperparameters, see: `autogluon.task.text_prediction.text_prediction.default()`
        plot_results : bool, default = None
            Whether or not to plot intermediate training results during `fit()`.
            If None, plotting is enabled only when running inside a notebook.
        seed : int, default = None
            Seed value for random state used inside `fit()`.
        verbosity : int, default = 2
            Verbosity levels range from 0 to 4 and control how much information is printed
            during fit().
            Higher levels correspond to more detailed print statements
            (you can set verbosity = 0 to suppress warnings).
            If using logging, you can alternatively control amount of information printed
            via `logger.setLevel(L)`,
            where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print
            statements, opposite of verbosity levels)

        Returns
        -------
        model
            A `BertForTextPredictionBasic` object that can be used for making predictions on new data.

        Raises
        ------
        ImportError
            If the installed mxnet version is outside the range [1.7.0, 2.0.0).
        NotImplementedError
            If no text column is found among the inferred column properties.
        ValueError
            If a model type is unsupported or a string `time_limits` cannot be parsed.
        """
        assert dist_ip_addrs is None, 'Training on remote machine is currently not supported.'
        # Version check of MXNet
        mx_version = version.parse(mxnet.__version__)
        if mx_version < version.parse('1.7.0') or mx_version >= version.parse('2.0.0'):
            raise ImportError(
                'You will need to ensure that you have mxnet>=1.7.0, <2.0.0. '
                'For more information about how to install mxnet, you can refer to '
                'https://sxjscience.github.io/KDD2020/ .')

        # Clamp verbosity into the supported [0, 4] range
        verbosity = max(0, min(verbosity, 4))
        console_log = verbosity >= 2
        logging_config(folder=output_directory,
                       name='ag_text_prediction',
                       logger=logger,
                       level=verbosity2loglevel(verbosity),
                       console=console_log)
        # Parse the hyper-parameters
        if hyperparameters is None:
            hyperparameters = ag_text_prediction_params.create('default')
        elif isinstance(hyperparameters, str):
            hyperparameters = ag_text_prediction_params.create(hyperparameters)
        else:
            base_params = ag_text_prediction_params.create('default')
            hyperparameters = merge_params(base_params, hyperparameters)
        np.random.seed(seed)
        if not isinstance(train_data, pd.DataFrame):
            train_data = load_pd.load(train_data)
        # Resolve the label column(s); integer entries are column indices
        if not isinstance(label, list):
            label = [label]
        label_columns = [
            train_data.columns[ele] if isinstance(ele, int) else ele
            for ele in label
        ]
        if feature_columns is None:
            all_columns = list(train_data.columns)
            feature_columns = [
                ele for ele in all_columns if ele not in label_columns
            ]
        else:
            if isinstance(feature_columns, str):
                feature_columns = [feature_columns]
            for col in feature_columns:
                assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
                assert col in train_data.columns,\
                    'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                    'all columns = "{}"'.format(col, train_data.columns)
            all_columns = feature_columns + label_columns
            # Keep the selected columns in the order they appear in the dataframe
            all_columns = [
                ele for ele in train_data.columns if ele in all_columns
            ]
        if tuning_data is None:
            if holdout_frac is None:
                holdout_frac = default_holdout_frac(len(train_data), True)
            train_data, tuning_data = random_split_train_val(
                train_data, valid_ratio=holdout_frac)
        else:
            if not isinstance(tuning_data, pd.DataFrame):
                tuning_data = load_pd.load(tuning_data)
        train_data = train_data[all_columns]
        tuning_data = tuning_data[all_columns]
        # Column properties are inferred from train and tuning data together
        column_properties = get_column_properties(
            pd.concat([train_data, tuning_data]),
            metadata=None,
            label_columns=label_columns,
            provided_column_properties=None,
            categorical_default_handle_missing_value=True)
        train_data = TabularDataset(train_data,
                                    column_properties=column_properties,
                                    label_columns=label_columns)
        tuning_data = TabularDataset(
            tuning_data,
            column_properties=train_data.column_properties,
            label_columns=label_columns)

        logger.info('Train Dataset:')
        logger.info(train_data)
        logger.info('Tuning Dataset:')
        logger.info(tuning_data)
        logger.debug('Hyperparameters:')
        logger.debug(hyperparameters)
        has_text_column = any(
            prop.type == _C.TEXT for prop in column_properties.values())
        if not has_text_column:
            raise NotImplementedError('No Text Column is found! This is currently not supported by '
                                      'the TextPrediction task. You may try to use '
                                      'TabularPrediction.fit().\n'
                                      'The inferred column properties of the training data is {}'
                                      .format(train_data))
        problem_types = []
        label_shapes = []
        for label_col_name in label_columns:
            problem_type, label_shape = infer_problem_type(
                column_properties=column_properties,
                label_col_name=label_col_name)
            problem_types.append(problem_type)
            label_shapes.append(label_shape)
        # Use the module logger (configured by logging_config above) rather than
        # the root logger, so these messages follow the requested verbosity.
        logger.info(
            'Label columns={}, Feature columns={}, Problem types={}, Label shapes={}'
            .format(label_columns, feature_columns, problem_types,
                    label_shapes))
        # Metrics are inferred from the first label column only
        eval_metric, stopping_metric, log_metrics =\
            infer_eval_stop_log_metrics(problem_types[0],
                                        label_shapes[0],
                                        eval_metric=eval_metric,
                                        stopping_metric=stopping_metric)
        logger.info('Eval Metric={}, Stop Metric={}, Log Metrics={}'.format(
            eval_metric, stopping_metric, log_metrics))
        model_candidates = []
        for model_type, model_kwargs in hyperparameters['models'].items():
            search_space = model_kwargs['search_space']
            if model_type == 'BertForTextPredictionBasic':
                model = BertForTextPredictionBasic(
                    column_properties=column_properties,
                    label_columns=label_columns,
                    feature_columns=feature_columns,
                    label_shapes=label_shapes,
                    problem_types=problem_types,
                    stopping_metric=stopping_metric,
                    log_metrics=log_metrics,
                    base_config=None,
                    search_space=search_space,
                    output_directory=output_directory,
                    logger=logger)
                model_candidates.append(model)
            else:
                raise ValueError(
                    'model_type = "{}" is not supported. You can try to use '
                    'model_type = "BertForTextPredictionBasic"'.format(
                        model_type))
        assert len(
            model_candidates) == 1, 'Only one model is supported currently'
        recommended_resource = get_recommended_resource(
            nthreads_per_trial=nthreads_per_trial,
            ngpus_per_trial=ngpus_per_trial)
        hpo_params = hyperparameters['hpo_params']
        if search_strategy is None:
            search_strategy = hpo_params['search_strategy']
        if time_limits is None:
            time_limits = hpo_params['time_limits']
        elif isinstance(time_limits, str):
            # Only the 'min' and 'hour' suffixes are understood
            if time_limits.endswith('min'):
                time_limits = int(float(time_limits[:-3]) * 60)
            elif time_limits.endswith('hour'):
                time_limits = int(float(time_limits[:-4]) * 60 * 60)
            else:
                raise ValueError(
                    'The given time_limits="{}" cannot be parsed!'.format(
                        time_limits))
        if num_trials is None:
            num_trials = hpo_params['num_trials']
        if scheduler_options is None:
            scheduler_options = hpo_params['scheduler_options']
        # Copy before filling in defaults so that neither the caller's dict nor
        # the hyperparameter configuration is modified in place.
        scheduler_options = {} if scheduler_options is None else dict(scheduler_options)
        if search_strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling
            scheduler_options.setdefault('reduction_factor', 4)
            scheduler_options.setdefault('grace_period', 10)
            scheduler_options.setdefault('max_t', 50)

        if recommended_resource['num_gpus'] == 0:
            warnings.warn(
                'Recommend to use GPU to run the TextPrediction task!')
        model = model_candidates[0]
        if plot_results is None:
            plot_results = bool(in_ipynb())
        model.train(train_data=train_data,
                    tuning_data=tuning_data,
                    resource=recommended_resource,
                    time_limits=time_limits,
                    search_strategy=search_strategy,
                    search_options=search_options,
                    scheduler_options=scheduler_options,
                    num_trials=num_trials,
                    plot_results=plot_results,
                    console_log=verbosity > 2,
                    ignore_warning=verbosity <= 2)
        return model
Example #3
0
    def fit(self,
            train_data,
            tuning_data=None,
            time_limit=None,
            presets=None,
            hyperparameters=None,
            feature_metadata=None,
            **kwargs):
        """
        Fit models to predict a column of data table based on the other columns.

        Parameters
        ----------
        train_data
            Training data; validated by `self._validate_fit_data`.
        tuning_data, default = None
            Optional validation data; validated together with `train_data`.
        time_limit, default = None
            Time budget forwarded to stack-argument sanitization, scheduler
            initialization and the learner.
        presets, default = None
            Not read inside this method body (presumably consumed before the
            call reaches here - TODO confirm).
        hyperparameters : str or dict, default = None
            If None, 'default' is used. A str is resolved through
            `get_hyperparameter_config`.
        feature_metadata, default = None
            Forwarded to `self._set_feature_generator`.
        kwargs
            Additional options, validated by `self._validate_fit_kwargs`.
            Keys read here: 'verbosity', 'holdout_frac', 'num_bag_folds',
            'num_bag_sets', 'num_stack_levels', 'auto_stack',
            'hyperparameter_tune_kwargs', 'num_cpus', 'num_gpus',
            'feature_generator', 'unlabeled_data', 'save_bagged_folds',
            'AG_args', 'AG_args_fit', 'AG_args_ensemble',
            'excluded_model_types', 'keep_only_best', 'refit_full',
            'set_best_to_refit_full' and 'save_space'.

        Returns
        -------
        self

        Raises
        ------
        AssertionError
            If the predictor has already been fit.
        """
        # TODO: Move documentation from TabularPrediction.fit to here
        # TODO: Move num_cpu/num_gpu to AG_args_fit
        # TODO: AG_args -> ag_args? +1 -> Will change after replacing original TabularPredictor to avoid extra API breaks.
        # TODO: consider adding kwarg option for data which has already been preprocessed by feature generator to skip feature generation.
        if self._learner.is_fit:
            raise AssertionError(
                'Predictor is already fit! To fit additional models, refer to `predictor.fit_extra`.'
            )
        # Keep the caller's kwargs around so they can be logged next to the
        # validated version.
        kwargs_orig = kwargs.copy()
        kwargs = self._validate_fit_kwargs(kwargs)

        verbosity = kwargs.get('verbosity', self.verbosity)
        set_logger_verbosity(verbosity, logger=logger)

        if verbosity >= 3:
            logger.log(20, '============ fit kwarg info ============')
            logger.log(20, 'User Specified kwargs:')
            logger.log(20, pprint.pformat(kwargs_orig))
            logger.log(20, 'Full kwargs:')
            logger.log(20, pprint.pformat(kwargs))
            logger.log(20, '========================================')

        holdout_frac = kwargs['holdout_frac']
        num_bag_folds = kwargs['num_bag_folds']
        num_bag_sets = kwargs['num_bag_sets']
        num_stack_levels = kwargs['num_stack_levels']
        auto_stack = kwargs['auto_stack']
        hyperparameter_tune_kwargs = kwargs['hyperparameter_tune_kwargs']
        num_cpus = kwargs['num_cpus']
        num_gpus = kwargs['num_gpus']
        feature_generator = kwargs['feature_generator']
        unlabeled_data = kwargs['unlabeled_data']
        save_bagged_folds = kwargs['save_bagged_folds']

        ag_args = kwargs['AG_args']
        ag_args_fit = kwargs['AG_args_fit']
        ag_args_ensemble = kwargs['AG_args_ensemble']
        excluded_model_types = kwargs['excluded_model_types']

        self._set_feature_generator(feature_generator=feature_generator,
                                    feature_metadata=feature_metadata)
        train_data, tuning_data, unlabeled_data = self._validate_fit_data(
            train_data=train_data,
            tuning_data=tuning_data,
            unlabeled_data=unlabeled_data)

        if hyperparameters is None:
            hyperparameters = 'default'
        if isinstance(hyperparameters, str):
            hyperparameters = get_hyperparameter_config(hyperparameters)

        # Process kwargs to create trainer, schedulers, searchers:
        num_bag_folds, num_bag_sets, num_stack_levels = self._sanitize_stack_args(
            num_bag_folds=num_bag_folds,
            num_bag_sets=num_bag_sets,
            num_stack_levels=num_stack_levels,
            time_limit=time_limit,
            auto_stack=auto_stack,
            num_train_rows=len(train_data),
        )

        if hyperparameter_tune_kwargs is not None:
            scheduler_options = self._init_scheduler(
                hyperparameter_tune_kwargs, time_limit, hyperparameters,
                num_cpus, num_gpus, num_bag_folds, num_stack_levels)
        else:
            scheduler_options = None
        hyperparameter_tune = scheduler_options is not None
        if hyperparameter_tune:
            logger.log(
                30,
                'Warning: hyperparameter tuning is currently experimental and may cause the process to hang. Setting `auto_stack=True` instead is recommended to achieve maximum quality models.'
            )

        if holdout_frac is None:
            holdout_frac = default_holdout_frac(len(train_data),
                                                hyperparameter_tune)

        # Copy before adding entries so a dict supplied by the caller is never
        # modified in place.
        ag_args_fit = {} if ag_args_fit is None else dict(ag_args_fit)
        # TODO: v0.1: Update to be 'auto' or None by default to give full control to individual models.
        if 'num_cpus' not in ag_args_fit and num_cpus is not None:
            ag_args_fit['num_cpus'] = num_cpus
        if 'num_gpus' not in ag_args_fit and num_gpus is not None:
            ag_args_fit['num_gpus'] = num_gpus

        # TODO: v0.1: make core_kwargs a kwargs argument to predictor.fit, add aux_kwargs to predictor.fit
        core_kwargs = {
            'ag_args': ag_args,
            'ag_args_ensemble': ag_args_ensemble,
            'ag_args_fit': ag_args_fit,
            'excluded_model_types': excluded_model_types
        }
        self._learner.fit(X=train_data,
                          X_val=tuning_data,
                          X_unlabeled=unlabeled_data,
                          hyperparameter_tune_kwargs=scheduler_options,
                          holdout_frac=holdout_frac,
                          num_bagging_folds=num_bag_folds,
                          num_bagging_sets=num_bag_sets,
                          stack_ensemble_levels=num_stack_levels,
                          hyperparameters=hyperparameters,
                          core_kwargs=core_kwargs,
                          time_limit=time_limit,
                          save_bagged_folds=save_bagged_folds,
                          verbosity=verbosity)
        self._set_post_fit_vars()

        self._post_fit(
            keep_only_best=kwargs['keep_only_best'],
            refit_full=kwargs['refit_full'],
            set_best_to_refit_full=kwargs['set_best_to_refit_full'],
            save_space=kwargs['save_space'],
        )
        self.save()
        return self
Example #4
0
    def fit(self,
            train_data,
            tuning_data=None,
            time_limit=None,
            presets=None,
            hyperparameters=None,
            column_types=None,
            num_cpus=None,
            num_gpus=None,
            num_trials=None,
            plot_results=None,
            holdout_frac=None,
            seed=0):
        """
        Fit Transformer models to predict label column of a data table based on the other columns (which may contain text or numeric/categorical features).

        Parameters
        ----------
        train_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`
            Table of the training data, which is similar to a pandas DataFrame.
            If str is passed, `train_data` will be loaded using the str value as the file path.
        tuning_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`, default = None
            Another dataset containing validation data reserved for tuning processes such as early stopping and hyperparameter tuning.
            This dataset should be in the same format as `train_data`.
            If str is passed, `tuning_data` will be loaded using the str value as the file path.
            Note: final model returned may be fit on `tuning_data` as well as `train_data`. Do not provide your evaluation test data here!
            If `tuning_data = None`, `fit()` will automatically hold out some random validation examples from `train_data`.
        time_limit : int, default = None
            Approximately how long `fit()` should run for (wallclock time in seconds).
            If not specified, `fit()` will run until the model has completed training.
        presets : str, default = None
            Presets are pre-registered configurations that control training (hyperparameters and other aspects).
            It is recommended to specify presets and avoid specifying most other `fit()` arguments or model hyperparameters prior to becoming familiar with AutoGluon.
            Print all available presets via `autogluon.text.list_presets()`.
            Some notable presets include:
                - "best_quality": produce the most accurate overall predictor (regardless of its efficiency).
                - "medium_quality_faster_train": produce an accurate predictor but take efficiency into account (this is the default preset).
                - "lower_quality_fast_train": produce a predictor that is quick to train and make predictions with, even if its accuracy is worse.
        hyperparameters : dict, default = None
            The hyperparameters of the `fit()` function, which affect the resulting accuracy of the trained predictor.
            Experienced AutoGluon users can use this argument to specify neural network hyperparameter values/search-spaces as well as which hyperparameter-tuning strategy should be employed. See the "Text Prediction" tutorials for examples.
        column_types : dict, default = None
            The type of data in each table column can be specified via a dictionary that maps the column name to its data type.
            For example: `column_types = {"item_name": "text", "brand": "text", "product_description": "text", "height": "numerical"}` may be used for a table with columns: "item_name", "brand", "product_description", and "height".
            If None, column_types will be automatically inferred from the data.
            The current supported types are:
            - "text": each row in this column contains text (sentence, paragraph, etc.).
            - "numerical": each row in this column contains a number.
            - "categorical": each row in this column belongs to one of K categories.
        num_cpus : int, default = None
            The number of CPUs to use for each training run (i.e. one hyperparameter-tuning trial).
        num_gpus : int, default = None
            The number of GPUs to use for each training run (i.e. one hyperparameter-tuning trial). We recommend at least 1 GPU for TextPredictor as its neural network models are computationally intensive.
        num_trials : int, default = None
            If hyperparameter-tuning is used, specifies how many HPO trials should be run (assuming `time_limit` has not been exceeded).
            By default, this is the provided number of trials in the `hyperparameters` or `presets`.
            If specified here, this value will overwrite the value in `hyperparameters['tune_kwargs']['num_trials']`.
        plot_results : bool, default = None
            Whether to plot intermediate results from training. If None, will be decided based on the environment in which `fit()` is run.
        holdout_frac : float, default = None
            Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
            Default value (if None) is selected based on the number of rows in the training data and whether hyperparameter-tuning is utilized.
        seed : int, default = 0
            The random seed to use for this training run. If None, no seed will be specified and repeated runs will produce different results.

        Returns
        -------
        :class:`TextPredictor` object. Returns self.
        """
        assert self._fit_called is False
        verbosity = self.verbosity
        if verbosity is None:
            verbosity = 3
        if presets is not None:
            preset_hparams = ag_text_presets.create(presets)
        else:
            preset_hparams = ag_text_presets.create('default')
        hyperparameters = merge_params(preset_hparams, hyperparameters)
        if num_trials is not None:
            hyperparameters['tune_kwargs']['num_trials'] = num_trials
        if isinstance(self._label, str):
            label_columns = [self._label]
        else:
            label_columns = list(self._label)
        # Get the training and tuning data as pandas dataframe
        if isinstance(train_data, str):
            train_data = load_pd.load(train_data)
        if not isinstance(train_data, pd.DataFrame):
            raise AssertionError(
                f'train_data is required to be a pandas DataFrame, but was instead: {type(train_data)}'
            )
        all_columns = list(train_data.columns)
        feature_columns = [
            ele for ele in all_columns if ele not in label_columns
        ]
        train_data = train_data[all_columns]
        # Get tuning data
        if tuning_data is not None:
            if isinstance(tuning_data, str):
                tuning_data = load_pd.load(tuning_data)
            if not isinstance(tuning_data, pd.DataFrame):
                raise AssertionError(
                    f'tuning_data is required to be a pandas DataFrame, but was instead: {type(tuning_data)}'
                )
            tuning_data = tuning_data[all_columns]
        else:
            if holdout_frac is None:
                num_trials = hyperparameters['tune_kwargs']['num_trials']
                if num_trials == 1:
                    holdout_frac = default_holdout_frac(len(train_data), False)
                else:
                    # For HPO, we will need to use a larger held-out ratio
                    holdout_frac = default_holdout_frac(len(train_data), True)
            train_data, tuning_data = train_test_split(
                train_data,
                test_size=holdout_frac,
                random_state=np.random.RandomState(seed))
        column_types, problem_type = infer_column_problem_types(
            train_data,
            tuning_data,
            label_columns=label_columns,
            problem_type=self._problem_type,
            provided_column_types=column_types)
        self._eval_metric, log_metrics = infer_eval_log_metrics(
            problem_type=problem_type, eval_metric=self._eval_metric)
        has_text_column = False
        for k, v in column_types.items():
            if v == _C.TEXT:
                has_text_column = True
                break
        if not has_text_column:
            raise AssertionError(
                'No Text Column is found! This is currently not supported by '
                'the TextPredictor. You may try to use '
                'autogluon.tabular.TabularPredictor.\n'
                'The inferred column properties of the training data is {}'.
                format(column_types))
        logger.info('Problem Type="{}"'.format(problem_type))
        logger.info(printable_column_type_string(column_types))
        self._problem_type = problem_type
        if 'models' not in hyperparameters or 'MultimodalTextModel' not in hyperparameters[
                'models']:
            raise ValueError(
                'The current TextPredictor only supports "MultimodalTextModel" '
                'and you must ensure that '
                'hyperparameters["models"]["MultimodalTextModel"] can be accessed.'
            )
        model_hparams = hyperparameters['models']['MultimodalTextModel']
        self._backend = model_hparams['backend']
        if plot_results is None:
            plot_results = in_ipynb()
        if self._backend == 'gluonnlp_v0':
            import warnings
            warnings.filterwarnings('ignore', module='mxnet')
            from ..mx.models import MultiModalTextModel
            self._model = MultiModalTextModel(column_types=column_types,
                                              feature_columns=feature_columns,
                                              label_columns=label_columns,
                                              problem_type=self._problem_type,
                                              eval_metric=self._eval_metric,
                                              log_metrics=log_metrics,
                                              output_directory=self._path)
            self._model.train(train_data=train_data,
                              tuning_data=tuning_data,
                              num_cpus=num_cpus,
                              num_gpus=num_gpus,
                              search_space=model_hparams['search_space'],
                              tune_kwargs=hyperparameters['tune_kwargs'],
                              time_limit=time_limit,
                              seed=seed,
                              plot_results=plot_results,
                              verbosity=verbosity)
        else:
            raise NotImplementedError(
                "Currently, we only support using "
                "the autogluon-contrib-nlp and MXNet "
                "as the backend of AutoGluon-Text. In the future, "
                "we will support other models.")
        logger.info(f'Training completed. Auto-saving to "{self.path}". '
                    f'For loading the model, you can use'
                    f' `predictor = TextPredictor.load("{self.path}")`')
        self.save(self.path)
        return self
    def fit(self,
            train_data,
            tuning_data=None,
            time_limit=None,
            presets=None,
            hyperparameters=None,
            feature_columns=None,
            column_types=None,
            num_cpus=None,
            num_gpus=None,
            num_trials=None,
            seed=None):
        """Fit the predictor

        Parameters
        ----------
        train_data
            The training data. Anything that is not a pandas DataFrame is
            handed to `load_pd.load` to obtain one.
        tuning_data
            The tuning data (loaded the same way as `train_data`). If None,
            part of `train_data` is held out as tuning data; the ratio is
            `hyperparameters['misc']['holdout_frac']` when that is set and is
            chosen by `default_holdout_frac` otherwise.
        time_limit
            The time limits. Forwarded to the model's `train` call.
        presets
            The user can specify the presets of the hyper-parameters.
            If None, the 'default' preset is used.
        hyperparameters
            The hyper-parameters. They are merged with the preset values
            via `merge_params`.
        feature_columns
            Specify which columns in the data are used as features. If None,
            every column of `train_data` that is not a label column is used.
            A single string is treated as one column name.
        column_types
            The provided type of the columns
        num_cpus
            The number of CPUs to use for each trial
        num_gpus
            The number of GPUs to use for each trial
        num_trials
            The number of trials. By default, we will use the provided number of trials in the
            hyperparameters or presets. This will overwrite the provided value.
        seed
            The seed of the experiment. If not None, it overwrites
            `hyperparameters['seed']`. The resulting seed drives the
            train/tuning split and is forwarded to the model's `train` call.

        Returns
        -------
        self

        Raises
        ------
        AssertionError
            If `fit` has already been called, if a feature column overlaps
            with a label column or is missing from `train_data`, or if no
            text column is found among the inferred column types.
        NotImplementedError
            If the configured backend is not 'gluonnlp_v0'.
        """
        assert self._fit_called is False
        if presets is not None:
            preset_hparams = ag_text_presets.create(presets)
        else:
            preset_hparams = ag_text_presets.create('default')
        hyperparameters = merge_params(preset_hparams, hyperparameters)
        # An explicitly passed seed / num_trials takes priority over the
        # values coming from the presets or the user hyperparameters.
        if seed is not None:
            hyperparameters['seed'] = seed
        seed = hyperparameters['seed']
        if num_trials is not None:
            hyperparameters['hpo_params']['num_trials'] = num_trials
        if isinstance(self._label, str):
            label_columns = [self._label]
        else:
            label_columns = list(self._label)
        # Get the training and tuning data as pandas dataframe
        if not isinstance(train_data, pd.DataFrame):
            train_data = load_pd.load(train_data)
        if feature_columns is None:
            all_columns = list(train_data.columns)
            feature_columns = [
                ele for ele in all_columns if ele not in label_columns
            ]
        else:
            if isinstance(feature_columns, str):
                feature_columns = [feature_columns]
            # Raise explicitly instead of using `assert` so that the
            # validation is still performed when Python runs with -O.
            for col in feature_columns:
                if col in label_columns:
                    raise AssertionError(
                        'Feature columns and label columns cannot overlap.')
                if col not in train_data.columns:
                    raise AssertionError(
                        'Feature columns must be in the pandas dataframe! Received col = "{}", '
                        'all columns = "{}"'.format(col, train_data.columns))
            all_columns = feature_columns + label_columns
        train_data = train_data[all_columns]
        # Get tuning data
        if tuning_data is not None:
            if not isinstance(tuning_data, pd.DataFrame):
                tuning_data = load_pd.load(tuning_data)
            tuning_data = tuning_data[all_columns]
        else:
            holdout_frac = hyperparameters['misc']['holdout_frac']
            if holdout_frac is None:
                # For HPO (more than one trial), we will need to use a
                # larger held-out ratio
                use_hpo = hyperparameters['hpo_params']['num_trials'] != 1
                holdout_frac = default_holdout_frac(len(train_data), use_hpo)
            train_data, tuning_data = train_test_split(
                train_data,
                test_size=holdout_frac,
                random_state=np.random.RandomState(seed))
        column_types, problem_type = infer_column_problem_types(
            train_data,
            tuning_data,
            label_columns=label_columns,
            problem_type=self._problem_type,
            provided_column_types=column_types)
        self._eval_metric, log_metrics = infer_eval_log_metrics(
            problem_type=problem_type, eval_metric=self._eval_metric)
        has_text_column = any(
            col_type == _C.TEXT for col_type in column_types.values())
        if not has_text_column:
            # Report the inferred column types (not the raw data frame) so
            # the message matches what it claims to show.
            raise AssertionError(
                'No Text Column is found! This is currently not supported by '
                'the TextPrediction task. You may try to use '
                'autogluon.tabular.TabularPredictor.\n'
                'The inferred column properties of the training data is {}'.
                format(column_types))
        logger.log(25, 'Problem Type="{}"'.format(problem_type))
        logger.log(25, printable_column_type_string(column_types))
        self._problem_type = problem_type
        model_hparams = hyperparameters['models']['MultimodalTextModel']
        self._backend = model_hparams['backend']
        if self._backend == 'gluonnlp_v0':
            # Imported lazily, only when this backend is selected
            from ..mx.models import MultiModalTextModel
            self._model = MultiModalTextModel(column_types=column_types,
                                              feature_columns=feature_columns,
                                              label_columns=label_columns,
                                              problem_type=self._problem_type,
                                              eval_metric=self._eval_metric,
                                              log_metrics=log_metrics,
                                              output_directory=self._path)
            self._model.train(train_data=train_data,
                              tuning_data=tuning_data,
                              num_cpus=num_cpus,
                              num_gpus=num_gpus,
                              search_space=model_hparams['search_space'],
                              hpo_params=hyperparameters['hpo_params'],
                              time_limit=time_limit,
                              seed=seed,
                              verbosity=self.verbosity)
        else:
            raise NotImplementedError(
                "Currently, we only support using "
                "the autogluon-contrib-nlp and MXNet "
                "as the backend of AutoGluon-Text. In the future, "
                "we will support other models.")
        return self
    def _fit(self,
             X_train: pd.DataFrame,
             y_train: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             **kwargs):
        """Train the wrapped text model on the given features and labels.

        Parameters
        ----------
        X_train
            Features of the training dataset
        y_train
            Labels of the training dataset
        X_val
            Features of the validation dataset. If None, a validation
            split is taken out of the training data.
        y_val
            Labels of the validation dataset
        time_limit
            The time limits for the fit function
        kwargs
            Other keyword arguments. The keys read here are `verbosity`
            (default 2), `num_cpus` and `num_gpus` (both default None).

        Raises
        ------
        ImportError
            If mxnet or the required autogluon.text modules cannot be imported.
        NoGPUError
            If the recommended resource reports zero GPUs.
        """
        try:
            import mxnet as mx
            from autogluon.text.text_prediction.dataset import TabularDataset, random_split_train_val
            from autogluon.text.text_prediction.text_prediction import get_recommended_resource
        except ImportError:
            raise ImportError(AG_TEXT_IMPORT_ERROR)

        # Options forwarded by the caller through kwargs
        log_level = kwargs.get('verbosity', 2)
        cpu_count = kwargs.get('num_cpus')
        gpu_count = kwargs.get('num_gpus')

        # Work out the per-trial resources; training needs at least one GPU
        resource = get_recommended_resource(nthreads_per_trial=cpu_count,
                                            ngpus_per_trial=gpu_count)
        if resource['num_gpus'] == 0:
            raise NoGPUError(
                f'\tNo GPUs available to train {self.name}. Resources: {resource}'
            )

        # Seed every random number generator used here, when a seed is given
        rng_seed = self.params.get('seed')
        if rng_seed is not None:
            for seed_fn in (random.seed, np.random.seed, mx.random.seed):
                seed_fn(rng_seed)

        X_train = self.preprocess(X_train, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        col_props = self._build_model(X_train=X_train,
                                      y_train=y_train,
                                      X_val=X_val,
                                      y_val=y_val,
                                      hyperparameters=self.params)

        # Append the labels as the last column of each feature frame
        label_name = self._label_column_name
        X_train.insert(len(X_train.columns), label_name, y_train)
        if X_val is not None:
            X_val.insert(len(X_val.columns), label_name, y_val)

        hpo_cfg = self.params['hpo_params']
        sched_opts = hpo_cfg['scheduler_options']
        strategy = hpo_cfg['search_strategy']
        if sched_opts is None:
            sched_opts = {}
        if strategy.endswith('hyperband'):
            # Specific defaults for hyperband scheduling; values that are
            # already present are left untouched.
            hyperband_defaults = (('reduction_factor', 4),
                                  ('grace_period', 10),
                                  ('max_t', 50))
            for opt_name, opt_default in hyperband_defaults:
                sched_opts.setdefault(opt_name, opt_default)

        if X_val is None:
            # FIXME: v0.1 Update TextPrediction to use all training data in refit_full
            X_train, X_val = random_split_train_val(
                X_train,
                valid_ratio=default_holdout_frac(len(X_train), True))

        # Wrap both frames as datasets, logging each one right after it is built
        datasets = []
        for title, frame in (('Train Dataset:', X_train),
                             ('Tuning Dataset:', X_val)):
            dataset = TabularDataset(frame,
                                     column_properties=col_props,
                                     label_columns=label_name)
            logger.info(title)
            logger.info(dataset)
            datasets.append(dataset)
        train_ds, tuning_ds = datasets

        self.model.train(
            train_data=train_ds,
            tuning_data=tuning_ds,
            resource=resource,
            time_limits=time_limit,
            search_strategy=strategy,
            search_options=hpo_cfg['search_options'],
            scheduler_options=sched_opts,
            num_trials=hpo_cfg['num_trials'],
            console_log=log_level >= 2,
            ignore_warning=log_level < 2)