def _fit(self,
         X: pd.DataFrame,
         y: pd.Series,
         X_val: Optional[pd.DataFrame] = None,
         y_val: Optional[pd.Series] = None,
         time_limit: Optional[int] = None,
         sample_weight=None,
         **kwargs):
    """The internal fit function

    Parameters
    ----------
    X
        Features of the training dataset
    y
        Labels of the training dataset
    X_val
        Features of the validation dataset. If None, a holdout split is carved out of
        the training data.
    y_val
        Labels of the validation dataset
    time_limit
        The time limits for the fit function. The time already spent inside this
        method before training starts is subtracted before the limit is handed to
        the underlying model.
    sample_weight
        Not supported yet: if given, a message is logged and the weights are ignored.
    kwargs
        Other keyword arguments. `verbosity` (default 2), `num_cpus` and `num_gpus`
        (both default None) are read from here.

    Raises
    ------
    ImportError
        If MXNet or the AutoGluon text modules cannot be imported.
    NoGPUError
        If the recommended resources contain no GPU.
    NoValidFeatures
        If the feature metadata lists no feature with raw type 'object'.
    """
    try:
        import mxnet as mx
        from autogluon.text.text_prediction.dataset import TabularDataset, random_split_train_val
        from autogluon.text.text_prediction.text_prediction import get_recommended_resource
    except ImportError as err:
        # Keep the original failure attached so the missing package stays visible.
        raise ImportError(AG_TEXT_IMPORT_ERROR) from err

    time_start = time.time()

    # Get arguments from kwargs
    verbosity = kwargs.get('verbosity', 2)
    num_cpus = kwargs.get('num_cpus', None)
    num_gpus = kwargs.get('num_gpus', None)
    if sample_weight is not None:  # TODO: support
        logger.log(
            15,
            "sample_weight not yet supported for TextPredictionV1Model, this model will ignore them in training."
        )

    # Infer resource
    resource = get_recommended_resource(nthreads_per_trial=num_cpus,
                                        ngpus_per_trial=num_gpus)
    if resource['num_gpus'] == 0:
        raise NoGPUError(
            f'\tNo GPUs available to train {self.name}. Resources: {resource}'
        )

    # Set seed
    seed = self.params.get('seed')
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        mx.random.seed(seed)

    X = self.preprocess(X, fit=True)
    if X_val is not None:
        X_val = self.preprocess(X_val)

    if not self.feature_metadata.get_features(valid_raw_types=['object']):
        raise NoValidFeatures(f'No text features to train {self.name}.')

    column_properties = self._build_model(X=X,
                                          y=y,
                                          X_val=X_val,
                                          y_val=y_val,
                                          hyperparameters=self.params)

    # Insert the label column
    X.insert(len(X.columns), self._label_column_name, y)
    if X_val is not None:
        X_val.insert(len(X_val.columns), self._label_column_name, y_val)

    hpo_params = self.params['hpo_params']
    search_strategy = hpo_params['search_strategy']
    # Work on a copy: the hyperband defaults filled in below must not be written
    # back into the dict stored inside self.params.
    configured_options = hpo_params['scheduler_options']
    scheduler_options = dict(configured_options) if configured_options is not None else dict()
    if search_strategy.endswith('hyperband'):
        # Specific defaults for hyperband scheduling
        scheduler_options.setdefault('reduction_factor', 4)
        scheduler_options.setdefault('grace_period', 10)
        scheduler_options.setdefault('max_t', 50)

    if X_val is None:
        # FIXME: v0.1 Update TextPrediction to use all training data in refit_full
        holdout_frac = default_holdout_frac(len(X), True)
        X, X_val = random_split_train_val(X, valid_ratio=holdout_frac)

    train_data = TabularDataset(X,
                                column_properties=column_properties,
                                label_columns=self._label_column_name)
    logger.log(15, 'Train Dataset:')
    logger.log(15, train_data)
    tuning_data = TabularDataset(X_val,
                                 column_properties=column_properties,
                                 label_columns=self._label_column_name)
    logger.log(15, 'Tuning Dataset:')
    logger.log(15, tuning_data)

    if time_limit is not None:
        # Charge the setup work done above against the caller's budget.
        time_limit = time_limit - (time.time() - time_start)

    # FIXME: Inner error message if no text features is not helpful
    self.model.train(
        train_data=train_data,
        tuning_data=tuning_data,
        resource=resource,
        time_limits=time_limit,
        search_strategy=search_strategy,
        search_options=hpo_params['search_options'],
        scheduler_options=scheduler_options,
        num_trials=hpo_params['num_trials'],
        console_log=verbosity >= 3,
        ignore_warning=verbosity < 3,
        verbosity=verbosity - 1)
def fit(cls,
        train_data,
        label,
        tuning_data=None,
        time_limits=None,
        output_directory='./ag_text',
        feature_columns=None,
        holdout_frac=None,
        eval_metric=None,
        stopping_metric=None,
        nthreads_per_trial=None,
        ngpus_per_trial=None,
        dist_ip_addrs=None,
        num_trials=None,
        search_strategy=None,
        search_options=None,
        scheduler_options=None,
        hyperparameters=None,
        plot_results=None,
        seed=None,
        verbosity=2):
    """Fit models to make predictions based on text inputs.

    Parameters
    ----------
    train_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`
        Training dataset where rows = individual training examples, columns = features.
        Anything that is not a `pandas.DataFrame` is passed to `load_pd.load`.
    label : str, int, or list of str/int
        Name of the label column. An integer is interpreted as a position in
        `train_data.columns`. A list selects several label columns; the evaluation
        metrics are inferred from the first one.
    tuning_data : :class:`autogluon.task.tabular_prediction.TabularDataset` or `pandas.DataFrame`, default = None
        Another dataset containing validation data reserved for hyperparameter tuning (in same format as training data).
        If `tuning_data = None`, `fit()` will automatically hold out random examples from `train_data` for validation.
    time_limits : int or str, default = None
        Approximately how long `fit()` should run for (wallclock time in seconds if int).
        String values may instead be used to specify time in different units such as: '1min' or '1hour'
        (only the 'min' and 'hour' suffixes are parsed; other strings raise ValueError).
        Longer `time_limits` will usually improve predictive accuracy.
        If not specified, the value in `hyperparameters['hpo_params']['time_limits']` is used.
    output_directory : str, default = './ag_text'
        Path to directory where models and intermediate outputs should be saved.
    feature_columns : str or List[str], default = None
        Which columns of table to consider as predictive features (other columns will be ignored, except for label-column).
        If None (by default), all columns of table are considered predictive features.
    holdout_frac : float, default = None
        Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
        If None, default value is selected based on the number of training examples.
    eval_metric : str, default = None
        The evaluation metric that will be used to evaluate the model's predictive performance.
        If None, an appropriate default metric will be selected (accuracy for classification, mean-squared-error for regression).
        Options for classification include: 'acc' (accuracy), 'nll' (negative log-likelihood).
        Additional options for binary classification include: 'f1' (F1 score), 'mcc' (Matthews coefficient), 'auc' (area under ROC curve).
        Options for regression include: 'mse' (mean squared error), 'rmse' (root mean squared error), 'mae' (mean absolute error).
    stopping_metric, default = None
        Metric which iteratively-trained models use to early stop to avoid overfitting.
        Defaults to `eval_metric` value (if None).
        Options are identical to options for `eval_metric`.
    nthreads_per_trial, default = None
        The number of threads per individual model training run. By default, all available CPUs are used.
    ngpus_per_trial, default = None
        The number of GPUs to use per individual model training run. If unspecified, a default value is chosen based on total number of GPUs available.
    dist_ip_addrs, default = None
        List of IP addresses corresponding to remote workers, in order to leverage distributed computation.
        Must currently be None: remote training is rejected.
    num_trials : int, default = None
        The number of trials in the HPO search. Defaults to `hyperparameters['hpo_params']['num_trials']`.
    search_strategy : str, default = None
        Which hyperparameter search algorithm to use. Options include:
        'random' (random search), 'bayesopt' (Gaussian process Bayesian optimization),
        'skopt' (SKopt Bayesian optimization), 'grid' (grid search),
        'hyperband' (Hyperband scheduling with random search), 'bayesopt-hyperband'
        (Hyperband scheduling with GP-BO search).
        If unspecified, the value in `hyperparameters['hpo_params']['search_strategy']` is used.
    search_options : dict, default = None
        Options passed to searcher. If None, `hyperparameters['hpo_params']` is consulted
        for a 'search_options' entry.
    scheduler_options : dict, default = None
        Additional kwargs passed to scheduler __init__. The dict passed in is copied, not modified.
    hyperparameters : dict or str, default = None
        Determines the hyperparameters used by the models. Each hyperparameter may be either fixed value or search space of many values.
        A str is treated as the name of a registered configuration; a dict is merged on top of the 'default' configuration.
        For example of default hyperparameters, see: `autogluon.task.text_prediction.text_prediction.default()`
    plot_results : bool, default = None
        Whether or not to plot intermediate training results during `fit()`.
        If None, plotting is enabled only when `in_ipynb()` is true.
    seed : int, default = None
        Seed value for random state used inside `fit()`.
    verbosity : int, default = 2
        Verbosity levels range from 0 to 4 and control how much information is printed
        during fit(). Values outside that range are clamped.
        Higher levels correspond to more detailed print statements
        (you can set verbosity = 0 to suppress warnings).
        If using logging, you can alternatively control amount of information printed
        via `logger.setLevel(L)`,
        where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print
        statements, opposite of verbosity levels)

    Returns
    -------
    model
        A `BertForTextPredictionBasic` object that can be used for making predictions on new data.

    Raises
    ------
    AssertionError
        If `dist_ip_addrs` is given, if feature and label columns overlap, if a feature
        column is missing from `train_data`, or if more than one model is configured.
    ImportError
        If the installed MXNet version is outside [1.7.0, 2.0.0).
    NotImplementedError
        If no text column is found.
    ValueError
        If a configured model type is unsupported or a `time_limits` string cannot be parsed.
    """
    assert dist_ip_addrs is None, 'Training on remote machine is currently not supported.'
    # Version check of MXNet
    mx_version = version.parse(mxnet.__version__)
    if not version.parse('1.7.0') <= mx_version < version.parse('2.0.0'):
        raise ImportError(
            'You will need to ensure that you have mxnet>=1.7.0, <2.0.0. '
            'For more information about how to install mxnet, you can refer to '
            'https://sxjscience.github.io/KDD2020/ .')

    if verbosity < 0:
        verbosity = 0
    elif verbosity > 4:
        verbosity = 4
    console_log = verbosity >= 2
    logging_config(folder=output_directory,
                   name='ag_text_prediction',
                   logger=logger,
                   level=verbosity2loglevel(verbosity),
                   console=console_log)

    # Parse the hyper-parameters
    if hyperparameters is None:
        hyperparameters = ag_text_prediction_params.create('default')
    elif isinstance(hyperparameters, str):
        hyperparameters = ag_text_prediction_params.create(hyperparameters)
    else:
        base_params = ag_text_prediction_params.create('default')
        hyperparameters = merge_params(base_params, hyperparameters)
    np.random.seed(seed)

    if not isinstance(train_data, pd.DataFrame):
        train_data = load_pd.load(train_data)

    # Inference the label: integer entries are positions in the column index.
    if not isinstance(label, list):
        label = [label]
    label_columns = [
        train_data.columns[ele] if isinstance(ele, int) else ele
        for ele in label
    ]

    if feature_columns is None:
        all_columns = list(train_data.columns)
        feature_columns = [
            ele for ele in all_columns if ele not in label_columns
        ]
    else:
        if isinstance(feature_columns, str):
            feature_columns = [feature_columns]
        for col in feature_columns:
            assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
            assert col in train_data.columns,\
                'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                'all columns = "{}"'.format(col, train_data.columns)
        all_columns = feature_columns + label_columns
        # Keep the selected columns in the order in which they appear in the table.
        all_columns = [
            ele for ele in train_data.columns if ele in all_columns
        ]

    if tuning_data is None:
        if holdout_frac is None:
            holdout_frac = default_holdout_frac(len(train_data), True)
        train_data, tuning_data = random_split_train_val(
            train_data, valid_ratio=holdout_frac)
    else:
        if not isinstance(tuning_data, pd.DataFrame):
            tuning_data = load_pd.load(tuning_data)
    train_data = train_data[all_columns]
    tuning_data = tuning_data[all_columns]

    column_properties = get_column_properties(
        pd.concat([train_data, tuning_data]),
        metadata=None,
        label_columns=label_columns,
        provided_column_properties=None,
        categorical_default_handle_missing_value=True)
    train_data = TabularDataset(train_data,
                                column_properties=column_properties,
                                label_columns=label_columns)
    tuning_data = TabularDataset(
        tuning_data,
        column_properties=train_data.column_properties,
        label_columns=label_columns)
    logger.info('Train Dataset:')
    logger.info(train_data)
    logger.info('Tuning Dataset:')
    logger.info(tuning_data)
    logger.debug('Hyperparameters:')
    logger.debug(hyperparameters)

    has_text_column = any(
        props.type == _C.TEXT for props in column_properties.values())
    if not has_text_column:
        raise NotImplementedError('No Text Column is found! This is currently not supported by '
                                  'the TextPrediction task. You may try to use '
                                  'TabularPrediction.fit().\n'
                                  'The inferred column properties of the training data is {}'
                                  .format(train_data))

    problem_types = []
    label_shapes = []
    for label_col_name in label_columns:
        problem_type, label_shape = infer_problem_type(
            column_properties=column_properties,
            label_col_name=label_col_name)
        problem_types.append(problem_type)
        label_shapes.append(label_shape)
    # Use the logger configured by logging_config above (not the root `logging`
    # module) so these messages honour the requested verbosity and log file.
    logger.info(
        'Label columns={}, Feature columns={}, Problem types={}, Label shapes={}'
        .format(label_columns, feature_columns, problem_types, label_shapes))
    eval_metric, stopping_metric, log_metrics =\
        infer_eval_stop_log_metrics(problem_types[0],
                                    label_shapes[0],
                                    eval_metric=eval_metric,
                                    stopping_metric=stopping_metric)
    logger.info('Eval Metric={}, Stop Metric={}, Log Metrics={}'.format(
        eval_metric, stopping_metric, log_metrics))

    model_candidates = []
    for model_type, kwargs in hyperparameters['models'].items():
        search_space = kwargs['search_space']
        if model_type == 'BertForTextPredictionBasic':
            model = BertForTextPredictionBasic(
                column_properties=column_properties,
                label_columns=label_columns,
                feature_columns=feature_columns,
                label_shapes=label_shapes,
                problem_types=problem_types,
                stopping_metric=stopping_metric,
                log_metrics=log_metrics,
                base_config=None,
                search_space=search_space,
                output_directory=output_directory,
                logger=logger)
            model_candidates.append(model)
        else:
            raise ValueError(
                'model_type = "{}" is not supported. You can try to use '
                'model_type = "BertForTextPredictionBasic"'.format(
                    model_type))
    assert len(
        model_candidates) == 1, 'Only one model is supported currently'

    recommended_resource = get_recommended_resource(
        nthreads_per_trial=nthreads_per_trial,
        ngpus_per_trial=ngpus_per_trial)

    # Every HPO argument left unspecified falls back to the hyperparameter config.
    hpo_params = hyperparameters['hpo_params']
    if search_strategy is None:
        search_strategy = hpo_params['search_strategy']
    if search_options is None:
        search_options = hpo_params.get('search_options')
    if time_limits is None:
        time_limits = hpo_params['time_limits']
    elif isinstance(time_limits, str):
        if time_limits.endswith('min'):
            time_limits = int(float(time_limits[:-3]) * 60)
        elif time_limits.endswith('hour'):
            time_limits = int(float(time_limits[:-4]) * 60 * 60)
        else:
            raise ValueError(
                'The given time_limits="{}" cannot be parsed!'.format(
                    time_limits))
    if num_trials is None:
        num_trials = hpo_params['num_trials']
    if scheduler_options is None:
        scheduler_options = hpo_params['scheduler_options']
    # Copy so the hyperband defaults below never leak into the caller's dict
    # or into the hyperparameter configuration.
    scheduler_options = dict(scheduler_options) if scheduler_options is not None else dict()
    if search_strategy.endswith('hyperband'):
        # Specific defaults for hyperband scheduling
        scheduler_options.setdefault('reduction_factor', 4)
        scheduler_options.setdefault('grace_period', 10)
        scheduler_options.setdefault('max_t', 50)

    if recommended_resource['num_gpus'] == 0:
        warnings.warn(
            'Recommend to use GPU to run the TextPrediction task!')

    model = model_candidates[0]
    if plot_results is None:
        plot_results = bool(in_ipynb())
    model.train(train_data=train_data,
                tuning_data=tuning_data,
                resource=recommended_resource,
                time_limits=time_limits,
                search_strategy=search_strategy,
                search_options=search_options,
                scheduler_options=scheduler_options,
                num_trials=num_trials,
                plot_results=plot_results,
                console_log=verbosity > 2,
                ignore_warning=verbosity <= 2)
    return model
def fit(self,
        train_data,
        tuning_data=None,
        time_limit=None,
        presets=None,
        hyperparameters=None,
        feature_metadata=None,
        **kwargs):
    """
    Fit models to predict a column of data table based on the other columns.

    Parameters
    ----------
    train_data
        Training data; validated together with `tuning_data` and the
        `unlabeled_data` kwarg by `self._validate_fit_data`.
    tuning_data
        Optional validation data, forwarded to the learner as `X_val`.
    time_limit
        Time budget forwarded to stack-argument sanitization, scheduler setup
        and the learner.
    presets
        Not referenced in this method body (presumably consumed before this
        code runs - TODO confirm).
    hyperparameters
        A dict, or the name of a configuration resolved through
        `get_hyperparameter_config`. None selects 'default'.
    feature_metadata
        Passed to `self._set_feature_generator` together with the
        `feature_generator` kwarg.
    kwargs
        Additional options; validated by `self._validate_fit_kwargs`.

    Returns
    -------
    self

    Raises
    ------
    AssertionError
        If the learner reports that it has already been fit.
    """
    # TODO: Move documentation from TabularPrediction.fit to here
    # TODO: Move num_cpu/num_gpu to AG_args_fit
    # TODO: AG_args -> ag_args? +1 -> Will change after replacing original TabularPredictor to avoid extra API breaks.
    # TODO: consider adding kwarg option for data which has already been preprocessed by feature generator to skip feature generation.
    if self._learner.is_fit:
        raise AssertionError(
            'Predictor is already fit! To fit additional models, refer to `predictor.fit_extra`.'
        )

    user_kwargs = kwargs.copy()
    kwargs = self._validate_fit_kwargs(kwargs)

    verbosity = kwargs.get('verbosity', self.verbosity)
    set_logger_verbosity(verbosity, logger=logger)

    if verbosity >= 3:
        logger.log(20, '============ fit kwarg info ============')
        logger.log(20, 'User Specified kwargs:')
        logger.log(20, pprint.pformat(user_kwargs))
        logger.log(20, 'Full kwargs:')
        logger.log(20, pprint.pformat(kwargs))
        logger.log(20, '========================================')

    # Pull every validated option up front so a missing key fails before any work is done.
    holdout_frac = kwargs['holdout_frac']
    num_bag_folds = kwargs['num_bag_folds']
    num_bag_sets = kwargs['num_bag_sets']
    num_stack_levels = kwargs['num_stack_levels']
    auto_stack = kwargs['auto_stack']
    hyperparameter_tune_kwargs = kwargs['hyperparameter_tune_kwargs']
    num_cpus = kwargs['num_cpus']
    num_gpus = kwargs['num_gpus']
    feature_generator = kwargs['feature_generator']
    unlabeled_data = kwargs['unlabeled_data']
    save_bagged_folds = kwargs['save_bagged_folds']
    ag_args = kwargs['AG_args']
    ag_args_fit = kwargs['AG_args_fit']
    ag_args_ensemble = kwargs['AG_args_ensemble']
    excluded_model_types = kwargs['excluded_model_types']

    self._set_feature_generator(feature_generator=feature_generator,
                                feature_metadata=feature_metadata)
    train_data, tuning_data, unlabeled_data = self._validate_fit_data(
        train_data=train_data,
        tuning_data=tuning_data,
        unlabeled_data=unlabeled_data)

    if hyperparameters is None:
        hyperparameters = 'default'
    if isinstance(hyperparameters, str):
        hyperparameters = get_hyperparameter_config(hyperparameters)

    # Process kwargs to create trainer, schedulers, searchers:
    num_bag_folds, num_bag_sets, num_stack_levels = self._sanitize_stack_args(
        num_bag_folds=num_bag_folds,
        num_bag_sets=num_bag_sets,
        num_stack_levels=num_stack_levels,
        time_limit=time_limit,
        auto_stack=auto_stack,
        num_train_rows=len(train_data),
    )

    scheduler_options = None
    if hyperparameter_tune_kwargs is not None:
        scheduler_options = self._init_scheduler(
            hyperparameter_tune_kwargs, time_limit, hyperparameters,
            num_cpus, num_gpus, num_bag_folds, num_stack_levels)

    hyperparameter_tune = scheduler_options is not None
    if hyperparameter_tune:
        logger.log(
            30,
            'Warning: hyperparameter tuning is currently experimental and may cause the process to hang. Setting `auto_stack=True` instead is recommended to achieve maximum quality models.'
        )

    if holdout_frac is None:
        holdout_frac = default_holdout_frac(len(train_data),
                                            hyperparameter_tune)

    if ag_args_fit is None:
        ag_args_fit = dict()
    # TODO: v0.1: Update to be 'auto' or None by default to give full control to individual models.
    # Explicit entries in ag_args_fit win over the top-level resource kwargs.
    for resource_key, resource_value in (('num_cpus', num_cpus),
                                         ('num_gpus', num_gpus)):
        if resource_value is not None and resource_key not in ag_args_fit:
            ag_args_fit[resource_key] = resource_value

    # TODO: v0.1: make core_kwargs a kwargs argument to predictor.fit, add aux_kwargs to predictor.fit
    core_kwargs = dict(
        ag_args=ag_args,
        ag_args_ensemble=ag_args_ensemble,
        ag_args_fit=ag_args_fit,
        excluded_model_types=excluded_model_types,
    )

    self._learner.fit(X=train_data,
                      X_val=tuning_data,
                      X_unlabeled=unlabeled_data,
                      hyperparameter_tune_kwargs=scheduler_options,
                      holdout_frac=holdout_frac,
                      num_bagging_folds=num_bag_folds,
                      num_bagging_sets=num_bag_sets,
                      stack_ensemble_levels=num_stack_levels,
                      hyperparameters=hyperparameters,
                      core_kwargs=core_kwargs,
                      time_limit=time_limit,
                      save_bagged_folds=save_bagged_folds,
                      verbosity=verbosity)
    self._set_post_fit_vars()

    self._post_fit(
        keep_only_best=kwargs['keep_only_best'],
        refit_full=kwargs['refit_full'],
        set_best_to_refit_full=kwargs['set_best_to_refit_full'],
        save_space=kwargs['save_space'],
    )
    self.save()
    return self
def fit(self,
        train_data,
        tuning_data=None,
        time_limit=None,
        presets=None,
        hyperparameters=None,
        column_types=None,
        num_cpus=None,
        num_gpus=None,
        num_trials=None,
        plot_results=None,
        holdout_frac=None,
        seed=0):
    """
    Fit Transformer models to predict label column of a data table based on the other columns (which may contain text or numeric/categorical features).

    Parameters
    ----------
    train_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`
        Table of the training data, which is similar to a pandas DataFrame.
        If str is passed, `train_data` will be loaded using the str value as the file path.
    tuning_data : str or :class:`TabularDataset` or :class:`pd.DataFrame`, default = None
        Another dataset containing validation data reserved for tuning processes such as early stopping and hyperparameter tuning.
        This dataset should be in the same format as `train_data`.
        If str is passed, `tuning_data` will be loaded using the str value as the file path.
        Note: final model returned may be fit on `tuning_data` as well as `train_data`. Do not provide your evaluation test data here!
        If `tuning_data = None`, `fit()` will automatically hold out some random validation examples from `train_data`.
    time_limit : int, default = None
        Approximately how long `fit()` should run for (wallclock time in seconds).
        If not specified, `fit()` will run until the model has completed training.
    presets : str, default = None
        Presets are pre-registered configurations that control training (hyperparameters and other aspects).
        It is recommended to specify presets and avoid specifying most other `fit()` arguments or model hyperparameters prior to becoming familiar with AutoGluon.
        Print all available presets via `autogluon.text.list_presets()`.
        Some notable presets include:
            - "best_quality": produce the most accurate overall predictor (regardless of its efficiency).
            - "medium_quality_faster_train": produce an accurate predictor but take efficiency into account (this is the default preset).
            - "lower_quality_fast_train": produce a predictor that is quick to train and make predictions with, even if its accuracy is worse.
    hyperparameters : dict, default = None
        The hyperparameters of the `fit()` function, which affect the resulting accuracy of the trained predictor.
        Experienced AutoGluon users can use this argument to specify neural network hyperparameter values/search-spaces as well as which hyperparameter-tuning strategy should be employed.
        See the "Text Prediction" tutorials for examples.
    column_types : dict, default = None
        The type of data in each table column can be specified via a dictionary that maps the column name to its data type.
        For example: `column_types = {"item_name": "text", "brand": "text", "product_description": "text", "height": "numerical"}` may be used for a table with columns: "item_name", "brand", "product_description", and "height".
        If None, column_types will be automatically inferred from the data.
        The current supported types are:
            - "text": each row in this column contains text (sentence, paragraph, etc.).
            - "numerical": each row in this column contains a number.
            - "categorical": each row in this column belongs to one of K categories.
    num_cpus : int, default = None
        The number of CPUs to use for each training run (i.e. one hyperparameter-tuning trial).
    num_gpus : int, default = None
        The number of GPUs to use for each training run (i.e. one hyperparameter-tuning trial).
        We recommend at least 1 GPU for TextPredictor as its neural network models are computationally intensive.
    num_trials : int, default = None
        If hyperparameter-tuning is used, specifies how many HPO trials should be run (assuming `time_limit` has not been exceeded).
        By default, this is the provided number of trials in the `hyperparameters` or `presets`.
        If specified here, this value will overwrite the value in `hyperparameters['tune_kwargs']['num_trials']`.
    plot_results : bool, default = None
        Whether to plot intermediate results from training. If None, will be decided based on the environment in which `fit()` is run.
    holdout_frac : float, default = None
        Fraction of train_data to holdout as tuning data for optimizing hyperparameters (ignored unless `tuning_data = None`).
        Default value (if None) is selected based on the number of rows in the training data and whether hyperparameter-tuning is utilized.
    seed : int, default = 0
        The random seed to use for this training run. If None, no seed will be specified and repeated runs will produce different results.

    Returns
    -------
    :class:`TextPredictor` object. Returns self.

    Raises
    ------
    AssertionError
        If `fit()` was already called, if the training or tuning data is not a
        pandas DataFrame after loading, or if no text column is found.
    ValueError
        If `hyperparameters["models"]["MultimodalTextModel"]` is missing.
    NotImplementedError
        If the configured backend is not 'gluonnlp_v0'.
    """
    assert self._fit_called is False

    verbosity = self.verbosity
    if verbosity is None:
        verbosity = 3

    # Layer user hyperparameters over the selected (or default) preset.
    preset_name = presets if presets is not None else 'default'
    preset_hparams = ag_text_presets.create(preset_name)
    hyperparameters = merge_params(preset_hparams, hyperparameters)
    if num_trials is not None:
        hyperparameters['tune_kwargs']['num_trials'] = num_trials

    label_columns = [self._label] if isinstance(self._label, str) else list(self._label)

    # Get the training and tuning data as pandas dataframe
    if isinstance(train_data, str):
        train_data = load_pd.load(train_data)
    if not isinstance(train_data, pd.DataFrame):
        raise AssertionError(
            f'train_data is required to be a pandas DataFrame, but was instead: {type(train_data)}'
        )

    all_columns = list(train_data.columns)
    feature_columns = [name for name in all_columns if name not in label_columns]
    train_data = train_data[all_columns]

    # Get tuning data
    if tuning_data is None:
        if holdout_frac is None:
            # For HPO, we will need to use a larger held-out ratio
            trial_count = hyperparameters['tune_kwargs']['num_trials']
            holdout_frac = default_holdout_frac(len(train_data), trial_count != 1)
        train_data, tuning_data = train_test_split(
            train_data,
            test_size=holdout_frac,
            random_state=np.random.RandomState(seed))
    else:
        if isinstance(tuning_data, str):
            tuning_data = load_pd.load(tuning_data)
        if not isinstance(tuning_data, pd.DataFrame):
            raise AssertionError(
                f'tuning_data is required to be a pandas DataFrame, but was instead: {type(tuning_data)}'
            )
        tuning_data = tuning_data[all_columns]

    column_types, problem_type = infer_column_problem_types(
        train_data,
        tuning_data,
        label_columns=label_columns,
        problem_type=self._problem_type,
        provided_column_types=column_types)
    self._eval_metric, log_metrics = infer_eval_log_metrics(
        problem_type=problem_type, eval_metric=self._eval_metric)

    if not any(col_type == _C.TEXT for col_type in column_types.values()):
        raise AssertionError(
            'No Text Column is found! This is currently not supported by '
            'the TextPredictor. You may try to use '
            'autogluon.tabular.TabularPredictor.\n'
            'The inferred column properties of the training data is {}'.
            format(column_types))

    logger.info('Problem Type="{}"'.format(problem_type))
    logger.info(printable_column_type_string(column_types))
    self._problem_type = problem_type

    model_is_configured = ('models' in hyperparameters
                           and 'MultimodalTextModel' in hyperparameters['models'])
    if not model_is_configured:
        raise ValueError(
            'The current TextPredictor only supports "MultimodalTextModel" '
            'and you must ensure that '
            'hyperparameters["models"]["MultimodalTextModel"] can be accessed.'
        )
    model_hparams = hyperparameters['models']['MultimodalTextModel']
    self._backend = model_hparams['backend']

    if plot_results is None:
        plot_results = in_ipynb()

    if self._backend != 'gluonnlp_v0':
        raise NotImplementedError(
            "Currently, we only support using "
            "the autogluon-contrib-nlp and MXNet "
            "as the backend of AutoGluon-Text. In the future, "
            "we will support other models.")

    import warnings
    warnings.filterwarnings('ignore', module='mxnet')
    from ..mx.models import MultiModalTextModel
    self._model = MultiModalTextModel(column_types=column_types,
                                      feature_columns=feature_columns,
                                      label_columns=label_columns,
                                      problem_type=self._problem_type,
                                      eval_metric=self._eval_metric,
                                      log_metrics=log_metrics,
                                      output_directory=self._path)
    self._model.train(train_data=train_data,
                      tuning_data=tuning_data,
                      num_cpus=num_cpus,
                      num_gpus=num_gpus,
                      search_space=model_hparams['search_space'],
                      tune_kwargs=hyperparameters['tune_kwargs'],
                      time_limit=time_limit,
                      seed=seed,
                      plot_results=plot_results,
                      verbosity=verbosity)

    logger.info(f'Training completed. Auto-saving to "{self.path}". '
                f'For loading the model, you can use'
                f' `predictor = TextPredictor.load("{self.path}")`')
    self.save(self.path)
    return self
def fit(self,
        train_data,
        tuning_data=None,
        time_limit=None,
        presets=None,
        hyperparameters=None,
        feature_columns=None,
        column_types=None,
        num_cpus=None,
        num_gpus=None,
        num_trials=None,
        seed=None):
    """Fit the predictor

    Parameters
    ----------
    train_data
        The training data. Anything that is not a pandas DataFrame is passed to
        `load_pd.load`.
    tuning_data
        The tuning data. If None, a holdout split is taken from `train_data`, using
        `hyperparameters['misc']['holdout_frac']` when it is set.
    time_limit
        The time limits
    presets
        The user can specify the presets of the hyper-parameters.
        If None, the 'default' preset is used.
    hyperparameters
        The hyper-parameters, merged on top of the preset.
    feature_columns
        Specify which columns in the data are used as features (a single name or a
        list of names). If None, every column that is not a label column is used.
    column_types
        The provided type of the columns
    num_cpus
        The number of CPUs to use for each trial
    num_gpus
        The number of GPUs to use for each trial
    num_trials
        The number of trials. By default, we will use the provided number of trials in the
        hyperparameters or presets. This will overwrite the provided value.
    seed
        The seed of the experiment. If None, `hyperparameters['seed']` is used.

    Returns
    -------
    self

    Raises
    ------
    AssertionError
        If `fit()` was already called, if feature and label columns overlap, if a
        feature column is missing from `train_data`, or if no text column is found.
    NotImplementedError
        If the configured backend is not 'gluonnlp_v0'.
    """
    assert self._fit_called is False

    preset_hparams = ag_text_presets.create(
        presets if presets is not None else 'default')
    hyperparameters = merge_params(preset_hparams, hyperparameters)
    if seed is not None:
        hyperparameters['seed'] = seed
    seed = hyperparameters['seed']
    if num_trials is not None:
        hyperparameters['hpo_params']['num_trials'] = num_trials

    if isinstance(self._label, str):
        label_columns = [self._label]
    else:
        label_columns = list(self._label)

    # Get the training and tuning data as pandas dataframe
    if not isinstance(train_data, pd.DataFrame):
        train_data = load_pd.load(train_data)

    if feature_columns is None:
        all_columns = list(train_data.columns)
        feature_columns = [
            ele for ele in all_columns if ele not in label_columns
        ]
    else:
        if isinstance(feature_columns, str):
            feature_columns = [feature_columns]
        for col in feature_columns:
            assert col not in label_columns, 'Feature columns and label columns cannot overlap.'
            assert col in train_data.columns,\
                'Feature columns must be in the pandas dataframe! Received col = "{}", ' \
                'all columns = "{}"'.format(col, train_data.columns)
        all_columns = feature_columns + label_columns
    train_data = train_data[all_columns]

    # Get tuning data
    if tuning_data is not None:
        if not isinstance(tuning_data, pd.DataFrame):
            tuning_data = load_pd.load(tuning_data)
        tuning_data = tuning_data[all_columns]
    else:
        holdout_frac = hyperparameters['misc']['holdout_frac']
        if holdout_frac is None:
            # For HPO, we will need to use a larger held-out ratio
            uses_hpo = hyperparameters['hpo_params']['num_trials'] != 1
            holdout_frac = default_holdout_frac(len(train_data), uses_hpo)
        train_data, tuning_data = train_test_split(
            train_data,
            test_size=holdout_frac,
            random_state=np.random.RandomState(seed))

    column_types, problem_type = infer_column_problem_types(
        train_data,
        tuning_data,
        label_columns=label_columns,
        problem_type=self._problem_type,
        provided_column_types=column_types)
    self._eval_metric, log_metrics = infer_eval_log_metrics(
        problem_type=problem_type, eval_metric=self._eval_metric)

    has_text_column = any(
        col_type == _C.TEXT for col_type in column_types.values())
    if not has_text_column:
        # Report the inferred column types, as the message promises, rather than
        # dumping the whole training DataFrame into the exception text.
        raise AssertionError(
            'No Text Column is found! This is currently not supported by '
            'the TextPrediction task. You may try to use '
            'autogluon.tabular.TabularPredictor.\n'
            'The inferred column properties of the training data is {}'.
            format(column_types))

    logger.log(25, 'Problem Type="{}"'.format(problem_type))
    logger.log(25, printable_column_type_string(column_types))
    self._problem_type = problem_type

    model_hparams = hyperparameters['models']['MultimodalTextModel']
    self._backend = model_hparams['backend']
    if model_hparams['backend'] == 'gluonnlp_v0':
        from ..mx.models import MultiModalTextModel
        self._model = MultiModalTextModel(column_types=column_types,
                                          feature_columns=feature_columns,
                                          label_columns=label_columns,
                                          problem_type=self._problem_type,
                                          eval_metric=self._eval_metric,
                                          log_metrics=log_metrics,
                                          output_directory=self._path)
        self._model.train(train_data=train_data,
                          tuning_data=tuning_data,
                          num_cpus=num_cpus,
                          num_gpus=num_gpus,
                          search_space=model_hparams['search_space'],
                          hpo_params=hyperparameters['hpo_params'],
                          time_limit=time_limit,
                          seed=seed,
                          verbosity=self.verbosity)
    else:
        raise NotImplementedError(
            "Currently, we only support using "
            "the autogluon-contrib-nlp and MXNet "
            "as the backend of AutoGluon-Text. In the future, "
            "we will support other models.")
    return self
def _fit(self,
         X_train: pd.DataFrame,
         y_train: pd.Series,
         X_val: Optional[pd.DataFrame] = None,
         y_val: Optional[pd.Series] = None,
         time_limit: Optional[int] = None,
         **kwargs):
    """The internal fit function

    Parameters
    ----------
    X_train
        Features of the training dataset
    y_train
        Labels of the training dataset
    X_val
        Features of the validation dataset. If None, a holdout split is carved out of
        the training data.
    y_val
        Labels of the validation dataset
    time_limit
        The time limits for the fit function. The time already spent inside this
        method before training starts is subtracted before the limit is handed to
        the underlying model.
    kwargs
        Other keyword arguments. `verbosity` (default 2), `num_cpus` and `num_gpus`
        (both default None) are read from here.

    Raises
    ------
    ImportError
        If MXNet or the AutoGluon text modules cannot be imported.
    NoGPUError
        If the recommended resources contain no GPU.
    """
    # Imported locally: this block did not previously rely on a module-level `time`.
    import time

    try:
        import mxnet as mx
        from autogluon.text.text_prediction.dataset import TabularDataset, random_split_train_val
        from autogluon.text.text_prediction.text_prediction import get_recommended_resource
    except ImportError as err:
        # Keep the original failure attached so the missing package stays visible.
        raise ImportError(AG_TEXT_IMPORT_ERROR) from err

    time_start = time.time()

    # Get arguments from kwargs
    verbosity = kwargs.get('verbosity', 2)
    num_cpus = kwargs.get('num_cpus', None)
    num_gpus = kwargs.get('num_gpus', None)

    # Infer resource
    resource = get_recommended_resource(nthreads_per_trial=num_cpus,
                                        ngpus_per_trial=num_gpus)
    if resource['num_gpus'] == 0:
        raise NoGPUError(
            f'\tNo GPUs available to train {self.name}. Resources: {resource}'
        )

    # Set seed
    seed = self.params.get('seed')
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        mx.random.seed(seed)

    X_train = self.preprocess(X_train, fit=True)
    if X_val is not None:
        X_val = self.preprocess(X_val)
    column_properties = self._build_model(X_train=X_train,
                                          y_train=y_train,
                                          X_val=X_val,
                                          y_val=y_val,
                                          hyperparameters=self.params)

    # Insert the label column
    X_train.insert(len(X_train.columns), self._label_column_name, y_train)
    if X_val is not None:
        X_val.insert(len(X_val.columns), self._label_column_name, y_val)

    hpo_params = self.params['hpo_params']
    search_strategy = hpo_params['search_strategy']
    # Work on a copy: the hyperband defaults filled in below must not be written
    # back into the dict stored inside self.params.
    configured_options = hpo_params['scheduler_options']
    scheduler_options = dict(configured_options) if configured_options is not None else dict()
    if search_strategy.endswith('hyperband'):
        # Specific defaults for hyperband scheduling
        scheduler_options.setdefault('reduction_factor', 4)
        scheduler_options.setdefault('grace_period', 10)
        scheduler_options.setdefault('max_t', 50)

    if X_val is None:
        # FIXME: v0.1 Update TextPrediction to use all training data in refit_full
        holdout_frac = default_holdout_frac(len(X_train), True)
        X_train, X_val = random_split_train_val(X_train,
                                                valid_ratio=holdout_frac)

    train_data = TabularDataset(X_train,
                                column_properties=column_properties,
                                label_columns=self._label_column_name)
    logger.info('Train Dataset:')
    logger.info(train_data)
    tuning_data = TabularDataset(X_val,
                                 column_properties=column_properties,
                                 label_columns=self._label_column_name)
    logger.info('Tuning Dataset:')
    logger.info(tuning_data)

    if time_limit is not None:
        # Charge the setup work done above against the caller's budget.
        time_limit = time_limit - (time.time() - time_start)

    self.model.train(
        train_data=train_data,
        tuning_data=tuning_data,
        resource=resource,
        time_limits=time_limit,
        search_strategy=search_strategy,
        search_options=hpo_params['search_options'],
        scheduler_options=scheduler_options,
        num_trials=hpo_params['num_trials'],
        console_log=verbosity >= 2,
        ignore_warning=verbosity < 2)