def _fit(
    self,
    train_data: TimeSeriesDataFrame,
    val_data: Optional[TimeSeriesDataFrame] = None,
    time_limit: int = None,
    **kwargs,
) -> None:
    """Train the GluonTS estimator and store the resulting predictor.

    Parameters
    ----------
    train_data : TimeSeriesDataFrame
        Training data; converted with ``self._to_gluonts_dataset`` before
        being handed to the estimator.
    val_data : TimeSeriesDataFrame, optional
        Validation data; it goes through ``self._to_gluonts_dataset`` as well,
        even when it is ``None``.
        NOTE(review): presumably the helper maps ``None`` to ``None`` - confirm.
    time_limit : int, optional
        Wrapped in a ``TimeLimitCallback`` that is passed to
        ``self._deferred_init_params_aux``.
    **kwargs
        ``verbosity`` (default 2) is read here; all keyword arguments are
        also forwarded to ``self._deferred_init_params_aux``.

    The trained predictor is stored on ``self.gts_predictor``.
    """
    verbosity = kwargs.get("verbosity", 2)
    set_logger_verbosity(verbosity, logger=logger)

    # GluonTS' own logger only reports errors unless verbosity exceeds 3.
    gts_level = logging.ERROR if verbosity <= 3 else logging.INFO
    gts_logger.setLevel(gts_level)
    if verbosity > 3:
        logger.warning(
            "GluonTS logging is turned on during training. Note that losses reported by GluonTS "
            "may not correspond to those specified via `eval_metric`."
        )

    self._check_fit_params()

    # update auxiliary parameters
    time_limit_callback = TimeLimitCallback(time_limit)
    self._deferred_init_params_aux(
        dataset=train_data,
        callback=time_limit_callback,
        **kwargs,
    )

    estimator = self._get_estimator()
    with warning_filter(), disable_root_logger():
        # Dataset conversion stays inside the context managers, as in the
        # original single-expression call.
        gluonts_train = self._to_gluonts_dataset(train_data)
        gluonts_val = self._to_gluonts_dataset(val_data)
        self.gts_predictor = estimator.train(
            gluonts_train,
            validation_data=gluonts_val,
        )
def __init__(self, label, problem_type=None, eval_metric=None, path=None, verbosity=3, warn_if_exist=True):
    """Store the predictor configuration and prepare the output directory.

    Parameters
    ----------
    label
        Stored as ``self._label``.
    problem_type, default = None
        Stored as ``self._problem_type``.
    eval_metric, default = None
        Stored as ``self._eval_metric``.
    path, default = None
        Passed to ``setup_outputdir``; the returned value is stored as ``self._path``.
    verbosity, default = 3
        Stored as ``self.verbosity``; when it is not ``None`` it is also applied
        through ``set_logger_verbosity``.
    warn_if_exist, default = True
        Forwarded to ``setup_outputdir``.
    """
    self.verbosity = verbosity
    if verbosity is not None:
        set_logger_verbosity(verbosity)

    self._label, self._problem_type, self._eval_metric = label, problem_type, eval_metric
    self._path = setup_outputdir(path, warn_if_exist=warn_if_exist)

    # Nothing has been fitted yet.
    self._model = None
    self._backend = None
    self._fit_called = False
def __init__(
    self,
    target: Optional[str] = None,
    eval_metric: Optional[str] = None,
    path: Optional[str] = None,
    verbosity: int = 2,
    prediction_length: int = 1,
    quantile_levels: Optional[List[float]] = None,
    **kwargs,
):
    """Configure the predictor and instantiate its learner.

    Parameters
    ----------
    target : str, optional
        Name of the target column. The alias ``label`` may be passed through
        ``kwargs`` instead; when neither is given, ``"target"`` is used.
    eval_metric : str, optional
        Stored as ``self.eval_metric`` and forwarded to the learner.
    path : str, optional
        Passed to ``setup_outputdir``; the result is stored as ``self.path``
        and forwarded to the learner as ``path_context``.
    verbosity : int, default = 2
        Stored and applied through ``set_logger_verbosity``.
    prediction_length : int, default = 1
        Stored and forwarded to the learner.
    quantile_levels : list of float, optional
        Quantile levels. The alias ``quantiles`` may be passed through
        ``kwargs``; when neither is given, 0.1 through 0.9 in steps of 0.1 is used.
    **kwargs
        Recognized keys: ``label``, ``quantiles``, ``learner_type`` (defaults to
        ``TimeSeriesLearner``) and ``learner_kwargs`` (extra keyword arguments
        for the learner constructor).

    Raises
    ------
    ValueError
        If both ``target`` and ``label`` are specified.
    """
    self.verbosity = verbosity
    set_logger_verbosity(self.verbosity, logger=logger)
    self.path = setup_outputdir(path)

    label = kwargs.get("label")
    if target is not None and label is not None:
        raise ValueError(
            "Both `label` and `target` are specified. Please specify at most one of these arguments."
        )
    # An explicit `label=None` falls back to the default name instead of
    # leaving the target column set to None.
    self.target = target or ("target" if label is None else label)

    self.prediction_length = prediction_length
    self.eval_metric = eval_metric

    # Same treatment for the `quantiles` alias: None means "use the default".
    quantiles = kwargs.get("quantiles")
    if quantiles is None:
        quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    self.quantile_levels = quantile_levels or quantiles

    learner_type = kwargs.pop("learner_type", TimeSeriesLearner)
    # Copy so the caller's dict is never modified; tolerate `learner_kwargs=None`.
    learner_kwargs = dict(kwargs.pop("learner_kwargs", None) or {})
    # Predictor-level settings take precedence over user-supplied learner kwargs.
    learner_kwargs.update(
        dict(
            path_context=self.path,
            eval_metric=eval_metric,
            target=self.target,
            prediction_length=self.prediction_length,
            quantile_levels=self.quantile_levels,
        )
    )
    self._learner: AbstractLearner = learner_type(**learner_kwargs)
    self._learner_type = type(self._learner)
def _fit(
    self,
    train_data: TimeSeriesDataFrame,
    time_limit: int = None,
    **kwargs,
) -> None:
    """Fit the sktime forecaster on the target column of ``train_data``.

    Parameters
    ----------
    train_data : TimeSeriesDataFrame
        Training data. Only the ``self.target`` column is converted with
        ``self._to_skt_data_frame`` and passed to the forecaster.
    time_limit : int, optional
        Accepted for interface compatibility; not read in this method body.
    **kwargs
        Only ``verbosity`` (default 2) is read.

    The forecaster is stored on ``self.skt_forecaster`` and a copy of the
    training index on ``self._fit_index``.
    """
    verbosity = kwargs.get("verbosity", 2)
    set_logger_verbosity(verbosity, logger=logger)
    # The sktime logger only reports errors unless verbosity exceeds 3.
    sktime_level = logging.ERROR if verbosity <= 3 else logging.INFO
    skt_logger.setLevel(sktime_level)

    self._check_fit_params()
    self.skt_forecaster = self._get_skt_forecaster()

    with warnings.catch_warnings():
        # Silence these warning categories for the duration of the fit.
        for category in (UserWarning, ConvergenceWarning, RuntimeWarning):
            warnings.simplefilter("ignore", category=category)
        target_frame = self._to_skt_data_frame(train_data[[self.target]])
        self.skt_forecaster.fit(target_frame, fh=self._fh())

    self._fit_index = train_data.index.copy()
def __init__(
    self,
    path: str,
    prediction_length: Optional[int] = 1,
    eval_metric: Optional[str] = None,
    save_data: bool = True,
    enable_ensemble: bool = True,
    verbosity: int = 2,
    **kwargs,
):
    """Initialise the trainer state.

    Parameters
    ----------
    path : str
        Forwarded to the parent constructor.
    prediction_length : int, default = 1
        Stored as ``self.prediction_length``.
    eval_metric : str, optional
        Resolved through ``TimeSeriesEvaluator.check_get_evaluation_metric``
        and stored as ``self.eval_metric``.
    save_data : bool, default = True
        Forwarded to the parent constructor.
    enable_ensemble : bool, default = True
        Stored as ``self.enable_ensemble``.
    verbosity : int, default = 2
        Stored and applied through ``set_logger_verbosity``.
    **kwargs
        Forwarded to the parent constructor. ``quantile_levels`` (or its alias
        ``quantiles``) and ``target`` are also read here.
    """
    super().__init__(path=path, save_data=save_data, low_memory=True, **kwargs)

    self.prediction_length = prediction_length

    # `quantile_levels` wins over the `quantiles` alias, which wins over the default.
    if "quantile_levels" in kwargs:
        self.quantile_levels = kwargs["quantile_levels"]
    else:
        self.quantile_levels = kwargs.get(
            "quantiles", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        )

    self.target = kwargs.get("target", "target")
    self.enable_ensemble = enable_ensemble
    self.is_data_saved = False

    self.verbosity = verbosity
    set_logger_verbosity(verbosity, logger=logger)

    # Dict of normal model -> FULL model. FULL models are produced by
    # self.refit_single_full() and self.refit_ensemble_full().
    self.model_full_dict = {}

    # Dict of FULL model -> normal model validation score in case the normal model had been deleted.
    self._model_full_dict_val_score = {}

    resolved_metric = TimeSeriesEvaluator.check_get_evaluation_metric(eval_metric)
    self.eval_metric = resolved_metric
    self.hpo_results = {}
def fit(self, train_data, tuning_data=None, time_limit='auto', presets=None, hyperparameters=None, **kwargs):
    """Automatic fit process for image prediction.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training data, can be a dataframe like image dataset.
        For dataframe like datasets, `image` and `label` columns are required.
        `image`: raw image paths. `label`: categorical integer id, starting from 0.
    tuning_data : pd.DataFrame, default = None
        Another dataset containing validation data reserved for model selection and hyperparameter-tuning,
        can be a dataframe like image dataset.
        If `None`, the validation dataset will be randomly split from `train_data` according to `holdout_frac`.
    time_limit : int, default = 'auto' (defaults to 2 hours if no presets detected)
        Time limit in seconds, if `None`, will run until all tuning and training finished.
        If `time_limit` is hit during `fit`, the
        HPO process will interrupt and return the current best configuration.
    presets : list or str or dict, default = ['medium_quality_faster_train']
        List of preset configurations for various arguments in `fit()`. Can significantly impact predictive accuracy, memory-footprint, and inference latency of trained models,
        and various other properties of the returned `predictor`.
        It is recommended to specify presets and avoid specifying most other `fit()` arguments or model hyperparameters prior to becoming familiar with AutoGluon.
        As an example, to get the most accurate overall predictor (regardless of its efficiency), set `presets='best_quality'`.
        To get good quality with faster inference speed, set `presets='good_quality_faster_inference'`
        Any user-specified arguments in `fit()` will override the values used by presets.
        If specifying a list of presets, later presets will override earlier presets if they alter the same argument.
        For precise definitions of the provided presets, see file: `autogluon/vision/configs/presets_configs.py`.
        Users can specify custom presets by passing in a dictionary of argument values as an element to the list.
        Available Presets: ['best_quality', 'high_quality_fast_inference', 'good_quality_faster_inference', 'medium_quality_faster_train']
        It is recommended to only use one `quality` based preset in a given call to `fit()` as they alter many of the same arguments and are not compatible with each-other.

        Note that depending on your specific hardware limitation(# gpu, size of gpu memory...) your mileage may vary a lot, you may choose lower quality presets if necessary, and
        try to reduce `batch_size` if OOM("RuntimeError: CUDA error: out of memory") happens frequently during the `fit`.

        In-depth Preset Info:
            # Best predictive accuracy with little consideration to inference time or model size. Achieve even better results by specifying a large time_limit value.
            # Recommended for applications that benefit from the best possible model accuracy.
            best_quality={
                'hyperparameters': {
                    'model': Categorical('coat_lite_small', 'twins_pcpvt_base', 'swin_base_patch4_window7_224'),
                    'lr': Real(1e-5, 1e-2, log=True),
                    'batch_size': Categorical(8, 16, 32, 64, 128),
                    'epochs': 200,
                    'early_stop_patience': 50
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 1024,
                    'searcher': 'random',
                },
                'time_limit': 12*3600,
            },

            # Good predictive accuracy with fast inference.
            # Recommended for applications that require reasonable inference speed and/or model size.
            good_quality_fast_inference={
                'hyperparameters': {
                    'model': Categorical('resnet50d', 'efficientnet_b1', 'mobilenetv3_large_100'),
                    'lr': Real(1e-4, 1e-2, log=True),
                    'batch_size': Categorical(8, 16, 32, 64, 128),
                    'epochs': 150,
                    'early_stop_patience': 20
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 512,
                    'searcher': 'random',
                },
                'time_limit': 8*3600,
            },

            # Medium predictive accuracy with very fast inference and very fast training time.
            medium_quality_faster_train={
                'hyperparameters': {
                    'model': 'resnet50d',
                    'lr': 0.01,
                    'batch_size': 64,
                    'epochs': 50,
                    'early_stop_patience': 5
                    },
                'time_limit': 1*3600,
            },

            # Medium predictive accuracy with very fast inference.
            # Comparing with `medium_quality_faster_train` it uses faster model but explores more hyperparameters.
            medium_quality_faster_inference={
                'hyperparameters': {
                    'model': Categorical('resnet18', 'mobilenetv3_small_100', 'resnet18_v1b'),
                    'lr': Categorical(0.01, 0.005, 0.001),
                    'batch_size': Categorical(64, 128),
                    'epochs': Categorical(50, 100),
                    'early_stop_patience': 10
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 32,
                    'searcher': 'random',
                },
                'time_limit': 2*3600,
            },
    hyperparameters : dict, default = None
        Extra hyperparameters for specific models. The dict passed in is not modified.
        Accepted args includes(not limited to):
        epochs : int, default value based on network
            The `epochs` for model training.
        net : mx.gluon.Block
            The custom network. If defined, the model name in config will be ignored so your
            custom network will be used for training rather than pulling it from model zoo.
        optimizer : mx.Optimizer
            The custom optimizer object. If defined, the optimizer will be ignored in config but this
            object will be used in training instead.
        batch_size : int
            Mini batch size
        lr : float
            Trainer learning rate for optimization process.
        early_stop_patience : int, default=10
            Number of epochs with no improvement after which train is early stopped. Use `None` to disable.
        early_stop_min_delta : float, default=1e-4
            The small delta value to ignore when evaluating the metric. A large delta helps stabilize the early
            stopping strategy against tiny fluctuation, e.g. 0.5->0.49->0.48->0.499->0.500001 is still considered as
            a good timing for early stopping.
        early_stop_baseline : float, default=None
            The minimum(baseline) value to trigger early stopping. For example, with `early_stop_baseline=0.5`,
            early stopping won't be triggered if the metric is less than 0.5 even if plateau is detected.
            Use `None` to disable.
        early_stop_max_value : float, default=None
            The max value for metric, early stop training instantly once the max value is achieved. Use `None` to disable.
        You can get the list of accepted hyperparameters in `config.yaml` saved by this predictor.
    **kwargs :
        holdout_frac : float, default = 0.1
            The random split ratio for `tuning_data` if `tuning_data==None`.
        random_state : int, default = None
            The random_state(seed) for shuffling data, only used if `tuning_data==None`.
            Note that the `random_state` only affect the splitting process, not model training.
            If not specified(None), will leave the original random sampling intact.
        nthreads_per_trial : int, default = (# cpu cores)
            Number of CPU threads for each trial, if `None`, will detect the # cores on current instance.
        ngpus_per_trial : int, default = (# gpus)
            Number of GPUs to use for each trial, if `None`, will detect the # gpus on current instance.
        hyperparameter_tune_kwargs: dict, default = None
            num_trials : int, default = 1
                The limit of HPO trials that can be performed within `time_limit`. The HPO process will be terminated
                when `num_trials` trials have finished or wall clock `time_limit` is reached, whichever comes first.
            searcher : str, default = 'random'
                Searcher strategy for HPO, 'random' by default.
                Options include: 'random' (random search), 'grid' (grid search).
            max_reward : float, default = None
                The reward threshold for stopping criteria. If `max_reward` is reached during HPO, the scheduler
                will terminate earlier to reduce time cost.
            scheduler_options : dict, default = None
                Extra options for HPO scheduler, please refer to :class:`autogluon.core.Searcher` for details.

    Returns
    -------
    The predictor itself (`self`).

    Raises
    ------
    ValueError
        If a dataset name given as a string is not a known d8 dataset, if both `time_limit` and
        `num_trials` are `None`, or if a key in `hyperparameters` would overwrite a reserved config entry.
    """
    if self._problem_type is None:
        # options: multiclass, binary, regression
        self._problem_type = MULTICLASS
    assert self._problem_type in (MULTICLASS, BINARY, REGRESSION), f"Invalid problem_type: {self._problem_type}"
    if self._eval_metric is None:
        if self._problem_type == REGRESSION:
            # options: rmse
            self._eval_metric = 'rmse'
            logger.log(20, 'ImagePredictor sets rmse as default eval_metric for regression problems.')
        else:
            # options: accuracy
            self._eval_metric = 'accuracy'
            logger.log(20, 'ImagePredictor sets accuracy as default eval_metric for classification problems.')

    # init/validate kwargs
    kwargs = self._validate_kwargs(kwargs)
    # unpack
    tune_kwargs = kwargs['hyperparameter_tune_kwargs']
    num_trials = tune_kwargs['num_trials']
    nthreads_per_trial = kwargs['nthreads_per_trial']
    ngpus_per_trial = kwargs['ngpus_per_trial']
    holdout_frac = kwargs['holdout_frac']
    random_state = kwargs['random_state']
    scheduler = tune_kwargs['scheduler']
    searcher = tune_kwargs['searcher']
    max_reward = tune_kwargs['max_reward']
    scheduler_options = tune_kwargs['scheduler_options']

    # deep copy to avoid inplace overwrite
    train_data = copy.deepcopy(train_data)
    tuning_data = copy.deepcopy(tuning_data)

    log_level = verbosity2loglevel(self._verbosity)
    set_logger_verbosity(self._verbosity)
    if presets:
        if not isinstance(presets, list):
            presets = [presets]
        logger.log(20, f'Presets specified: {presets}')

    if time_limit == 'auto':
        # no presets, no user specified time_limit
        time_limit = 7200
        logger.log(20, f'`time_limit=auto` set to `time_limit={time_limit}`.')

    use_rec = False
    if isinstance(train_data, str) and train_data == 'imagenet':
        # FIXME: imagenet does not work, crashes in validating data due to empty DataFrames.
        logger.warning(
            'ImageNet is a huge dataset which cannot be downloaded directly, '
            + 'please follow the data preparation tutorial in GluonCV.'
            + 'The following record files(symlinks) will be used: \n'
            + 'rec_train : ~/.mxnet/datasets/imagenet/rec/train.rec\n'
            + 'rec_train_idx : ~/.mxnet/datasets/imagenet/rec/train.idx\n'
            + 'rec_val : ~/.mxnet/datasets/imagenet/rec/val.rec\n'
            + 'rec_val_idx : ~/.mxnet/datasets/imagenet/rec/val.idx\n')
        train_data = pd.DataFrame({'image': [], self._label_inner: []})
        tuning_data = pd.DataFrame({'image': [], self._label_inner: []})
        use_rec = True
    if isinstance(train_data, str):
        try_import_d8()
        from d8.image_classification import Dataset as D8D
        names = D8D.list()
        if train_data.lower() in names:
            train_data = D8D.get(train_data)
        else:
            valid_names = '\n'.join(names)
            raise ValueError(f'`train_data` {train_data} is not among valid list {valid_names}')
        if tuning_data is None:
            train_data, tuning_data = train_data.split(1 - holdout_frac)
    if isinstance(tuning_data, str):
        try_import_d8()
        from d8.image_classification import Dataset as D8D
        names = D8D.list()
        if tuning_data.lower() in names:
            tuning_data = D8D.get(tuning_data)
        else:
            valid_names = '\n'.join(names)
            raise ValueError(f'`tuning_data` {tuning_data} is not among valid list {valid_names}')

    # data sanity check
    train_data = self._validate_data(train_data)
    train_labels = _get_valid_labels(train_data)
    self._label_cleaner = LabelCleaner.construct(
        problem_type=self._problem_type, y=train_labels, y_uncleaned=train_labels)
    train_labels_cleaned = self._label_cleaner.transform(train_labels)
    if train_labels_cleaned.dtype.kind in ('i', 'u'):
        train_labels_cleaned = train_labels_cleaned.astype('int64')
    # converting to internal label set
    _set_valid_labels(train_data, train_labels_cleaned)
    tuning_data_validated = False
    if tuning_data is None:
        train_data, tuning_data, _, _ = generate_train_test_split(
            X=train_data,
            y=train_data[self._label_inner],
            problem_type=self._problem_type,
            test_size=holdout_frac)
        logger.info('Randomly split train_data into train[%d]/validation[%d] splits.',
                    len(train_data), len(tuning_data))
        train_data = train_data.reset_index(drop=True)
        tuning_data = tuning_data.reset_index(drop=True)
        # The split inherits the already validated/cleaned labels of train_data.
        tuning_data_validated = True

    train_data = self._validate_data(train_data)
    if isinstance(train_data, self.Dataset):
        train_data = self.Dataset(train_data, classes=train_data.classes)
    if tuning_data is not None and not tuning_data_validated:
        tuning_data = self._validate_data(tuning_data)
        # converting to internal label set
        tuning_labels_cleaned = self._label_cleaner.transform(_get_valid_labels(tuning_data))
        if tuning_labels_cleaned.dtype.kind in ('i', 'u'):
            tuning_labels_cleaned = tuning_labels_cleaned.astype('int64')
        _set_valid_labels(tuning_data, tuning_labels_cleaned)
        if isinstance(tuning_data, self.Dataset):
            tuning_data = self.Dataset(tuning_data, classes=tuning_data.classes)

    if self._classifier is not None:
        # A classifier already exists: fit it again instead of starting a new HPO task.
        logging.getLogger("ImageClassificationEstimator").propagate = True
        self._classifier._logger.setLevel(log_level)
        self._fit_summary = self._classifier.fit(
            train_data, tuning_data, 1 - holdout_frac, random_state, resume=False)
        if hasattr(self._classifier, 'fit_history'):
            self._fit_summary['fit_history'] = self._classifier.fit_history()
        return self

    # new HPO task
    if num_trials is None:
        if time_limit is None:
            raise ValueError('`time_limit` and `num_trials` can not be `None` at the same time, '
                             'otherwise the training will not be terminated gracefully.')
        # Only the time limit bounds the search.
        num_trials = 99999
    config = {
        'log_dir': self._log_dir,
        'num_trials': max(1, num_trials),
        'time_limits': 2147483647 if time_limit is None else max(1, time_limit),
        'searcher': searcher,
        # needed for gluon-cv TODO: remove after gluon-cv is updated https://github.com/dmlc/gluon-cv/issues/1633
        'search_strategy': searcher,
        'scheduler': scheduler,
    }
    if max_reward is not None:
        config['max_reward'] = max_reward
    if nthreads_per_trial is not None:
        config['nthreads_per_trial'] = nthreads_per_trial
    if ngpus_per_trial is not None:
        config['ngpus_per_trial'] = ngpus_per_trial
    if isinstance(hyperparameters, dict):
        # Work on a shallow copy: 'net' and 'optimizer' are popped below and the
        # caller's dict must not lose those entries.
        hyperparameters = hyperparameters.copy()
        if 'batch_size' in hyperparameters:
            bs = hyperparameters['batch_size']
            _check_gpu_memory_presets(bs, ngpus_per_trial, 4, 256)  # 256MB per sample
        net = hyperparameters.pop('net', None)
        if net is not None:
            config['custom_net'] = net
        optimizer = hyperparameters.pop('optimizer', None)
        if optimizer is not None:
            config['custom_optimizer'] = optimizer
        # check if hyperparameters overwriting existing config
        for k, v in hyperparameters.items():
            if k in config:
                raise ValueError(f'Overwriting {k} = {config[k]} to {v} by hyperparameters is ambiguous.')
        config.update(hyperparameters)
    if scheduler_options is not None:
        config.update(scheduler_options)
    if use_rec:
        config['use_rec'] = True
    if 'early_stop_patience' not in config:
        config['early_stop_patience'] = 10
    if config['early_stop_patience'] is None:
        # -1 stands in for "early stopping disabled".
        config['early_stop_patience'] = -1
    # TODO(zhreshold): expose the transform function(or sign function) for converting custom metrics
    # `np.inf` is used because the `np.Inf` alias no longer exists in NumPy 2.0.
    if config.get('early_stop_baseline') is None:
        config['early_stop_baseline'] = -np.inf
    if config.get('early_stop_max_value') is None:
        config['early_stop_max_value'] = np.inf
    # batch size cannot be larger than dataset size
    if ngpus_per_trial is not None and ngpus_per_trial > 1:
        min_value = ngpus_per_trial
    else:
        min_value = 1
    bs = sanitize_batch_size(config.get('batch_size', 16), min_value=min_value, max_value=len(train_data))
    config['batch_size'] = bs
    # TODO: remove this once mxnet is deprecated
    if timm is None and config.get('model', None) is None:
        config['model'] = 'resnet50_v1b'
    # verbosity
    if log_level > logging.INFO:
        estimator_logger = logging.getLogger("ImageClassificationEstimator")
        estimator_logger.propagate = False
        estimator_logger.setLevel(log_level)
    task = ImageClassification(config=config, problem_type=self._problem_type)
    # GluonCV can't handle these separately - patching created config
    task.search_strategy = scheduler
    task.scheduler_options['searcher'] = searcher
    task._logger.setLevel(log_level)
    task._logger.propagate = True
    self._train_classes = train_data.classes
    # record=True captures warnings raised during training instead of printing them.
    with warnings.catch_warnings(record=True):
        # TODO: MXNetErrorCatcher was removed because it didn't return traceback
        #  Re-add once it returns full traceback regardless of which exception was caught
        self._classifier = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
    self._classifier._logger.setLevel(log_level)
    self._classifier._logger.propagate = True
    self._fit_summary = task.fit_summary()
    if hasattr(task, 'fit_history'):
        self._fit_summary['fit_history'] = task.fit_history()
    return self
def fit(
    self,
    train_data: TimeSeriesDataFrame,
    tuning_data: Optional[TimeSeriesDataFrame] = None,
    time_limit: Optional[int] = None,
    presets: Optional[str] = None,
    hyperparameters: Optional[Union[str, Dict[Union[str, Type], Any]]] = None,
    hyperparameter_tune_kwargs: Optional[Union[str, Dict]] = None,
    **kwargs,
) -> "TimeSeriesPredictor":
    """Fit models to predict distributional forecasts of multiple related time series
    based on historical observations.

    Parameters
    ----------
    train_data: TimeSeriesDataFrame
        Training data in the :class:``~autogluon.timeseries.TimeSeriesDataFrame`` format.
    tuning_data: TimeSeriesDataFrame, default = None
        Data reserved for model selection and hyperparameter tuning, rather than training individual models.
        If ``None``, AutoGluon will reserve the most recent ``prediction_length`` time steps of each
        ``item_id`` in ``train_data`` for tuning. Validation scores will by default be calculated on
        ``tuning_data``.
    time_limit: int, default = None
        Approximately how long :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` will run for (wall-clock
        time in seconds). If not specified, :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` will run
        until all models have completed training.
    presets: str, default = None
        Optional preset configurations for various arguments in
        :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit`. Can significantly impact predictive accuracy,
        memory footprint, inference latency of trained models, and various other properties of the returned
        predictor. It is recommended to specify presets and avoid specifying most other
        :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` arguments or model hyperparameters prior to
        becoming familiar with AutoGluon. For example, set ``presets="best_quality"`` to get a high-accuracy
        predictor, or set ``presets="low_quality"`` to get a toy predictor that trains quickly but lacks
        accuracy. Any user-specified arguments in :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` will
        override the values used by presets.

        Available presets are "best_quality", "high_quality", "good_quality", "medium_quality",
        "low_quality", and "low_quality_hpo". Details for these presets can be found in
        ``autogluon/timeseries/configs/presets_configs.py``. If not provided, user-provided values for other
        arguments (specifically, ``hyperparameters`` and ``hyperparameter_tune_kwargs``) will be used
        (defaulting to their default values specified below).
    hyperparameters: str or dict, default = "default"
        Determines the hyperparameters used by each model.

        If str is passed, will use a preset hyperparameter configuration, can be one of "default",
        "default_hpo", "toy", or "toy_hpo", where "toy" settings correspond to models only intended for
        prototyping.

        If dict is provided, the keys are strings or Types that indicate which model types to train. In this
        case, the predictor will only train the given model types. Stable model options include: 'DeepAR',
        'MQCNN', and 'SFF' (SimpleFeedForward). See References for more detail on these models.

        Values in the ``hyperparameters`` dict are themselves dictionaries of hyperparameter settings for
        each model type. Each hyperparameter can either be a single fixed value or a search space containing
        many possible values. A search space should only be provided when ``hyperparameter_tune_kwargs`` is
        specified (i.e., hyperparameter-tuning is utilized). Any omitted hyperparameters not specified here
        will be set to default values which are given in
        ``autogluon/timeseries/trainer/models/presets.py``. Specific hyperparameter choices for each of the
        recommended models can be found in the references.
    hyperparameter_tune_kwargs: str or dict, default = None
        Passed to ``self._get_scheduler_options`` to derive the scheduler options.
        TODO: document the accepted values.
    **kwargs
        ``verbosity`` overrides the predictor-level verbosity for this call. All keyword arguments are
        included in the logged summary of fit arguments.

    Returns
    -------
    TimeSeriesPredictor
        The predictor itself, after fitting and saving.

    Raises
    ------
    AssertionError
        If the predictor has already been fit.
    ValueError
        If the target column is missing from ``train_data`` or ``tuning_data``.

    References
    ----------
        - DeepAR: https://ts.gluon.ai/api/gluonts/gluonts.model.deepar.html
        - MQCNN: https://ts.gluon.ai/api/gluonts/gluonts.model.seq2seq.html
        - SFF: https://ts.gluon.ai/api/gluonts/gluonts.model.simple_feedforward.html
    """

    def _describe(kind, data):
        # One-line size summary; guards the average so empty data cannot raise
        # ZeroDivisionError from inside a log statement.
        num_rows, num_items = len(data), data.num_items
        average_length = num_rows / num_items if num_items else 0.0
        return (
            f"Provided {kind} data set with {num_rows} rows, {num_items} items. "
            f"Average time series length is {average_length}."
        )

    time_start = time.time()
    if self._learner.is_fit:
        raise AssertionError(
            "Predictor is already fit! To fit additional models create a new `Predictor`."
        )

    if self.target not in train_data.columns:
        raise ValueError(
            f"Target column `{self.target}` not found in the training data set."
        )
    if tuning_data is not None and self.target not in tuning_data.columns:
        raise ValueError(
            f"Target column `{self.target}` not found in the tuning data set."
        )
    if hyperparameters is None:
        hyperparameters = "default"

    verbosity = kwargs.get("verbosity", self.verbosity)
    set_logger_verbosity(verbosity, logger=logger)

    # Collected only for the log summary below.
    fit_args = dict(
        prediction_length=self.prediction_length,
        target_column=self.target,
        time_limit=time_limit,
        evaluation_metric=self.eval_metric,
        hyperparameters=hyperparameters,
        hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
        **kwargs,
    )
    logger.info("================ TimeSeriesPredictor ================")
    logger.info("TimeSeriesPredictor.fit() called")
    if presets is not None:
        # Reported once, as part of the summary.
        logger.info(f"Setting presets to: {presets}")
    logger.info("Fitting with arguments:")
    logger.info(f"{pprint.pformat(fit_args)}")
    logger.info(_describe("training", train_data))
    if tuning_data is not None:
        logger.info(_describe("tuning", tuning_data))
    logger.info(f"Training artifacts will be saved to: {Path(self.path).resolve()}")
    logger.info("=====================================================")

    # Inform the user extra columns in dataset will not be used.
    extra_columns = [c for c in train_data.columns if c != self.target]
    if len(extra_columns) > 0:
        logger.warning(f"Provided columns {extra_columns} will not be used.")

    if tuning_data is None:
        logger.warning(
            f"Validation data is None, will hold the last prediction_length {self.prediction_length} "
            f"time steps out to use as validation set.",
        )
        # Validate on the full series; train on everything except the last
        # `prediction_length` time steps.
        tuning_data = train_data
        train_data = train_data.slice_by_timestep(
            slice(None, -self.prediction_length)
        )

    scheduler_options = self._get_scheduler_options(
        hyperparameter_tune_kwargs, time_limit=time_limit
    )

    # Deduct the time already spent in this method from the learner's budget.
    time_left = (
        None if time_limit is None else time_limit - (time.time() - time_start)
    )
    self._learner.fit(
        train_data=train_data,
        val_data=tuning_data,
        scheduler_options=scheduler_options,
        hyperparameters=hyperparameters,
        hyperparameter_tune=all(scheduler_options),
        time_limit=time_left,
        verbosity=verbosity,
    )

    self.save()
    return self
def fit(self, train_data, tuning_data=None, time_limit='auto', presets=None, hyperparameters=None, **kwargs):
    """Automatic fit process for object detection.

    Tip: if you observe very slow training speed only happening at the first epoch and your overall time budget
    is not large, you may disable `CUDNN_AUTOTUNE` by setting the environment variable
    `export MXNET_CUDNN_AUTOTUNE_DEFAULT=0` before running your python script or
    insert `import os; os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'` before any code block.
    The tuning is beneficial in terms of training speed in the long run, but may cost you noticeable overhead at
    the beginning of each trial.

    Parameters
    ----------
    train_data : pd.DataFrame or str
        Training data, can be a dataframe like image dataset.
        For more details of how to construct a object detection dataset, please checkout:
        `http://preview.d2l.ai/d8/main/object_detection/getting_started.html`.
        If a string is provided, will search for d8 datasets.
    tuning_data : pd.DataFrame or str, default = None
        Holdout tuning data for validation, reserved for model selection and hyperparameter-tuning,
        can be a dataframe like image dataset.
        If a string is provided, will search for d8 datasets.
        If `None`, the validation dataset will be randomly split from `train_data` according to `holdout_frac`.
    time_limit : int, default = 'auto'(defaults to 2 hours if no presets detected)
        Time limit in seconds, if `None`, will run until all tuning and training finished.
        If `time_limit` is hit during `fit`, the
        HPO process will interrupt and return the current best configuration.
    presets : list or str or dict, default = ['medium_quality_faster_train']
        List of preset configurations for various arguments in `fit()`. Can significantly impact predictive accuracy,
        memory-footprint, and inference latency of trained models, and various other properties of the returned
        `predictor`.
        It is recommended to specify presets and avoid specifying most other `fit()` arguments or model
        hyperparameters prior to becoming familiar with AutoGluon.
        As an example, to get the most accurate overall predictor (regardless of its efficiency),
        set `presets='best_quality'`.
        To get good quality with faster inference speed, set `presets='good_quality_faster_inference'`
        Any user-specified arguments in `fit()` will override the values used by presets.
        If specifying a list of presets, later presets will override earlier presets if they alter the same argument.
        For precise definitions of the provided presets, see file: `autogluon/vision/configs/presets_configs.py`.
        Users can specify custom presets by passing in a dictionary of argument values as an element to the list.
        Available Presets: ['best_quality', 'high_quality_fast_inference', 'good_quality_faster_inference',
        'medium_quality_faster_train']
        It is recommended to only use one `quality` based preset in a given call to `fit()` as they alter many of
        the same arguments and are not compatible with each-other.

        Note that depending on your specific hardware limitation(# gpu, size of gpu memory...) your mileage may
        vary a lot, you may choose lower quality presets if necessary, and try to reduce `batch_size` if OOM
        ("RuntimeError: CUDA error: out of memory") happens frequently during the `fit`.

        In-depth Preset Info:
            # Best predictive accuracy with little consideration to inference time or model size. Achieve even
            # better results by specifying a large time_limit value.
            # Recommended for applications that benefit from the best possible model accuracy.
            best_quality={
                'hyperparameters': {
                    'transfer': 'faster_rcnn_fpn_resnet101_v1d_coco',
                    'lr': Real(1e-5, 1e-3, log=True),
                    'batch_size': Categorical(4, 8),
                    'epochs': 30,
                    'early_stop_patience': 50
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 128,
                    'searcher': 'random',
                },
                'time_limit': 24*3600,
            },

            # Good predictive accuracy with fast inference.
            # Recommended for applications that require reasonable inference speed and/or model size.
            good_quality_fast_inference={
                'hyperparameters': {
                    'transfer': Categorical('ssd_512_resnet50_v1_coco',
                                            'yolo3_darknet53_coco',
                                            'center_net_resnet50_v1b_coco'),
                    'lr': Real(1e-4, 1e-2, log=True),
                    'batch_size': Categorical(8, 16, 32, 64),
                    'epochs': 50,
                    'early_stop_patience': 20
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 512,
                    'searcher': 'random',
                },
                'time_limit': 12*3600,
            },

            # Medium predictive accuracy with very fast inference and very fast training time.
            # This is the default preset in AutoGluon, but should generally only be used for quick prototyping.
            medium_quality_faster_train={
                'hyperparameters': {
                    'transfer': 'ssd_512_resnet50_v1_coco',
                    'lr': 0.01,
                    'batch_size': Categorical(8, 16),
                    'epochs': 30,
                    'early_stop_patience': 5
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 16,
                    'searcher': 'random',
                },
                'time_limit': 2*3600,
            },

            # Medium predictive accuracy with very fast inference.
            # Comparing with `medium_quality_faster_train` it uses faster model but explores more hyperparameters.
            medium_quality_faster_inference={
                'hyperparameters': {
                    'transfer': Categorical('center_net_resnet18_v1b_coco', 'yolo3_mobilenet1.0_coco'),
                    'lr': Categorical(0.01, 0.005, 0.001),
                    'batch_size': Categorical(32, 64, 128),
                    'epochs': Categorical(30, 50),
                    'early_stop_patience': 10
                    },
                'hyperparameter_tune_kwargs': {
                    'num_trials': 32,
                    'searcher': 'random',
                },
                'time_limit': 4*3600,
            },
    hyperparameters : dict, default = None
        Extra hyperparameters for specific models.
        Accepted args includes(not limited to):
        epochs : int, default value based on network
            The `epochs` for model training.
        batch_size : int
            Mini batch size
        lr : float
            Trainer learning rate for optimization process.
        early_stop_patience : int, default=10
            Number of epochs with no improvement after which train is early stopped. Use `None` to disable.
        early_stop_min_delta : float, default=1e-4
            The small delta value to ignore when evaluating the metric. A large delta helps stabilize the early
            stopping strategy against tiny fluctuation, e.g. 0.5->0.49->0.48->0.499->0.500001 is still considered as
            a good timing for early stopping.
        early_stop_baseline : float, default=None
            The minimum(baseline) value to trigger early stopping. For example, with `early_stop_baseline=0.5`,
            early stopping won't be triggered if the metric is less than 0.5 even if plateau is detected.
            Use `None` to disable.
        early_stop_max_value : float, default=None
            The max value for metric, early stop training instantly once the max value is achieved. Use `None` to
            disable.
        You can get the list of accepted hyperparameters in `config.yaml` saved by this predictor.
    **kwargs :
        holdout_frac : float, default = 0.1
            The random split ratio for `tuning_data` if `tuning_data==None`.
        random_state : int, default = None
            The random_state(seed) for shuffling data, only used if `tuning_data==None`.
            Note that the `random_state` only affect the splitting process, not model training.
            If not specified(None), will leave the original random sampling intact.
        nthreads_per_trial : int, default = (# cpu cores)
            Number of CPU threads for each trial, if `None`, will detect the # cores on current instance.
        ngpus_per_trial : int, default = (# gpus)
            Number of GPUs to use for each trial, if `None`, will detect the # gpus on current instance.
        hyperparameter_tune_kwargs: dict, default = None
            num_trials : int, default = 1
                The limit of HPO trials that can be performed within `time_limit`. The HPO process will be
                terminated when `num_trials` trials have finished or wall clock `time_limit` is reached,
                whichever comes first.
            searcher : str, default = 'random'
                Searcher strategy for HPO, 'random' by default.
                Options include: 'random' (random search), 'grid' (grid search).
            max_reward : float, default = None
                The reward threshold for stopping criteria. If `max_reward` is reached during HPO, the scheduler
                will terminate earlier to reduce time cost.
            scheduler_options : dict, default = None
                Extra options for HPO scheduler, please refer to :class:`autogluon.core.Searcher` for details.

    Returns
    -------
    self
        This object, after `self._detector` and `self._fit_summary` have been populated.

    Raises
    ------
    ValueError
        If both `time_limit` and `num_trials` are `None`, or if a key in `hyperparameters`
        collides with a key already present in the internally built task config.
    """
    # init/validate kwargs
    kwargs = self._validate_kwargs(kwargs)
    # unpack
    tune_kwargs = kwargs['hyperparameter_tune_kwargs']
    num_trials = tune_kwargs['num_trials']
    nthreads_per_trial = kwargs['nthreads_per_trial']
    ngpus_per_trial = kwargs['ngpus_per_trial']
    holdout_frac = kwargs['holdout_frac']
    random_state = kwargs['random_state']
    scheduler = tune_kwargs['scheduler']
    searcher = tune_kwargs['searcher']
    max_reward = tune_kwargs['max_reward']
    scheduler_options = tune_kwargs['scheduler_options']

    log_level = verbosity2loglevel(self._verbosity)
    set_logger_verbosity(self._verbosity)
    if platform.system() == 'Windows':
        logger.log(
            40,
            '=============================================================================\n'
            'WARNING: Windows OS detected, but ObjectDetector is not supported on Windows!\n'
            'You may run into many errors. Consider running on Linux instead.\n'
            '=============================================================================\n'
        )
    # The '\n' after "API breaking." keeps the two sentences from running together in the log output.
    logger.log(
        30,
        '=============================================================================\n'
        'WARNING: ObjectDetector is deprecated as of v0.4.0 and may contain various bugs and issues!\n'
        'In a future release ObjectDetector may be entirely reworked to use Torch as a backend.\n'
        'This future change will likely be API breaking.\n'
        'Users should ensure they update their code that depends on ObjectDetector when upgrading to future AutoGluon releases.\n'
        'For more information, refer to ObjectDetector refactor GitHub issue: https://github.com/awslabs/autogluon/issues/1559\n'
        '=============================================================================\n'
    )

    if presets:
        if not isinstance(presets, list):
            presets = [presets]
        logger.log(20, f'Presets specified: {presets}')

    if time_limit == 'auto':
        # no presets, no user specified time_limit
        time_limit = 7200
        logger.log(20, f'`time_limit=auto` set to `time_limit={time_limit}`.')

    # data sanity check
    train_data = self._validate_data(train_data)
    if tuning_data is not None:
        # FIXME: Use ImagePredictor's tuning_data split logic when None, currently this does not perform an ideal split.
        tuning_data = self._validate_data(tuning_data)

    if self._detector is not None:
        # A detector already exists: refit it directly instead of launching a new HPO task.
        self._detector._logger.setLevel(log_level)
        self._detector._logger.propagate = True
        self._fit_summary = self._detector.fit(train_data, tuning_data, 1 - holdout_frac, random_state, resume=False)
        if hasattr(self._detector, 'fit_history'):
            self._fit_summary['fit_history'] = self._detector.fit_history()
        return self

    # new HPO task
    if time_limit is not None and num_trials is None:
        num_trials = 99999
    if time_limit is None and num_trials is None:
        raise ValueError(
            "`time_limit` and kwargs['hyperparameter_tune_kwargs']['num_trials'] can not be `None` at the same time, "
            "otherwise the training will not be terminated gracefully.")
    config = {
        'log_dir': self._log_dir,
        'num_trials': 99999 if num_trials is None else max(1, num_trials),
        'time_limits': 2147483647 if time_limit is None else max(1, time_limit),
        'search_strategy': searcher,
        'scheduler': scheduler,
    }
    if max_reward is not None:
        config['max_reward'] = max_reward
    if nthreads_per_trial is not None:
        config['nthreads_per_trial'] = nthreads_per_trial
    if ngpus_per_trial is not None:
        config['ngpus_per_trial'] = ngpus_per_trial
    if isinstance(hyperparameters, dict):
        if 'batch_size' in hyperparameters:
            bs = hyperparameters['batch_size']
            _check_gpu_memory_presets(bs, ngpus_per_trial, 4, 1280)  # 1280MB per sample
        # check if hyperparameters overwriting existing config
        for k, v in hyperparameters.items():
            if k in config:
                raise ValueError(
                    f'Overwriting {k} = {config[k]} to {v} by hyperparameters is ambiguous.'
                )
        config.update(hyperparameters)
    if scheduler_options is not None:
        config.update(scheduler_options)

    # Identity checks (`is None`) are used because config values may be arbitrary objects
    # (e.g. search spaces) whose `==` is not guaranteed to return a plain bool.
    if 'early_stop_patience' not in config:
        config['early_stop_patience'] = 10
    if config['early_stop_patience'] is None:
        config['early_stop_patience'] = -1
    # TODO(zhreshold): expose the transform function(or sign function) for converting custom metrics
    # `np.inf` is used instead of the `np.Inf` alias, which no longer exists in NumPy 2.0.
    if config.get('early_stop_baseline') is None:
        config['early_stop_baseline'] = -np.inf
    if config.get('early_stop_max_value') is None:
        config['early_stop_max_value'] = np.inf

    # verbosity
    if log_level > logging.INFO:
        logging.getLogger('gluoncv.auto.tasks.object_detection').propagate = False
        for logger_name in ('SSDEstimator', 'CenterNetEstimator', 'YOLOv3Estimator', 'FasterRCNNEstimator'):
            logging.getLogger(logger_name).setLevel(log_level)
            logging.getLogger(logger_name).propagate = False

    task = ObjectDetection(config=config)
    task.search_strategy = scheduler
    task.scheduler_options['searcher'] = searcher
    task._logger.setLevel(log_level)
    task._logger.propagate = True
    # record=True captures warnings raised during the search instead of printing them.
    with warnings.catch_warnings(record=True):
        # TODO: MXNetErrorCatcher was removed because it didn't return traceback,
        #  Re-add once it returns full traceback regardless of which exception was caught
        self._detector = task.fit(train_data, tuning_data, 1 - holdout_frac, random_state)
    self._detector._logger.setLevel(log_level)
    self._detector._logger.propagate = True
    self._fit_summary = task.fit_summary()
    if hasattr(task, 'fit_history'):
        self._fit_summary['fit_history'] = task.fit_history()
    return self
def set_verbosity(self, verbosity: int):
    """Record a new verbosity level on the instance and apply it to logging.

    Parameters
    ----------
    verbosity : int
        Level stored as ``self.verbosity`` and then handed to
        ``set_logger_verbosity``.
    """
    self.verbosity = verbosity
    # Read the attribute back so the logger receives exactly what was stored.
    stored_level = self.verbosity
    set_logger_verbosity(stored_level)