Example #1
    def _hyperparameter_tune(self, X, y, X_val, y_val, scheduler_options, **kwargs):
        """ Performs HPO and sets self.params to best hyperparameter values """
        try_import_mxnet()
        from .tabular_nn_trial import tabular_nn_trial
        from .tabular_nn_dataset import TabularNNDataset

        time_start = time.time()
        self.verbosity = kwargs.get('verbosity', 2)
        logger.log(15, "Beginning hyperparameter tuning for Neural Network...")
        self._set_default_searchspace()  # changes non-specified default hyperparams from fixed values to search-spaces.
        scheduler_cls, scheduler_params = scheduler_options  # Unpack tuple
        if scheduler_cls is None or scheduler_params is None:
            raise ValueError("scheduler_cls and scheduler_params cannot be None for hyperparameter tuning")
        num_cpus = scheduler_params['resource']['num_cpus']

        params_copy = self._get_params()

        self.num_dataloading_workers = max(1, int(num_cpus/2.0))
        self.batch_size = params_copy['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X=X, y=y, params=params_copy, X_val=X_val, y_val=y_val)
        train_path = self.path + "train"
        val_path = self.path + "validation"
        train_dataset.save(file_prefix=train_path)
        val_dataset.save(file_prefix=val_path)

        if not np.any([isinstance(params_copy[hyperparam], Space) for hyperparam in params_copy]):
            logger.warning("Warning: Attempting to do hyperparameter optimization without any search space (all hyperparameters are already fixed values)")
        else:
            logger.log(15, "Hyperparameter search space for Neural Network: ")
            for hyperparam in params_copy:
                if isinstance(params_copy[hyperparam], Space):
                    logger.log(15, f"{hyperparam}:   {params_copy[hyperparam]}")

        util_args = dict(
            train_path=train_path,
            val_path=val_path,
            model=self,
            time_start=time_start,
            time_limit=scheduler_params['time_out'],
            fit_kwargs=scheduler_params['resource'],
        )
        tabular_nn_trial.register_args(util_args=util_args, **params_copy)
        scheduler = scheduler_cls(tabular_nn_trial, **scheduler_params)
        if ('dist_ip_addrs' in scheduler_params) and (len(scheduler_params['dist_ip_addrs']) > 0):
            # TODO: Ensure proper working directory setup on remote machines
            # This is multi-machine setting, so need to copy dataset to workers:
            logger.log(15, "Uploading preprocessed data to remote workers...")
            scheduler.upload_files([
                train_path + TabularNNDataset.DATAOBJ_SUFFIX,
                train_path + TabularNNDataset.DATAVALUES_SUFFIX,
                val_path + TabularNNDataset.DATAOBJ_SUFFIX,
                val_path + TabularNNDataset.DATAVALUES_SUFFIX
            ])  # TODO: currently does not work.
            logger.log(15, "uploaded")

        scheduler.run()
        scheduler.join_jobs()

        return self._get_hpo_results(scheduler=scheduler, scheduler_params=scheduler_params, time_start=time_start)
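The method above consumes scheduler_options as a (scheduler_cls, scheduler_params) tuple and only ever reads a handful of keys from scheduler_params: 'resource' (for num_cpus and as fit_kwargs), 'time_out', and optionally 'dist_ip_addrs'. A minimal sketch of how such a tuple might be assembled; FIFOScheduler and the 'num_trials' key are assumptions that may differ by AutoGluon version:

    # Sketch only: the keys shown are the ones _hyperparameter_tune actually reads.
    from autogluon.core.scheduler import FIFOScheduler  # assumed import path

    scheduler_params = {
        'resource': {'num_cpus': 4, 'num_gpus': 0},  # also forwarded as fit_kwargs
        'time_out': 3600,                            # HPO time budget in seconds
        'num_trials': 10,                            # hypothetical scheduler option
        'dist_ip_addrs': [],                         # empty -> single machine, no upload step
    }
    scheduler_options = (FIFOScheduler, scheduler_params)
    # model._hyperparameter_tune(X, y, X_val, y_val, scheduler_options=scheduler_options)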
Example #2
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             verbosity=2,
             **kwargs):
        try_import_mxnet()
        try_import_autogluon_vision()
        from autogluon.vision import ImagePredictor
        params = self._get_model_params()

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "\tsample_weight not yet supported for ImagePredictorModel, this model will ignore them in training."
            )

        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        if X_val is not None:
            X_val = X_val.reset_index(drop=True)
            y_val = y_val.reset_index(drop=True)
        X[self._label_column_name] = y
        if X_val is not None:
            X_val[self._label_column_name] = y_val

        verbosity_image = max(0, verbosity - 1)
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        # TODO: ImagePredictor doesn't use problem_type in any way at present.
        #  It also doesn't error or warn if problem_type is not one it expects.
        self.model = ImagePredictor(
            problem_type=self.problem_type,
            path=self.path,
            # eval_metric=self.eval_metric,  # TODO: multiclass/binary vision problem works only with accuracy, regression with rmse
            verbosity=verbosity_image)

        logger.log(15, f'\tHyperparameters: {params}')

        # FIXME: ImagePredictor crashes if given float time_limit
        if time_limit is not None:
            time_limit = int(time_limit)

        self.model.fit(train_data=X,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       hyperparameters=params,
                       random_state=0)
        # self.model.set_verbosity(verbosity)  # TODO: How to set verbosity of fit predictor?
        root_logger.setLevel(root_log_level)  # Reset log level
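This example (and Examples #5 and #6 below) brackets the inner fit with the same save/restore of the root log level, since the wrapped predictor may reconfigure logging. A standalone sketch of that pattern; noisy_library_call is a hypothetical stand-in, and the try/finally is a small hardening over the snippets above, which restore the level only on the success path:

    import logging

    def noisy_library_call():
        # Stand-in for a library call that reconfigures the root logger.
        logging.getLogger().setLevel(logging.DEBUG)

    root_logger = logging.getLogger()
    root_log_level = root_logger.level
    try:
        noisy_library_call()
    finally:
        root_logger.setLevel(root_log_level)  # reset log level even on failure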
Example #3
    def _fit(self, X, y, X_val=None, y_val=None,
             time_limit=None, sample_weight=None, num_cpus=1, num_gpus=0, reporter=None, **kwargs):
        """ X (pd.DataFrame): training data features (not necessarily preprocessed yet)
            X_val (pd.DataFrame): test data features (should have same column names as Xtrain)
            y (pd.Series):
            y_val (pd.Series): are pandas Series
            kwargs: Can specify amount of compute resources to utilize (num_cpus, num_gpus).
        """
        start_time = time.time()
        try_import_mxnet()
        import mxnet as mx
        self.verbosity = kwargs.get('verbosity', 2)
        global _has_warned_mxnet_deprecation
        if not _has_warned_mxnet_deprecation:
            _has_warned_mxnet_deprecation = True
            logger.log(30, '\tWARNING: TabularNeuralNetMxnetModel (alias "NN" & "NN_MXNET") has been deprecated in v0.4.0.\n'
                           '\t\tStarting in v0.6.0, calling TabularNeuralNetMxnetModel will raise an exception.\n'
                           '\t\tConsider instead using TabularNeuralNetTorchModel via "NN_TORCH".')

        if sample_weight is not None:  # TODO: support
            logger.log(15, "sample_weight not yet supported for TabularNeuralNetModel, this model will ignore them in training.")

        params = self._get_model_params()
        if num_cpus is not None:
            self.num_dataloading_workers = max(1, int(num_cpus/2.0))
        else:
            self.num_dataloading_workers = 1
        if self.num_dataloading_workers == 1:
            self.num_dataloading_workers = 0  # 0 is always faster and uses less memory than 1
        self.batch_size = params['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X=X, y=y, params=params, X_val=X_val, y_val=y_val)
        logger.log(15, "Training data for neural network has: %d examples, %d features (%d vector, %d embedding)" %
                   (train_dataset.num_examples, train_dataset.num_features, len(train_dataset.feature_groups['vector']), len(train_dataset.feature_groups['embed'])
                  ))
        # self._save_preprocessor()  # TODO: should save these things for hyperparam tuning. Need one HP tuner for network-specific HPs, another for preprocessing HPs.

        if num_gpus is not None and num_gpus >= 1:
            self.ctx = mx.gpu()  # Currently cannot use more than 1 GPU
        else:
            self.ctx = mx.cpu()
        self.get_net(train_dataset, params=params)

        if time_limit is not None:
            time_elapsed = time.time() - start_time
            time_limit_orig = time_limit
            time_limit = time_limit - time_elapsed
            if time_limit <= time_limit_orig * 0.4:  # if at least 60% of the budget went to preprocessing, there is likely not enough time left to train
                raise TimeLimitExceeded

        self.train_net(train_dataset=train_dataset, params=params, val_dataset=val_dataset, initialize=True, setup_trainer=True, time_limit=time_limit, reporter=reporter)
        self.params_post_fit = params
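The time-limit guard in this snippet abandons training once the remaining budget has shrunk to 40% or less of the original, i.e. once preprocessing has eaten at least 60% of it. A standalone sketch of the same arithmetic; the local TimeLimitExceeded class stands in for AutoGluon's exception:

    class TimeLimitExceeded(Exception):
        """Stand-in for autogluon's TimeLimitExceeded."""

    def remaining_budget(time_limit, time_elapsed):
        # Mirrors the guard above: with time_limit=100 and time_elapsed=70,
        # the 30s left is <= 100 * 0.4, so training is abandoned.
        remaining = time_limit - time_elapsed
        if remaining <= time_limit * 0.4:
            raise TimeLimitExceeded
        return remaining

    print(remaining_budget(100, 50))   # 50s left -> proceed with training
    # remaining_budget(100, 70)        # would raise TimeLimitExceeded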
        """
Example #4
    def _fit(self, X, y, X_val=None, y_val=None, time_limit=None, num_cpus=1, num_gpus=0, reporter=None, sample_weight=None, **kwargs):
        """ X (pd.DataFrame): training data features (not necessarily preprocessed yet)
            X_val (pd.DataFrame): test data features (should have same column names as Xtrain)
            y (pd.Series):
            y_val (pd.Series): are pandas Series
            kwargs: Can specify amount of compute resources to utilize (num_cpus, num_gpus).
        """
        start_time = time.time()
        try_import_mxnet()
        import mxnet as mx
        self.verbosity = kwargs.get('verbosity', 2)
        if sample_weight is not None:  # TODO: support
            logger.log(15, "sample_weight not yet supported for TabularNeuralNetModel, this model will ignore them in training.")

        params = self.params.copy()
        params = fixedvals_from_searchspaces(params)
        if self.feature_metadata is None:
            raise ValueError("Trainer class must set feature_metadata for this model")
        if num_cpus is not None:
            self.num_dataloading_workers = max(1, int(num_cpus/2.0))
        else:
            self.num_dataloading_workers = 1
        if self.num_dataloading_workers == 1:
            self.num_dataloading_workers = 0  # 0 is always faster and uses less memory than 1
        self.batch_size = params['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X=X, y=y, params=params, X_val=X_val, y_val=y_val)
        logger.log(15, "Training data for neural network has: %d examples, %d features (%d vector, %d embedding, %d language)" %
              (train_dataset.num_examples, train_dataset.num_features,
               len(train_dataset.feature_groups['vector']), len(train_dataset.feature_groups['embed']),
               len(train_dataset.feature_groups['language']) ))
        # self._save_preprocessor()  # TODO: should save these things for hyperparam tuning. Need one HP tuner for network-specific HPs, another for preprocessing HPs.

        if num_gpus is not None and num_gpus >= 1:
            self.ctx = mx.gpu()  # Currently cannot use more than 1 GPU
        else:
            self.ctx = mx.cpu()
        self.get_net(train_dataset, params=params)

        if time_limit is not None:
            time_elapsed = time.time() - start_time
            time_limit_orig = time_limit
            time_limit = time_limit - time_elapsed
            if time_limit <= time_limit_orig * 0.4:  # if at least 60% of the budget went to preprocessing, there is likely not enough time left to train
                raise TimeLimitExceeded

        self.train_net(train_dataset=train_dataset, params=params, val_dataset=val_dataset, initialize=True, setup_trainer=True, time_limit=time_limit, reporter=reporter)
        self.params_post_fit = params
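Examples #3 and #4 share the same dataloader-worker heuristic: roughly half the available CPUs, except that a single worker is collapsed to zero because loading in-process is faster and lighter than spawning one worker process. A standalone sketch of that heuristic:

    def num_dataloading_workers(num_cpus):
        # Same logic as above: half the CPUs (minimum 1), defaulting to 1
        # when the CPU count is unknown, then collapsing 1 worker to 0.
        workers = max(1, int(num_cpus / 2.0)) if num_cpus is not None else 1
        if workers == 1:
            workers = 0  # 0 is always faster and uses less memory than 1
        return workers

    assert num_dataloading_workers(8) == 4
    assert num_dataloading_workers(2) == 0   # a single worker collapses to 0
    assert num_dataloading_workers(None) == 0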
        """
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             verbosity=2,
             **kwargs):
        try_import_mxnet()
        try_import_autogluon_vision()
        from autogluon.vision import ImagePredictor
        params = self._get_model_params()

        X = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        if sample_weight is not None:  # TODO: support
            logger.log(15, "\tsample_weight not yet supported for ImagePredictorModel, this model will ignore them in training.")

        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        if X_val is not None:
            X_val = X_val.reset_index(drop=True)
            y_val = y_val.reset_index(drop=True)
        X[self._label_column_name] = y
        if X_val is not None:
            X_val[self._label_column_name] = y_val

        null_indices = X['image'] == ''

        # TODO: Consider some kind of weighting of the two options so there isn't a harsh cutoff at 50
        # FIXME: What if all rows in a class are null? Will probably crash.
        if null_indices.sum() > 50:
            self._dummy_pred_proba = self._compute_dummy_pred_proba(y[null_indices])  # FIXME: Do this one for better results
        else:
            # Not enough null to get a confident estimate of null label average, instead use all data average
            self._dummy_pred_proba = self._compute_dummy_pred_proba(y)

        if null_indices.sum() > 0:
            X = X[~null_indices]
        if X_val is not None:
            null_indices_val = X_val['image'] == ''
            if null_indices_val.sum() > 0:
                X_val = X_val[~null_indices_val]

        verbosity_image = max(0, verbosity - 1)
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        # TODO: ImagePredictor doesn't use problem_type in any way at present.
        #  It also doesn't error or warn if problem_type is not one it expects.
        self.model = ImagePredictor(
            problem_type=self.problem_type,
            path=self.path,
            # eval_metric=self.eval_metric,  # TODO: multiclass/binary vision problem works only with accuracy, regression with rmse
            verbosity=verbosity_image
        )

        logger.log(15, f'\tHyperparameters: {params}')

        # FIXME: ImagePredictor crashes if given float time_limit
        if time_limit is not None:
            time_limit = int(time_limit)

        self.model.fit(train_data=X,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       hyperparameters=params,
                       random_state=0)
        # self.model.set_verbosity(verbosity)  # TODO: How to set verbosity of fit predictor?
        root_logger.setLevel(root_log_level)  # Reset log level
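Before fitting, this example flags rows whose 'image' path is the empty string, estimates a fallback prediction from those rows when there are more than 50 of them (and from all labels otherwise), then drops them. A minimal sketch of the filtering step with a toy frame in place of real data; the model-specific _compute_dummy_pred_proba is omitted:

    import pandas as pd

    X = pd.DataFrame({'image': ['a.jpg', '', 'b.jpg', '']})
    y = pd.Series([0, 1, 0, 1])

    null_indices = X['image'] == ''  # rows with no image path
    # >50 imageless rows -> estimate the fallback from them; otherwise use all labels
    fallback_labels = y[null_indices] if null_indices.sum() > 50 else y
    X, y = X[~null_indices], y[~null_indices]  # drop imageless rows before fitting
    print(len(X))  # 2 rows with real image paths remain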
Example #6
    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function

        Parameters
        ----------
        X
            Features of the training dataset
        y
            Labels of the training dataset
        X_val
            Features of the validation dataset
        y_val
            Labels of the validation dataset
        time_limit
            The time limit for the fit function
        kwargs
            Other keyword arguments

        """
        try_import_mxnet()
        try_import_autogluon_text()
        from autogluon.text import TextPredictor

        # Decide name of the label column
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'
        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)
        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_cpus = kwargs.get('num_cpus', None)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for TextPredictorModel, this model will ignore them in training."
            )

        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)
        assert self.params['tune_kwargs']['num_trials'] == 1 \
               or self.params['tune_kwargs']['num_trials'] is None, \
            'Currently, a hyperparameter search for the text neural network cannot be ' \
            'nested inside the AutoGluon Tabular hyperparameter search.'

        verbosity_text = max(0, verbosity - 1)
        root_logger = logging.getLogger()
        root_log_level = root_logger.level
        self.model = TextPredictor(label=self._label_column_name,
                                   problem_type=self.problem_type,
                                   path=self.path,
                                   eval_metric=self.eval_metric,
                                   verbosity=verbosity_text)
        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       num_gpus=num_gpus,
                       num_cpus=num_cpus,
                       hyperparameters=self.params,
                       seed=self.params.get('seed', 0))
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level
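The naming loop at the top of this example guarantees the injected label column never collides with a feature column: it uses 'label' unless that name is taken, then tries 'label0', 'label1', and so on. A standalone sketch of the same logic:

    def pick_label_column_name(columns):
        # Mirrors the loop above: prefer 'label', otherwise append the first
        # integer suffix that does not collide with an existing column.
        if 'label' not in columns:
            return 'label'
        label_col_id = 0
        while 'label{}'.format(label_col_id) in columns:
            label_col_id += 1
        return 'label{}'.format(label_col_id)

    assert pick_label_column_name(['text']) == 'label'
    assert pick_label_column_name(['text', 'label', 'label0']) == 'label1'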