def _run_dist_job(cls, task_id, fn, args, gpu_ids):
        """Run one task in a separate worker process and return its result.

        Parameters
        ----------
        cls
            Class providing ``_worker`` (used as the child-process target) and
            ``__print`` (called for the 'out' and 'err' streams when fork mode
            is not enabled).
        task_id
            Task identifier; forwarded to the worker and to ``__print``.
        fn
            Callable to execute. Passed as-is in fork mode, otherwise
            serialized with dill.
        args : dict
            Task arguments. ``args['args']`` loses its ``'_default_config'``
            entry in place, and ``args['reporter']`` (when present) is replaced
            in place by a ``LocalStatusReporter``.
        gpu_ids
            Forwarded unchanged to the worker process.

        Returns
        -------
        The first element the worker stored in the shared result list, or
        ``None`` if the list is empty. Exceptions raised while preparing,
        starting or joining the worker are logged and not re-raised.
        """
        task_kwargs = args['args']
        if '_default_config' in task_kwargs:
            task_kwargs.pop('_default_config')

        has_reporter = 'reporter' in args
        if has_reporter:
            # The worker gets a local reporter; the original one is handed to
            # the Communicator together with the local one further down.
            local_reporter = LocalStatusReporter()
            dist_reporter = args['reporter']
            args['reporter'] = local_reporter

        # Keep the manager bound to a name: it owns the proxy list below.
        manager = mp.Manager()
        return_list = manager.list()

        try:
            # Arguments handed to a child process under spawn/forkserver must be
            # pickled. Fork shares memory and needs no pickling, but it is not
            # usable with CUDA and is prone to hanging threads. Decorated
            # objects defeat the standard pickler
            # (https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled),
            # so dill is used instead. Classes needed by the training function
            # are best defined inside that function so they can be rebuilt on
            # the receiving side after deserialization.
            if is_fork_enabled():
                fn_payload = fn
            else:
                fn_payload = dill.dumps(fn)

            # The reporter is the cross-process channel and must travel as-is,
            # so it is excluded from the (possibly pickled) argument payload.
            plain_args = {
                key: value
                for key, value in args.items() if key != 'reporter'
            }
            if is_fork_enabled():
                args_payload = plain_args
            else:
                args_payload = dill.dumps(plain_args)

            shared_args = {
                key: value
                for key, value in args.items() if key not in ('fn', 'args')
            }

            with make_temp_directory() as tempdir:
                worker_target = partial(cls._worker, tempdir, task_id,
                                        fn_payload, args_payload)
                proc = CustomProcess(target=worker_target,
                                     args=(return_list, gpu_ids, shared_args))
                proc.start()
                if has_reporter:
                    # NOTE(review): the returned object is never read; it is
                    # kept bound here as in the original - confirm whether the
                    # reference is required while the worker runs.
                    communicator = Communicator.Create(proc, local_reporter,
                                                       dist_reporter)
                proc.join()
                # Emit the worker's captured output streams
                if not is_fork_enabled():
                    for stream in ('out', 'err'):
                        cls.__print(tempdir, task_id, stream)
        except Exception as e:
            logger.error('Exception in worker process: {}'.format(e))

        if len(return_list) > 0:
            return return_list[0]
        return None
Ejemplo n.º 2
0
    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_cpus=None,
             num_gpus=0,
             sample_weight=None,
             **kwargs):
        """Train the fastai tabular learner and remember the best epoch.

        Sets ``self.y_scaler``, ``self.model`` and
        ``self.params_trained['best_epoch']``.

        Parameters
        ----------
        X, y
            Training features and target; passed to ``_preprocess_train``.
        X_val, y_val
            Optional validation features and target; passed to
            ``_preprocess_train``.
        time_limit
            Optional time budget in seconds, measured from the start of this
            call. ``None`` disables the time checks.
        num_cpus
            Accepted for interface compatibility; not used by this method.
        num_gpus
            ``0`` sets the fastai default device to CPU, any other non-``None``
            value sets it to CUDA, ``None`` leaves the default untouched.
        sample_weight
            Not supported; when given, a message is logged and the weights are
            ignored.
        **kwargs
            Ignored.

        Raises
        ------
        TimeLimitExceeded
            If, after preprocessing and learner construction, no more than 70%
            of ``time_limit`` is left.
        """
        try_import_fastai()
        from fastai.tabular.model import tabular_config
        from fastai.tabular.learner import tabular_learner
        from fastcore.basics import defaults
        from .callbacks import AgSaveModelCallback, EarlyStoppingCallbackWithTimeLimit
        from .quantile_helpers import HuberPinballLoss
        import torch

        start_time = time.time()
        if sample_weight is not None:  # TODO: support
            logger.log(15, "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training.")

        params = self._get_model_params()

        # Use a private copy of a configured scaler; otherwise pick a default
        # per problem type (no scaler for problem types other than these two).
        self.y_scaler = params.get('y_scaler', None)
        if self.y_scaler is None:
            if self.problem_type == REGRESSION:
                self.y_scaler = sklearn.preprocessing.StandardScaler()
            elif self.problem_type == QUANTILE:
                self.y_scaler = sklearn.preprocessing.MinMaxScaler()
        else:
            self.y_scaler = copy.deepcopy(self.y_scaler)

        if num_gpus is not None:
            if num_gpus == 0:
                # TODO: Does not obviously impact inference speed
                defaults.device = torch.device('cpu')
            else:
                defaults.device = torch.device('cuda')

        logger.log(15, f'Fitting Neural Network with parameters {params}...')
        data = self._preprocess_train(X, y, X_val, y_val)

        nn_metric, objective_func_name = self.__get_objective_func_name(self.stopping_metric)
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(objective_func_name)
        # Loss/error metrics improve when they decrease; everything else is maximized.
        objective_optim_mode = np.less if objective_func_name in [
            'log_loss',
            'root_mean_squared_error', 'mean_squared_error', 'mean_absolute_error', 'median_absolute_error',  # Regression objectives
            'pinball_loss',  # Quantile objective
        ] else np.greater

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if params.get('layers', None) is not None:
            layers = params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        elif self.problem_type == QUANTILE:
            base_size = max(len(self.quantile_levels) * 4, 128)
            layers = [base_size, base_size, base_size]
        else:
            base_size = max(data.c * 2, 100)
            layers = [base_size * 2, base_size]

        loss_func = None
        if self.problem_type == QUANTILE:
            loss_func = HuberPinballLoss(self.quantile_levels, alpha=self.params['alpha'])

        best_epoch_stop = params.get("best_epoch", None)  # Use best epoch for refit_full.
        # Fall back to a batch size of 32 when the data has no more rows than the configured batch size.
        dls = data.dataloaders(bs=self.params['bs'] if len(X) > self.params['bs'] else 32)

        if self.problem_type == QUANTILE:
            dls.c = len(self.quantile_levels)

        self.model = tabular_learner(
            dls, layers=layers, metrics=nn_metric,
            config=tabular_config(ps=params['ps'], embed_p=params['emb_drop']),
            loss_func=loss_func,
        )
        logger.log(15, self.model.model)

        save_callback = AgSaveModelCallback(
            monitor=objective_func_name_to_monitor, comp=objective_optim_mode, fname=self.name,
            best_epoch_stop=best_epoch_stop, with_opt=True
        )

        if time_limit is not None:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
            if time_left <= time_limit * 0.7:  # if 30% of time was spent preprocessing, likely not enough time to train model
                raise TimeLimitExceeded
        else:
            time_left = None

        early_stopping = EarlyStoppingCallbackWithTimeLimit(
            monitor=objective_func_name_to_monitor,
            comp=objective_optim_mode,
            min_delta=params['early.stopping.min_delta'],
            patience=params['early.stopping.patience'],
            time_limit=time_left, best_epoch_stop=best_epoch_stop
        )

        callbacks = [save_callback, early_stopping]

        with make_temp_directory() as temp_dir:
            with self.model.no_bar():
                with self.model.no_logging():
                    # Point the learner at the temporary directory while training,
                    # then restore its original path.
                    original_path = self.model.path
                    self.model.path = Path(temp_dir)
                    self.model.fit_one_cycle(params['epochs'], params['lr'], cbs=callbacks)

                    # Load the best one and export it
                    self.model = self.model.load(self.name)

                    # Index 0 of validate() is read for log_loss, index 1 otherwise.
                    if objective_func_name == 'log_loss':
                        eval_result = self.model.validate(dl=dls.valid)[0]
                    else:
                        eval_result = self.model.validate(dl=dls.valid)[1]

                    logger.log(15, f'Model validation metrics: {eval_result}')
                    self.model.path = original_path

            self.params_trained['best_epoch'] = save_callback.best_epoch
Ejemplo n.º 3
0
    def _fit(self,
             X_train,
             y_train,
             X_val=None,
             y_val=None,
             time_limit=None,
             **kwargs):
        """Train the fastai (v1) tabular learner and remember the best epoch.

        Sets ``self.y_scaler``, ``self.model`` and
        ``self.params_trained['best_epoch']``.

        Parameters
        ----------
        X_train, y_train
            Training features and target; passed to ``_preprocess_train``.
        X_val, y_val
            Optional validation features and target; passed to
            ``_preprocess_train``.
        time_limit
            Optional time budget in seconds, measured from the start of this
            call. A falsy value (``None`` or ``0``) means no limit is handed to
            the early-stopping callback.
        **kwargs
            Ignored.
        """
        try_import_fastai_v1()
        from fastai.layers import LabelSmoothingCrossEntropy
        from fastai.tabular import tabular_learner
        from fastai.utils.mod_display import progress_disabled_ctx
        from .callbacks import EarlyStoppingCallbackWithTimeLimit, SaveModelCallback

        start_time = time.time()

        params = self.params.copy()

        # Work on a private copy of a configured target scaler, if any.
        self.y_scaler = params.get('y_scaler', None)
        if self.y_scaler is not None:
            self.y_scaler = copy.deepcopy(self.y_scaler)

        logger.log(15, f'Fitting Neural Network with parameters {params}...')
        data = self._preprocess_train(X_train, y_train, X_val, y_val)

        nn_metric, objective_func_name = self.__get_objective_func_name()
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(
            objective_func_name)
        # Only error metrics are forced to 'min'. 'r2' is a higher-is-better
        # score, so it must not be minimized; it now takes the 'auto' path like
        # every other metric not listed here.
        # NOTE(review): assumes the callbacks' 'auto' mode maximizes monitors
        # that are not losses - confirm against the callback implementation.
        objective_optim_mode = 'min' if objective_func_name in [
            'root_mean_squared_error',
            'mean_squared_error',
            'mean_absolute_error',  # Regression error objectives
        ] else 'auto'

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if params.get('layers', None) is not None:
            layers = params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        else:
            base_size = max(len(data.classes) * 2, 100)
            layers = [base_size * 2, base_size]

        loss_func = None
        if self.problem_type in [BINARY, MULTICLASS
                                 ] and params.get('smoothing', 0.0) > 0.0:
            loss_func = LabelSmoothingCrossEntropy(params['smoothing'])

        # tabular_learner receives dropout as a list; wrap a scalar value.
        ps = params['ps']
        if not isinstance(ps, list):
            ps = [ps]

        if time_limit:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
        else:
            time_left = None

        best_epoch_stop = params.get("best_epoch",
                                     None)  # Use best epoch for refit_full.
        early_stopping_fn = partial(
            EarlyStoppingCallbackWithTimeLimit,
            monitor=objective_func_name_to_monitor,
            mode=objective_optim_mode,
            min_delta=params['early.stopping.min_delta'],
            patience=params['early.stopping.patience'],
            time_limit=time_left,
            best_epoch_stop=best_epoch_stop)

        self.model = tabular_learner(data,
                                     layers=layers,
                                     ps=ps,
                                     emb_drop=params['emb_drop'],
                                     metrics=nn_metric,
                                     loss_func=loss_func,
                                     callback_fns=[early_stopping_fn])
        logger.log(15, self.model.model)

        with make_temp_directory() as temp_dir:
            save_callback = SaveModelCallback(
                self.model,
                monitor=objective_func_name_to_monitor,
                mode=objective_optim_mode,
                name=self.name,
                best_epoch_stop=best_epoch_stop)
            with progress_disabled_ctx(self.model) as model:
                # Point the learner at the temporary directory while training,
                # then restore its original path.
                original_path = model.path
                model.path = Path(temp_dir)
                model.fit_one_cycle(params['epochs'],
                                    params['lr'],
                                    callbacks=save_callback)

                # Load the best one and export it
                model.load(self.name)

                # Index 0 of validate() is read for log_loss, index 1 otherwise.
                if objective_func_name == 'log_loss':
                    eval_result = model.validate()[0]
                else:
                    eval_result = model.validate()[1].numpy().reshape(-1)[0]

                logger.log(15, f'Model validation metrics: {eval_result}')
                model.path = original_path
            self.params_trained['best_epoch'] = save_callback.best_epoch
Ejemplo n.º 4
0
    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_cpus=None,
             num_gpus=0,
             sample_weight=None,
             **kwargs):
        """Fit the fastai tabular learner and store the training outcome.

        Sets ``self.y_scaler``, ``self.model``, ``self.params_trained['epochs']``
        and ``self.params_trained['best_epoch']``.

        Parameters
        ----------
        X, y
            Training features and target; passed to ``_preprocess_train``.
        X_val, y_val
            Optional validation features and target; passed to
            ``_preprocess_train``.
        time_limit
            Optional time budget in seconds, measured from the start of this
            call. ``None`` disables the time checks.
        num_cpus
            Not used by this method.
        num_gpus
            ``0`` requests the CPU as fastai default device, any other
            non-``None`` value requests CUDA, ``None`` leaves it untouched.
        sample_weight
            Not supported; when given, a message is logged and the weights are
            ignored.
        **kwargs
            Ignored.

        Raises
        ------
        TimeLimitExceeded
            If no more than 70% of ``time_limit`` is left once the learner is
            built, or if ``_get_epochs_number`` returns 0 epochs.
        """
        try_import_fastai()
        from fastai.tabular.model import tabular_config
        from fastai.tabular.learner import tabular_learner
        from fastai import torch_core
        from .callbacks import AgSaveModelCallback, EarlyStoppingCallbackWithTimeLimit
        from .quantile_helpers import HuberPinballLoss

        start_time = time.time()
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training."
            )

        params = self._get_model_params()

        # A configured scaler is deep-copied; without one, a default is chosen
        # for regression and quantile problems only.
        self.y_scaler = params.get('y_scaler', None)
        if self.y_scaler is not None:
            self.y_scaler = copy.deepcopy(self.y_scaler)
        elif self.problem_type == REGRESSION:
            self.y_scaler = sklearn.preprocessing.StandardScaler()
        elif self.problem_type == QUANTILE:
            self.y_scaler = sklearn.preprocessing.MinMaxScaler()

        if num_gpus is not None:
            # TODO: Control CPU vs GPU usage during inference
            if num_gpus == 0:
                torch_core.default_device(use_cuda=False)
            else:
                # TODO: respect CUDA_VISIBLE_DEVICES to select proper GPU
                torch_core.default_device(use_cuda=True)

        logger.log(15, f'Fitting Neural Network with parameters {params}...')
        data = self._preprocess_train(X, y, X_val, y_val)

        nn_metric, objective_func_name = self.__get_objective_func_name(
            self.stopping_metric)
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(
            objective_func_name)

        # Metrics that improve when they decrease; anything else is maximized.
        lower_is_better = (
            'log_loss',
            'root_mean_squared_error',
            'mean_squared_error',
            'mean_absolute_error',
            'median_absolute_error',  # Regression objectives
            'pinball_loss',  # Quantile objective
        )
        if objective_func_name in lower_is_better:
            objective_optim_mode = np.less
        else:
            objective_optim_mode = np.greater

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if params.get('layers', None) is not None:
            layers = params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        elif self.problem_type == QUANTILE:
            width = max(len(self.quantile_levels) * 4, 128)
            layers = [width] * 3
        else:
            width = max(data.c * 2, 100)
            layers = [width * 2, width]

        if self.problem_type == QUANTILE:
            loss_func = HuberPinballLoss(self.quantile_levels,
                                         alpha=self.params['alpha'])
        else:
            loss_func = None

        # Use best epoch for refit_full.
        best_epoch_stop = params.get("best_epoch", None)
        batch_size = self._get_batch_size(X)
        dls = data.dataloaders(bs=batch_size)

        # Make deterministic
        torch_core.set_seed(0, True)
        dls.rng.seed(0)

        if self.problem_type == QUANTILE:
            dls.c = len(self.quantile_levels)

        learner_config = tabular_config(ps=params['ps'],
                                        embed_p=params['emb_drop'])
        self.model = tabular_learner(
            dls,
            layers=layers,
            metrics=nn_metric,
            config=learner_config,
            loss_func=loss_func,
        )
        logger.log(15, self.model.model)

        fname = 'model'
        save_callback = AgSaveModelCallback(
            monitor=objective_func_name_to_monitor,
            comp=objective_optim_mode,
            fname=fname,
            best_epoch_stop=best_epoch_stop,
            with_opt=True)

        time_left = None
        if time_limit is not None:
            time_left = time_limit - (time.time() - start_time)
            # if 30% of time was spent preprocessing, likely not enough time to train model
            if time_left <= time_limit * 0.7:
                raise TimeLimitExceeded

        early_stopping = EarlyStoppingCallbackWithTimeLimit(
            monitor=objective_func_name_to_monitor,
            comp=objective_optim_mode,
            min_delta=params['early.stopping.min_delta'],
            patience=params['early.stopping.patience'],
            time_limit=time_left,
            best_epoch_stop=best_epoch_stop)

        callbacks = [save_callback, early_stopping]

        with make_temp_directory() as temp_dir:
            with self.model.no_bar(), self.model.no_logging():
                # Train against the temporary directory and restore the
                # learner's original path afterwards.
                original_path = self.model.path
                self.model.path = Path(temp_dir)

                if X_val is not None:
                    len_val = len(X_val)
                else:
                    len_val = 0
                epochs = self._get_epochs_number(
                    samples_num=len(X) + len_val,
                    epochs=params['epochs'],
                    batch_size=batch_size,
                    time_left=time_left)
                if epochs == 0:
                    # Stop early if there is not enough time to train a full epoch
                    raise TimeLimitExceeded

                self.model.fit_one_cycle(epochs, params['lr'], cbs=callbacks)

                # Load the best one and export it
                self.model = self.model.load(fname)

                # Index 0 of validate() is read for log_loss, index 1 otherwise.
                result_index = 0 if objective_func_name == 'log_loss' else 1
                eval_result = self.model.validate(dl=dls.valid)[result_index]

                logger.log(15, f'Model validation metrics: {eval_result}')
                self.model.path = original_path

            self.params_trained['epochs'] = epochs
            self.params_trained['best_epoch'] = save_callback.best_epoch
Ejemplo n.º 5
0
    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_cpus=None,
             num_gpus=0,
             sample_weight=None,
             **kwargs):
        """Train the fastai (v1) tabular learner and remember the best epoch.

        Sets ``self.y_scaler``, ``self.model`` and
        ``self.params_trained['best_epoch']``.

        Parameters
        ----------
        X, y
            Training features and target; passed to ``_preprocess_train``.
        X_val, y_val
            Optional validation features and target; passed to
            ``_preprocess_train``.
        time_limit
            Optional time budget in seconds, measured from the start of this
            call. A falsy value (``None`` or ``0``) means no limit is handed to
            the early-stopping callback.
        num_cpus
            CPU count used to derive the number of data-loading workers
            (half of it, or 0 when fork is not enabled). Defaults to
            ``defaults.cpus`` when ``None``.
        num_gpus
            ``0`` sets the fastai default device to CPU, any other non-``None``
            value sets it to CUDA, ``None`` leaves the default untouched.
        sample_weight
            Not supported; when given, a message is logged and the weights are
            ignored.
        **kwargs
            Ignored.
        """
        try_import_fastai_v1()
        import torch
        from fastai.layers import LabelSmoothingCrossEntropy
        from fastai.tabular import tabular_learner
        from fastai.utils.mod_display import progress_disabled_ctx
        from fastai.core import defaults
        from .callbacks import EarlyStoppingCallbackWithTimeLimit, SaveModelCallback

        start_time = time.time()
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training."
            )

        params = self.params.copy()

        # Work on a private copy of a configured target scaler, if any.
        self.y_scaler = params.get('y_scaler', None)
        if self.y_scaler is not None:
            self.y_scaler = copy.deepcopy(self.y_scaler)

        if num_cpus is None:
            num_cpus = defaults.cpus
        # additional workers are helping only when fork is enabled; in other mp modes, communication overhead reduces performance
        num_workers = int(num_cpus / 2)
        if not is_fork_enabled():
            num_workers = 0
        if num_gpus is not None:
            if num_gpus == 0:
                # TODO: Does not obviously impact inference speed
                defaults.device = torch.device('cpu')
            else:
                defaults.device = torch.device('cuda')

        logger.log(15, f'Fitting Neural Network with parameters {params}...')
        data = self._preprocess_train(X,
                                      y,
                                      X_val,
                                      y_val,
                                      num_workers=num_workers)

        nn_metric, objective_func_name = self.__get_objective_func_name()
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(
            objective_func_name)
        # Only error metrics are forced to 'min'. 'r2' is a higher-is-better
        # score, so it must not be minimized; it now takes the 'auto' path like
        # every other metric not listed here.
        # NOTE(review): assumes the callbacks' 'auto' mode maximizes monitors
        # that are not losses - confirm against the callback implementation.
        objective_optim_mode = 'min' if objective_func_name in [
            'root_mean_squared_error',
            'mean_squared_error',
            'mean_absolute_error',  # Regression error objectives
        ] else 'auto'

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if params.get('layers', None) is not None:
            layers = params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        else:
            base_size = max(len(data.classes) * 2, 100)
            layers = [base_size * 2, base_size]

        loss_func = None
        if self.problem_type in [BINARY, MULTICLASS
                                 ] and params.get('smoothing', 0.0) > 0.0:
            loss_func = LabelSmoothingCrossEntropy(params['smoothing'])

        # tabular_learner receives dropout as a list; wrap a scalar value.
        ps = params['ps']
        if not isinstance(ps, list):
            ps = [ps]

        if time_limit:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
        else:
            time_left = None

        best_epoch_stop = params.get("best_epoch",
                                     None)  # Use best epoch for refit_full.
        early_stopping_fn = partial(
            EarlyStoppingCallbackWithTimeLimit,
            monitor=objective_func_name_to_monitor,
            mode=objective_optim_mode,
            min_delta=params['early.stopping.min_delta'],
            patience=params['early.stopping.patience'],
            time_limit=time_left,
            best_epoch_stop=best_epoch_stop)

        self.model = tabular_learner(data,
                                     layers=layers,
                                     ps=ps,
                                     emb_drop=params['emb_drop'],
                                     metrics=nn_metric,
                                     loss_func=loss_func,
                                     callback_fns=[early_stopping_fn])
        logger.log(15, self.model.model)

        with make_temp_directory() as temp_dir:
            save_callback = SaveModelCallback(
                self.model,
                monitor=objective_func_name_to_monitor,
                mode=objective_optim_mode,
                name=self.name,
                best_epoch_stop=best_epoch_stop)
            with progress_disabled_ctx(self.model) as model:
                # Point the learner at the temporary directory while training,
                # then restore its original path.
                original_path = model.path
                model.path = Path(temp_dir)
                model.fit_one_cycle(params['epochs'],
                                    params['lr'],
                                    callbacks=save_callback)

                # Load the best one and export it
                model.load(self.name)

                # Index 0 of validate() is read for log_loss, index 1 otherwise.
                if objective_func_name == 'log_loss':
                    eval_result = model.validate()[0]
                else:
                    eval_result = model.validate()[1].numpy().reshape(-1)[0]

                logger.log(15, f'Model validation metrics: {eval_result}')
                model.path = original_path
            self.params_trained['best_epoch'] = save_callback.best_epoch