Example #1
0
 def write_checkpoint(self, checkpoint: Dict):
     # If inside a Tune Trainable, then checkpoint with Tune.
     with tune.checkpoint_dir(
             step=self._latest_checkpoint_id) as checkpoint_dir:
         source_ip = checkpoint[NODE_IP_KEY]
         source_path = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
         target_ip = get_node_ip_address()
         if source_ip == target_ip:
             # Move contents of source_path, but not source_path
             # itself. shutil.move is already recursive.
             for path in Path(source_path).iterdir():
                 shutil.move(str(path.absolute()), checkpoint_dir)
             shutil.rmtree(source_path, ignore_errors=True)
         else:
             sync_dir_between_nodes(
                 source_ip=source_ip,
                 source_path=source_path,
                 target_ip=target_ip,
                 target_path=checkpoint_dir,
                 return_futures=False,
                 max_size_bytes=None,
             )
             delete_on_node(node_ip=source_ip, path=source_path)
         checkpoint_dir = Path(checkpoint_dir)
         save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)
         # add tune checkpoint id
         with open(checkpoint_dir.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
             f.write(str(self._latest_checkpoint_id))
Example #2
0
            def save_checkpoint(self, tmp_checkpoint_dir: str = ""):
                checkpoint_path = super().save_checkpoint()
                parent_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path)

                preprocessor = self._merged_config.get("preprocessor", None)
                if parent_dir and preprocessor:
                    save_preprocessor_to_dir(preprocessor, parent_dir)
                return checkpoint_path
Example #3
0
            def save_checkpoint(self, checkpoint_dir: str):
                checkpoint_path = super(AIRRLTrainer, self).save_checkpoint(
                    checkpoint_dir
                )

                trainer_class_path = os.path.join(checkpoint_dir, RL_TRAINER_CLASS_FILE)
                with open(trainer_class_path, "wb") as fp:
                    cpickle.dump(self.__class__, fp)

                config_path = os.path.join(checkpoint_dir, RL_CONFIG_FILE)
                with open(config_path, "wb") as fp:
                    cpickle.dump(self.config, fp)

                if preprocessor:
                    save_preprocessor_to_dir(preprocessor, checkpoint_dir)

                return checkpoint_path
Example #4
0
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
Example #5
0
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = LightGBMPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving model to disk instead
        # of directly to the dict as bytes is due to all callbacks
        # following save to disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = LightGBMPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(
        predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
Example #6
0
    def training_loop(self) -> None:
        register_ray()

        self.estimator.set_params(**self.params)

        datasets = self._get_datasets()
        X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)
        groups = None
        if "cv_groups" in X_train.columns:
            groups = X_train["cv_groups"]
            X_train = X_train.drop("cv_groups", axis=1)

        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config)

        num_workers = scaling_config_dataclass.num_workers or 0
        assert num_workers == 0  # num_workers is not in scaling config allowed_keys

        trainer_resources = scaling_config_dataclass.trainer_resources or {
            "CPU": 1
        }
        has_gpus = bool(trainer_resources.get("GPU", 0))
        num_cpus = int(trainer_resources.get("CPU", 1))

        # see https://scikit-learn.org/stable/computing/parallelism.html
        os.environ["OMP_NUM_THREADS"] = str(num_cpus)
        os.environ["MKL_NUM_THREADS"] = str(num_cpus)
        os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
        os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

        parallelize_cv = self._get_cv_parallelism(has_gpus)
        if self.set_estimator_cpus:
            num_estimator_cpus = 1 if parallelize_cv else num_cpus
            set_cpu_params(self.estimator, num_estimator_cpus)

        with parallel_backend("ray", n_jobs=num_cpus):
            start_time = time()
            self.estimator.fit(X_train, y_train, **self.fit_params)
            fit_time = time() - start_time

            with tune.checkpoint_dir(step=1) as checkpoint_dir:
                with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                    cpickle.dump(self.estimator, f)

                if self.preprocessor:
                    save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

            if self.label_column:
                validation_set_scores = self._score_on_validation_sets(
                    self.estimator, datasets)
                cv_scores = self._score_cv(
                    self.estimator,
                    X_train,
                    y_train,
                    groups,
                    # if estimator has parallelism, use that. Otherwise,
                    # parallelize CV
                    n_jobs=1 if not parallelize_cv else num_cpus,
                )
            else:
                validation_set_scores = {}
                cv_scores = {}

        # cv_scores will not override validation_set_scores as we
        # check for that during initialization
        results = {
            **validation_set_scores,
            **cv_scores,
            "fit_time": fit_time,
        }
        tune.report(**results)