Example #1
    def testAllocateFreeResourcesWithIncreaseByTimes(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(
                add_bundles=True, increase_by={"GPU": 2}, increase_by_times=2
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        decision = scheduler.on_trial_result(
            self.trial_runner, trial1, {"metric": 1, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 2)
        )

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial2, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 2)
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 3)
        )
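
For orientation, a scheduler configured this way plugs into an ordinary Tune run. A minimal sketch, assuming a Ray 1.x-style API; train_fn is a hypothetical trainable and the exact import path of DistributeResources may differ between Ray versions:

    from ray import tune
    from ray.tune import PlacementGroupFactory
    from ray.tune.schedulers.resource_changing_scheduler import (
        DistributeResources,
        ResourceChangingScheduler,
    )

    def train_fn(config):
        # Hypothetical training loop that reports a metric each iteration.
        for i in range(100):
            tune.report(metric=i)

    tune.run(
        train_fn,
        num_samples=4,
        resources_per_trial=PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}]),
        scheduler=ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(
                add_bundles=True, increase_by={"GPU": 2}, increase_by_times=2
            )
        ),
    )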
Example #2
    def testAllocateFreeResourcesWithIncreaseBy(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResourcesToTopJob(
                add_bundles=False,
                increase_by={"CPU": 2, "GPU": 2},
                metric="metric",
                mode="max",
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 2, "GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        decision = scheduler.on_trial_result(
            self.trial_runner, trial2, {"metric": 0.9, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        decision = scheduler.on_trial_result(
            self.trial_runner, trial1, {"metric": 1.0, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 4, "GPU": 4}])
        )

        decision = scheduler.on_trial_result(
            self.trial_runner, trial2, {"metric": 1.1, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial2, scheduler, PlacementGroupFactory([{"CPU": 4, "GPU": 4}]), metric=1.1
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 8, "GPU": 8}]), metric=1.2
        )
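
Unlike plain DistributeResources, DistributeResourcesToTopJob funnels freed resources to the trial currently leading on the configured metric (here "metric" with mode="max"), which is why the allocations above follow the reported scores. A function trainable can pick up a resized allocation between reports; a minimal sketch, assuming a Ray version that exposes tune.get_trial_resources():

    from ray import tune

    def train_fn(config):
        while True:
            # ... one training step ...
            tune.report(metric=0.0)  # placeholder metric
            # Re-read the (possibly resized) allocation after each report.
            pgf = tune.get_trial_resources()
            num_gpus = pgf.required_resources.get("GPU", 0)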
Example #3
    def testDeallocateResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResourcesToTopJob(
                add_bundles=False, increase_by={"GPU": 2}, metric="metric", mode="max"
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1, "GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)
        trial1.placement_group_factory = PlacementGroupFactory([{"CPU": 1, "GPU": 4}])
        trial4.status = Trial.PENDING

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1, "GPU": 2}])
        )
Example #4
            def default_resource_request(cls, config):
                # Fall back to the enclosing function's kwargs for defaults.
                num_workers = config.get("num_workers", kwargs.get("num_workers", 1))
                num_cpus_per_worker = config.get(
                    "num_cpus_per_worker", kwargs.get("num_cpus_per_worker", 1)
                )
                use_gpu = config.get("use_gpu", kwargs.get("use_gpu"))
                use_local = config.get("use_local", kwargs.get("use_local", False))

                bundles = []

                if not use_local:
                    # We need a separate bundle for the driver.
                    bundles += [{"CPU": 1}]

                # One worker bundle per worker.
                bundles += [
                    {"CPU": num_cpus_per_worker, "GPU": int(use_gpu)}
                ] * num_workers

                return PlacementGroupFactory(bundles, strategy="PACK")
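
The function above is defined inside a factory, so kwargs refers to the factory's own arguments. A self-contained sketch of the same pattern on a Trainable subclass; MyTrainable and the config keys are illustrative:

    from ray.tune import PlacementGroupFactory, Trainable

    class MyTrainable(Trainable):
        @classmethod
        def default_resource_request(cls, config):
            num_workers = config.get("num_workers", 1)
            # One driver bundle plus one bundle per worker.
            bundles = [{"CPU": 1}] + [{"CPU": 1}] * num_workers
            return PlacementGroupFactory(bundles, strategy="PACK")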
Example #5
    def pause(self, trial_runner):
        """Pause the AdaptDLTrial with a checkpoint.

        We try to remove the PG attached to this trial.
        """
        assert self.runner is not None
        checkpoint_obj = ray.get(
            self.runner.save_all_states.remote(self.runner.get_state.remote())
        )
        # Serialize to disk
        temp_checkpoint_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
        checkpoint_path = TrainableUtil.create_from_pickle(
            checkpoint_obj, temp_checkpoint_dir
        )

        # Trial will be restored from the checkpoint_path when it's resumed
        self.restore_path = checkpoint_path

        # Clear the allocation. This is a hack to clear the PG associated with
        # the trial. We assign a temporary PG which will get replaced with a
        # real PG once we resume the trial. This is needed because Tune likes
        # to keep the PGs around even for PAUSED trials.
        self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])
        # This forces Tune to garbage-collect unneeded PGs which can then be
        # reused
        trial_runner.trial_executor._pg_manager.reconcile_placement_groups([self])
        logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
Example #6
    def testDeallocateResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(
                add_bundles=True, increase_by={"GPU": 2}
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)
        trial1.placement_group_factory = PlacementGroupFactory(
            [{"CPU": 1}] + [{"GPU": 2}] * 2
        )
        trial4.status = Trial.PENDING

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        )
Example #7
    def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
        # `use_gpu`, `resources_per_worker` and `num_workers` are captured
        # from the enclosing scope in the original source.
        trainer_bundle = [{"CPU": 1}]
        worker_resources = {"CPU": 1, "GPU": int(use_gpu)}
        worker_resources_extra = (
            {} if resources_per_worker is None else resources_per_worker
        )
        worker_bundles = [
            {**worker_resources, **worker_resources_extra} for _ in range(num_workers)
        ]
        bundles = trainer_bundle + worker_bundles
        return PlacementGroupFactory(bundles, strategy="PACK")
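
PlacementGroupFactory can flatten its bundles into a single resource dict via required_resources, which is handy for sanity-checking a request like the one above; a quick check with illustrative numbers:

    from ray.tune import PlacementGroupFactory

    pgf = PlacementGroupFactory(
        [{"CPU": 1}] + [{"CPU": 1, "GPU": 1}] * 2, strategy="PACK"
    )
    print(pgf.required_resources)  # e.g. {"CPU": 3, "GPU": 2}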
Example #8
    def testAllocateFreeResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(add_bundles=False)
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 2}])
        )
        self._allocateAndAssertNewResources(
            trial2, scheduler, PlacementGroupFactory([{"CPU": 2}])
        )

        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 3}])
        )

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 4}])
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 8}])
        )
Example #9
    def get_tune_ddp_resources(
        num_workers: int = 1, cpus_per_worker: int = 1, use_gpu: bool = False
    ) -> "PlacementGroupFactory":
        """Returns the PlacementGroupFactory to use for Ray Tune."""
        from ray.tune import PlacementGroupFactory

        head_bundle = {"CPU": 1}
        child_bundle = {"CPU": cpus_per_worker, "GPU": int(use_gpu)}
        child_bundles = [child_bundle.copy() for _ in range(num_workers)]
        bundles = [head_bundle] + child_bundles
        return PlacementGroupFactory(bundles, strategy="PACK")
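
For a quick sanity check, the bundles this helper builds can be inspected directly; with two GPU workers the code above yields a head bundle plus two worker bundles:

    pgf = get_tune_ddp_resources(num_workers=2, use_gpu=True)
    # Bundles: [{"CPU": 1}, {"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}]
    print(pgf.required_resources)  # e.g. {"CPU": 3, "GPU": 2}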
Example #10
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[
                {
                    # Single CPU for the local worker. This CPU will host the
                    # main model in this example (num_workers=0).
                    "CPU": 1,
                    # Possibly add n GPUs to this.
                    "GPU": cf["num_gpus"],
                },
                {
                    # Different bundle (meaning: possibly different node)
                    # for your n "remote" envs (set remote_worker_envs=True).
                    "CPU": cf["num_envs_per_worker"],
                }
            ],
            strategy=config.get("placement_strategy", "PACK"))
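
An override like this lives on a Trainer subclass; when that class is passed to Tune, the bundles above are reserved before the trial starts. A hypothetical launch, assuming a Ray 1.x-era RLlib (MyTrainer stands in for the subclass defining the override above):

    from ray import tune

    tune.run(
        MyTrainer,
        config={"num_gpus": 1, "num_envs_per_worker": 4, "remote_worker_envs": True},
    )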
Example #11
def _get_tune_resources(
    num_actors: int,
    cpus_per_actor: int,
    gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
):
    """Returns the object to use for ``resources_per_trial`` with Ray Tune."""
    if TUNE_INSTALLED:
        if not TUNE_USING_PG:
            # Legacy resource dict used before placement group support.
            resources_per_actor = resources_per_actor or {}
            extra_custom_resources = {
                k: v * num_actors for k, v in resources_per_actor.items()
            }
            return dict(
                cpu=1,
                extra_cpu=cpus_per_actor * num_actors,
                extra_gpu=gpus_per_actor * num_actors,
                extra_custom_resources=extra_custom_resources,
            )
        else:
            from ray.tune import PlacementGroupFactory

            # One head bundle for the driver plus one bundle per actor.
            head_bundle = {"CPU": 1}
            child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor}
            child_bundle_extra = (
                {} if resources_per_actor is None else resources_per_actor
            )
            child_bundles = [
                {**child_bundle, **child_bundle_extra} for _ in range(num_actors)
            ]
            bundles = [head_bundle] + child_bundles
            return PlacementGroupFactory(bundles, strategy="PACK")
    else:
        raise RuntimeError("Tune is not installed, so `get_tune_resources` is "
                           "not supported. You can install Ray Tune via `pip "
                           "install ray[tune]`.")