def testHasResourcesForTrialWithCaching(self):
    """Check ``has_resources_for_trial`` around a paused trial with actor reuse.

    Three trials are created: two share a placement group factory that
    requests ``self.head_cpus`` CPUs, the third requests one CPU fewer.
    The executor is limited to a single pending trial. The test asserts
    the resource answers before staging, once the first trial's placement
    group is ready, and again after the first trial was started, paused
    and the placement groups were reconciled.
    """
    pg_manager = _PlacementGroupManager()
    head_sized_pgf = PlacementGroupFactory([{"CPU": self.head_cpus}])
    smaller_pgf = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

    executor = RayTrialExecutor(reuse_actors=True)
    executor._pg_manager = pg_manager
    executor.set_max_pending_trials(1)

    def train(config):
        for value in (1, 2, 3, 4):
            yield value

    register_trainable("resettable", train)

    first = Trial("resettable", placement_group_factory=head_sized_pgf)
    second = Trial("resettable", placement_group_factory=head_sized_pgf)
    third = Trial("resettable", placement_group_factory=smaller_pgf)

    def restage():
        # A fresh list is passed on every call, exactly like the inline
        # list literals this helper replaces.
        executor._stage_and_update_status([first, second, third])

    def assert_only_head_sized_trials_fit():
        assert executor.has_resources_for_trial(first)
        assert executor.has_resources_for_trial(second)
        assert not executor.has_resources_for_trial(third)

    # Before anything is staged, every trial is reported as schedulable.
    for candidate in (first, second, third):
        assert executor.has_resources_for_trial(candidate)

    restage()

    # Poll once per second until the first trial's placement group is ready.
    while True:
        if pg_manager.has_ready(first):
            break
        time.sleep(1)
        restage()

    # Fill staging
    restage()

    assert_only_head_sized_trials_fit()

    executor._start_trial(first)
    restage()
    # Caches the PG and removes a PG from staging
    executor.pause_trial(first)

    assert len(pg_manager._staging_futures) == 0

    # This will re-schedule a placement group
    pg_manager.reconcile_placement_groups([first, second])

    assert len(pg_manager._staging_futures) == 1

    assert not pg_manager.can_stage()

    # We should still have resources for this trial as it has a cached PG
    assert_only_head_sized_trials_fit()
class RayExecutorQueueTest(unittest.TestCase):
    """Resource checks for a ``RayTrialExecutor`` built with ``queue_trials=True``.

    Every test runs against a fresh test cluster whose head node is started
    with ``num_cpus=1``; the cluster and the Ray connection are torn down
    after each test.
    """

    def setUp(self):
        """Start the one-CPU head node and create the executor under test."""
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_system_config": {
                    "num_heartbeats_timeout": 10
                }
            })
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        """Disconnect from Ray, stop the cluster and restore registrations."""
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    @staticmethod
    def _create_trial(cpu, gpu=0):
        """Return a ``__fake`` trial requesting ``cpu`` CPUs and ``gpu`` GPUs."""
        return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

    def _set_env_for_test(self, name, value):
        """Set ``os.environ[name]`` for this test only.

        The previous state (a value, or the variable being absent) is put
        back by a cleanup callback so the change cannot leak into tests that
        run later in the same process.
        """
        previous = os.environ.get(name)

        def restore():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous

        self.addCleanup(restore)
        os.environ[name] = value

    def testQueueTrial(self):
        """Check resource reporting for a CPU trial, then a GPU-only trial.

        A one-CPU trial must be reported as schedulable and is started;
        afterwards a trial asking for one GPU and no CPU must be reported
        as schedulable as well.
        """
        cpu_only = self._create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        # NOTE(review): the head node is only given num_cpus=1, so this
        # presumably passes because the executor uses queue_trials=True -
        # confirm against RayTrialExecutor.
        gpu_only = self._create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        """Check resource reporting before and after a second node is added.

        After a 1-CPU/1-GPU trial is started, a further 1-CPU trial must be
        reported as not schedulable until a node with one CPU and one GPU
        joins the cluster. Two 1-CPU trials are then started and a third
        must be reported as not schedulable.
        """
        # Once resource requests are deprecated, remove this test
        # The variable is set before any Trial is created and is restored
        # on cleanup instead of being left set for the rest of the run.
        self._set_env_for_test("TUNE_PLACEMENT_GROUP_AUTO_DISABLED", "1")

        gpu_trial = self._create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements is not often used.
        cpu_only_trial = self._create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = self._create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = self._create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))