Esempio n. 1
0
def test_fault_tolerance_hosts_remove_and_add_cooldown(ray_8_cpus):
    """Run a job while the discovered host list shrinks and then grows back.

    The discovery schedule lists three hosts, then only ``host-1``, then the
    same three hosts again. The executor is built with
    ``cooldown_range=[1, 1]``, and the test expects five results together
    with five events matching "started" and five matching "finished".
    """
    # NOTE(review): entries look like (duration, ["host:slots", ...]) with
    # None marking the final, open-ended stage -- confirm against
    # SimpleTestDiscovery.
    schedule = [
        (10, ['host-1:2', 'host-2:1', 'host-3:2']),
        (10, ['host-1:2']),
        (None, ['host-1:2', 'host-2:1', 'host-3:2']),
    ]
    expected_workers = 5

    with fault_tolerance_patches():
        # Use the first network interface reported by psutil.
        first_nic = list(psutil.net_if_addrs())[0]

        settings = RayExecutor.create_settings(nics={first_nic})
        settings.discovery = SimpleTestDiscovery(schedule)
        executor = RayExecutor(
            settings,
            min_workers=1,
            cpus_per_worker=1,
            override_discovery=False,
            cooldown_range=[1, 1],
        )

        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == expected_workers

        events = trace.fetch()
        started = [event for event in events if "started" in event]
        assert len(started) == expected_workers, events
        finished = [event for event in events if "finished" in event]
        assert len(finished) == expected_workers, events
Esempio n. 2
0
def test_min_num_proc(ray_8_cpus):
    """Run a job whose executor is pinned to exactly four workers.

    The discovery schedule lists one host, then three, then four. The
    executor is created with ``min_workers=4`` and ``max_workers=4``, and the
    test expects four results together with four events matching "started"
    and four matching "finished".
    """
    # NOTE(review): entries look like (duration, ["host:slots", ...]) with
    # None marking the final, open-ended stage -- confirm against
    # SimpleTestDiscovery.
    schedule = [
        (10, ['host-1:1']),
        (10, ['host-1:1', 'host-4:1', 'host-5:1']),
        (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
    ]
    expected_workers = 4

    with fault_tolerance_patches():
        # Use the first network interface reported by psutil.
        first_nic = list(psutil.net_if_addrs())[0]

        settings = RayExecutor.create_settings(nics={first_nic})
        settings.discovery = SimpleTestDiscovery(schedule)
        executor = RayExecutor(
            settings,
            min_workers=4,
            max_workers=4,
            override_discovery=False,
        )

        training_fn = _create_training_function(iterations=30)
        executor.start()
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == expected_workers

        events = trace.fetch()
        started = [event for event in events if "started" in event]
        assert len(started) == expected_workers, events
        finished = [event for event in events if "finished" in event]
        assert len(finished) == expected_workers, events
Esempio n. 3
0
def test_ray_exec_func(ray_start_4_cpus):
    """Run a plain function through ``RayExecutor`` on one host with 4 slots.

    Each worker returns ``hvd.rank() * num_epochs``. The function is invoked
    with ``num_epochs=0``, so every rank yields the same value and the set of
    results must contain exactly one element.

    The executor is shut down in a ``finally`` block so that a failing run or
    assertion does not leave the started executor behind.
    """
    def simple_fn(num_epochs):
        # Imported inside the function so the import happens where the
        # function is executed.
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank() * num_epochs

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    # start() stays outside the try: if it fails, calling shutdown() on a
    # half-initialised executor could mask the original error.
    hjob.start()
    try:
        result = hjob.run(simple_fn, args=[0])
        assert len(set(result)) == 1
    finally:
        hjob.shutdown()