コード例 #1
0
ファイル: test_trainer.py プロジェクト: rlan/ray
def test_worker_start_failure(ray_start_2_cpus):
    """Start a trainer whose initialization hook calls ``exit_actor``.

    ``BackendExecutor._restart`` is patched so that a restart first swaps in
    a no-op initialization hook and then delegates to the original restart.
    After ``start`` returns, the worker group must contain two workers.
    """
    trainer = Trainer(TestConfig(), num_workers=2)

    # Captured before patching so the replacement can still reach the
    # executor's real restart logic.
    original_restart = trainer._executor._restart

    def noop_hook():
        return None

    def exiting_hook():
        ray.actor.exit_actor()

    def restart_with_noop_hook(self):
        self._initialization_hook = noop_hook
        original_restart()

    with patch.object(BackendExecutor, "_restart", restart_with_noop_hook):
        trainer.start(initialization_hook=exiting_hook)
        worker_count = len(trainer._executor.worker_group)
        assert worker_count == 2
コード例 #2
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_report(ray_start_2_cpus):
    """Report three times per worker and inspect what the callback recorded.

    The run must return ``[1, 1]``, and the callback must hold three rounds
    of results, each with one entry per worker carrying the round's index.
    """
    config = TestConfig()

    def train_func():
        for step in range(3):
            sgd.report(index=step)
        return 1

    recorder = TestCallback()
    trainer = Trainer(config, num_workers=2)
    trainer.start()
    final_values = trainer.run(train_func, callbacks=[recorder])
    assert final_values == [1, 1]

    rounds = recorder.result_list
    assert len(rounds) == 3
    for expected_index, per_worker in enumerate(rounds):
        assert len(per_worker) == 2
        for worker_result in per_worker:
            assert worker_result["index"] == expected_index
コード例 #3
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_load_checkpoint(ray_start_2_cpus):
    """Pass a checkpoint to ``run`` and verify the workers can load it.

    Each worker asserts that the loaded checkpoint has ``epoch == 3`` and
    returns the epochs from there up to (not including) 5.
    """
    config = TestConfig()

    def train_func_checkpoint():
        loaded = sgd.load_checkpoint()
        assert loaded is not None
        assert loaded["epoch"] == 3

        # Continue from the checkpointed epoch.
        return list(range(loaded["epoch"], 5))

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    per_worker = trainer.run(
        train_func_checkpoint, checkpoint={"epoch": 3})

    assert per_worker is not None
    assert len(per_worker) == 2
    first_worker, second_worker = per_worker[0], per_worker[1]
    assert first_worker == [3, 4]
    assert second_worker == [3, 4]
コード例 #4
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_fast_slow(ray_start_2_cpus):
    """Run a fast training function alongside a sleeping variant.

    ``train_slow`` performs the same checkpoint/report sequence as ``train``
    but sleeps five seconds around each report. It is handed to
    ``gen_new_backend_executor``, and the class that helper returns replaces
    ``BackendExecutor`` in the trainer module for the run. The latest
    checkpoint and the callback contents must still reflect both iterations.
    """
    test_config = TestConfig()

    def train():
        for step in range(2):
            sgd.save_checkpoint(epoch=step)
            sgd.report(index=step)

    def train_slow():
        for step in range(2):
            sgd.save_checkpoint(epoch=step)
            time.sleep(5)
            sgd.report(index=step)
            time.sleep(5)

    patched_executor_cls = gen_new_backend_executor(train_slow)
    recorder = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      patched_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train, callbacks=[recorder])

    # The checkpoint from the second (last) iteration must be the latest.
    assert trainer.latest_checkpoint["epoch"] == 1

    rounds = recorder.result_list
    assert len(rounds) == 2
    for expected_index, per_worker in enumerate(rounds):
        assert len(per_worker) == 2
        for worker_result in per_worker:
            assert worker_result["index"] == expected_index
コード例 #5
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    """Expect a RuntimeError when one function skips a checkpoint.

    ``train`` checkpoints and reports on each of two iterations, whereas
    ``train_mismatch`` (handed to ``gen_new_backend_executor``) leaves out
    the second checkpoint. After the failure only the first iteration's
    checkpoint and reports should be visible.
    """
    test_config = TestConfig()

    def train():
        for step in range(2):
            sgd.save_checkpoint(epoch=step)
            sgd.report(index=step)

    def train_mismatch():
        sgd.save_checkpoint(epoch=0)
        sgd.report(index=0)
        # Deliberately no checkpoint before the second report.
        sgd.report(index=1)

    patched_executor_cls = gen_new_backend_executor(train_mismatch)
    recorder = TestCallback()

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      patched_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train, callbacks=[recorder])

    # Only the first iteration's checkpoint was kept.
    assert trainer.latest_checkpoint["epoch"] == 0

    # Exactly one round reached the callback, with an entry per worker.
    rounds = recorder.result_list
    assert len(rounds) == 1
    first_round = rounds[0]
    assert len(first_round) == 2
    for worker_result in first_round:
        assert worker_result["index"] == 0
コード例 #6
0
ファイル: test_callbacks.py プロジェクト: hngenc/ray
def test_json(ray_start_4_cpus, make_temp_dir, workers_to_log, detailed,
              filename):
    """Run a reporting function with ``JsonLoggerCallback`` and check the log.

    The environment variable named by
    ``ENABLE_DETAILED_AUTOFILLED_METRICS_ENV`` is set to ``"1"`` when
    ``detailed`` is truthy and removed otherwise. The written JSON file must
    contain one entry per iteration, each with one record per logged worker,
    and every record must carry the expected auto-filled keys.
    """
    if detailed:
        os.environ[ENABLE_DETAILED_AUTOFILLED_METRICS_ENV] = "1"
    else:
        os.environ.pop(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    # None -> every worker is logged; a single int -> one worker; otherwise
    # the argument is treated as a sized collection of workers.
    if workers_to_log is None:
        expected_logged = num_workers
    elif isinstance(workers_to_log, int):
        expected_logged = 1
    else:
        expected_logged = len(workers_to_log)

    def train_func():
        for step in range(num_iters):
            sgd.report(index=step)
        return 1

    # Only forward ``filename`` when one was given, so that the callback
    # falls back to its own default otherwise.
    callback_kwargs = {"workers_to_log": workers_to_log}
    if filename is not None:
        callback_kwargs["filename"] = filename
    callback = JsonLoggerCallback(make_temp_dir, **callback_kwargs)
    if filename is None:
        expected_name = JsonLoggerCallback._default_filename
    else:
        expected_name = filename
    assert str(callback.log_path.name) == expected_name

    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == expected_logged

    # Every iteration logs the same number of workers as the first one.
    first_len = len(log[0])
    for element in log:
        assert len(element) == first_len

    # The reported index trails the auto-filled iteration counter by one.
    for element in log:
        for worker in element:
            assert worker["index"] == worker[TRAINING_ITERATION] - 1

    for element in log:
        for worker in element:
            for key in BASIC_AUTOFILLED_KEYS:
                assert key in worker

    # Detailed keys must be all present or all absent, depending on the flag.
    if detailed:
        for element in log:
            for worker in element:
                for key in DETAILED_AUTOFILLED_KEYS:
                    assert key in worker
    else:
        for element in log:
            for worker in element:
                for key in DETAILED_AUTOFILLED_KEYS:
                    assert key not in worker
コード例 #7
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_worker_failure_2(ray_start_2_cpus):
    """Expect ``run`` to return ``[1, 1]`` despite an exiting function.

    ``train_actor_failure`` reports twice and then calls ``sys.exit(0)``. It
    is handed to ``gen_new_backend_executor``, and the class that helper
    returns replaces ``BackendExecutor`` in the trainer module for the run.
    """
    test_config = TestConfig()

    def train():
        for _ in range(2):
            sgd.report(loss=1)
        return 1

    def train_actor_failure():
        import sys

        for _ in range(2):
            sgd.report(loss=1)
        sys.exit(0)

    patched_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      patched_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        final_values = trainer.run(train)
        assert final_values == [1, 1]
コード例 #8
0
def train_linear(num_workers=1):
    """Run ``train_func`` on a Torch trainer using the ``gloo`` backend.

    Args:
        num_workers: Number of workers handed to the ``Trainer``.

    Returns:
        Whatever ``Trainer.run`` returns for ``train_func``; the value is
        also printed.
    """
    backend_config = TorchConfig(backend="gloo")
    trainer = Trainer(backend_config, num_workers=num_workers)
    hyperparams = {
        "lr": 1e-2,
        "hidden_size": 1,
        "batch_size": 4,
        "epochs": 3,
    }
    trainer.start()
    # Results reported during training are logged under ./sgd_results.
    json_logger = JsonLoggerCallback("./sgd_results")
    results = trainer.run(train_func, hyperparams, callbacks=[json_logger])
    trainer.shutdown()

    print(results)
    return results
コード例 #9
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_horovod_simple(ray_start_2_cpus):
    """Initialize Horovod on each worker and collect the reported ranks.

    The list returned by ``run`` must equal ``[0, 1]``.
    """
    num_workers = 2

    def simple_fn():
        hvd_torch.init()
        rank = hvd_torch.rank()
        return rank

    trainer = Trainer("horovod", num_workers)
    trainer.start()
    ranks = trainer.run(simple_fn)
    trainer.shutdown()

    expected_ranks = list(range(num_workers))
    assert ranks == expected_ranks
コード例 #10
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_run_iterator(ray_start_2_cpus):
    """Consume intermediate results through ``Trainer.run_iterator``.

    The training function reports three times and returns 1. The iterator
    must yield three rounds of results whose ``index`` matches the round
    number, then report itself finished, expose ``[1, 1]`` as the final
    results, and raise ``StopIteration`` on a further ``next`` call.
    """
    config = TestConfig()

    def train_func():
        for i in range(3):
            sgd.report(index=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    iterator = trainer.run_iterator(train_func)

    count = 0
    for results in iterator:
        # all() is required: a bare generator expression is always truthy,
        # so asserting it directly would never inspect the reported indices.
        assert all(value["index"] == count for value in results)
        count += 1

    assert count == 3
    assert iterator.is_finished()
    assert iterator.get_final_results() == [1, 1]

    # The iterator is exhausted, so asking for more must fail.
    with pytest.raises(StopIteration):
        next(iterator)
コード例 #11
0
ファイル: tensorflow_mnist_example.py プロジェクト: rlan/ray
def train_tensorflow_mnist(num_workers=2, use_gpu=False):
    """Run ``train_func`` on a TensorFlow trainer and print the first result.

    Args:
        num_workers: Number of workers handed to the ``Trainer``.
        use_gpu: Forwarded to the ``Trainer`` as ``use_gpu``.
    """
    hyperparams = {"lr": 1e-3, "batch_size": 64, "epochs": 4}
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(train_func=train_func, config=hyperparams)
    trainer.shutdown()
    # Only the first element of the returned results is shown.
    print(f"Results: {results[0]}")
コード例 #12
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_run(ray_start_2_cpus):
    """Run a trivial function on two workers and check both return 1."""
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    per_worker = trainer.run(train_func)
    trainer.shutdown()

    assert len(per_worker) == 2
    for value in per_worker:
        assert value == 1
コード例 #13
0
def test_dataset_pipeline(ray_start_4_cpus):
    """Check that a repeated dataset pipeline is sharded on every epoch.

    Each worker pulls ``num_epochs`` datasets from its pipeline shard and
    returns the rows seen per epoch; ``check_dataset_output`` validates the
    combined output.
    """
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        epoch_datasets = sgd.get_dataset_shard().iter_datasets()
        rows_per_epoch = []
        for _ in range(num_epochs):
            current = next(epoch_datasets)
            # Flatten this epoch's batches into a single list of rows.
            rows_per_epoch.append(
                [row for batch in current.iter_batches() for row in batch])
        return rows_per_epoch

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    per_worker = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, per_worker)
コード例 #14
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_torch_fashion_mnist_gpu(ray_start_2_cpus_2_gpus):
    """Run the Fashion-MNIST training function on two GPU workers.

    Every worker must return one value per epoch, and the last value must be
    smaller than the first.
    """
    num_workers, epochs = 2, 3

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    hyperparams = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    per_worker = trainer.run(fashion_mnist_train_func, hyperparams)
    trainer.shutdown()

    assert len(per_worker) == num_workers

    for history in per_worker:
        assert len(history) == epochs
        first, last = history[0], history[-1]
        assert last < first
コード例 #15
0
def train_linear(num_workers=2, use_gpu=False):
    """Run ``train_func`` on a Torch trainer with datasets and loggers.

    Args:
        num_workers: Number of workers handed to the ``Trainer``.
        use_gpu: Forwarded to the ``Trainer`` as ``use_gpu``.

    Returns:
        Whatever ``Trainer.run`` returns for ``train_func``; the value is
        also printed.
    """
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    hyperparams = {
        "lr": 1e-2,
        "hidden_size": 1,
        "batch_size": 4,
        "epochs": 3,
    }
    trainer.start()
    # Both logger callbacks are created with their default arguments.
    loggers = [JsonLoggerCallback(), TBXLoggerCallback()]
    results = trainer.run(
        train_func, hyperparams, dataset=datasets, callbacks=loggers)
    trainer.shutdown()
    print(results)
    return results
コード例 #16
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_run_config(ray_start_2_cpus):
    """Pass a config dict to ``run`` and check each worker received it."""
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    train_config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    per_worker = trainer.run(train_func, train_config)
    trainer.shutdown()

    assert len(per_worker) == 2
    for fruit in per_worker:
        assert fruit == "banana"
コード例 #17
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_torch_linear(ray_start_2_cpus):
    """Run the linear training function on two Torch workers.

    Every worker must return one record per epoch, and the ``"loss"`` of the
    last record must be smaller than that of the first.
    """
    num_workers, epochs = 2, 3

    trainer = Trainer("torch", num_workers=num_workers)
    hyperparams = {
        "lr": 1e-2,
        "hidden_size": 1,
        "batch_size": 4,
        "epochs": epochs,
    }
    trainer.start()
    per_worker = trainer.run(linear_train_func, hyperparams)
    trainer.shutdown()

    assert len(per_worker) == num_workers

    for history in per_worker:
        assert len(history) == epochs
        first_loss = history[0]["loss"]
        last_loss = history[-1]["loss"]
        assert last_loss < first_loss
コード例 #18
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_horovod_torch_mnist_gpu(ray_start_2_cpus_2_gpus):
    """Run the Horovod Torch training function on two GPU workers.

    Every worker must return one value per epoch, and the value for the last
    epoch must be smaller than the value for the first.
    """
    num_workers, num_epochs = 2, 2
    train_config = {"num_epochs": num_epochs, "lr": 1e-3}

    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    per_worker = trainer.run(horovod_torch_train_func, config=train_config)
    trainer.shutdown()

    assert len(per_worker) == num_workers
    last_epoch = num_epochs - 1
    for history in per_worker:
        assert len(history) == num_epochs
        assert history[last_epoch] < history[0]
コード例 #19
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_run_after_user_error(ray_start_2_cpus):
    """Check that the trainer can run again after a user function raised."""
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    def train():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    # The user error must surface from the first run ...
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    # ... and a subsequent run on the same trainer must still succeed.
    second_output = trainer.run(train)
    assert second_output == [1, 1]
コード例 #20
0
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    """Run ``train_func`` on a TensorFlow trainer fed by a dataset pipeline.

    Args:
        num_workers: Number of workers handed to the ``Trainer``.
        use_gpu: Forwarded to the ``Trainer`` as ``use_gpu``.

    Returns:
        Whatever ``Trainer.run`` returns for ``train_func``; its first
        element is also printed.
    """
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(
        backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    hyperparams = {"lr": 1e-3, "batch_size": 32, "epochs": 4}
    results = trainer.run(
        train_func=train_func, dataset=dataset_pipeline, config=hyperparams)
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
コード例 #21
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_user_error(ray_start_2_cpus):
    """Check that errors raised by the user's training function propagate.

    Two cases are covered on the same trainer: a function that raises
    immediately, and one that reports twice before raising.
    """
    config = TestConfig()

    def fail_train_1():
        raise NotImplementedError

    def fail_train_2():
        for _ in range(2):
            sgd.report(loss=1)
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    # Case 1: failure before any report.
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_1)

    # Case 2: failure after two reports.
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train_2)
コード例 #22
0
ファイル: test_trainer.py プロジェクト: TanjaBayer/ray
def test_worker_failure_1(ray_start_2_cpus):
    """Expect RuntimeError when a function exits before doing any work.

    ``train_actor_failure`` calls ``sys.exit(0)`` immediately. It is handed
    to ``gen_new_backend_executor``, and the class that helper returns
    replaces ``BackendExecutor`` in the trainer module for the first run. A
    second run, outside the patch, must raise ``RuntimeError`` as well.
    """
    test_config = TestConfig()

    def train():
        return 1

    def train_actor_failure():
        import sys

        sys.exit(0)

    patched_executor_cls = gen_new_backend_executor(train_actor_failure)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      patched_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)

    # The trainer should have been shut down by the worker failure, so it
    # must refuse to run again.
    with pytest.raises(RuntimeError):
        trainer.run(train)
コード例 #23
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_start_shutdown(ray_start_2_cpus, num_workers):
    """Check CPU availability before start, after start and after shutdown.

    Two CPUs must be available initially, ``2 - num_workers`` while the
    trainer is running (with the ``"CPU"`` key absent when none remain), and
    two again after shutdown.
    """
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    # Pause before reading the resource view again.
    time.sleep(1)

    remaining = 2 - num_workers
    available = ray.available_resources()
    if remaining:
        assert available["CPU"] == remaining
    else:
        # With nothing left, the key is expected to be missing entirely.
        assert "CPU" not in available

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
コード例 #24
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_tensorflow_mnist_gpu(ray_start_2_cpus_2_gpus):
    """Run the TensorFlow MNIST training function on two GPU workers.

    Only the first worker's result is inspected: it must hold one loss and
    one accuracy value per epoch, with loss decreasing and accuracy
    increasing between the first and last epoch.
    """
    num_workers, epochs = 2, 3

    trainer = Trainer("tensorflow", num_workers=num_workers, use_gpu=True)
    hyperparams = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    per_worker = trainer.run(tensorflow_mnist_train_func, hyperparams)
    trainer.shutdown()

    assert len(per_worker) == num_workers
    first_worker = per_worker[0]

    loss_history = first_worker["loss"]
    assert len(loss_history) == epochs
    assert loss_history[-1] < loss_history[0]

    accuracy_history = first_worker["accuracy"]
    assert len(accuracy_history) == epochs
    assert accuracy_history[-1] > accuracy_history[0]
コード例 #25
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_multiple_run(ray_start_2_cpus):
    """Run two different functions back to back on the same trainer."""
    config = TestConfig()

    def train_1():
        return 1

    def train_2():
        return 2

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    first_output = trainer.run(train_1)
    assert first_output == [1, 1]

    second_output = trainer.run(train_2)
    assert second_output == [2, 2]
コード例 #26
0
def test_multiple_datasets(ray_start_4_cpus):
    """Check that several named datasets are sharded across workers.

    A ``"train"`` and a ``"val"`` dataset are passed to ``run`` as a dict.
    Each worker reads its shard of both datasets once per epoch and returns
    the rows it saw; ``check_dataset_output`` then validates the combined
    output for each dataset separately.
    """
    num_epochs = 2
    num_data_1 = 10
    num_data_2 = 6

    train_data = ray.data.range(num_data_1)
    val_data = ray.data.range(num_data_2)

    def get_dataset():
        data_train_all_epochs = []
        data_val_all_epochs = []
        # Loop over num_epochs (not a literal) so the data produced here
        # always matches the epoch count the checks below verify against.
        for _ in range(num_epochs):
            data_this_epoch_train = []
            train_dataset = sgd.get_dataset_shard("train")
            for batch in train_dataset.iter_batches():
                data_this_epoch_train.extend(batch)
            data_train_all_epochs.append(data_this_epoch_train)

            data_this_epoch_val = []
            val_dataset = sgd.get_dataset_shard("val")
            for batch in val_dataset.iter_batches():
                data_this_epoch_val.extend(batch)
            data_val_all_epochs.append(data_this_epoch_val)

        return data_train_all_epochs, data_val_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset,
                          dataset={
                              "train": train_data,
                              "val": val_data
                          })
    # Each worker returns a (train, val) pair; validate the two separately.
    check_dataset_output(num_data_1, num_epochs,
                         [worker_data[0] for worker_data in results])
    check_dataset_output(num_data_2, num_epochs,
                         [worker_data[1] for worker_data in results])
    trainer.shutdown()
コード例 #27
0
ファイル: test_callbacks.py プロジェクト: rlan/ray
def test_TBX(ray_start_4_cpus, make_temp_dir):
    """Run a reporting function with ``TBXLoggerCallback`` and validate it.

    The function reports three times; the last report also carries a list
    and a nested dict. ``_validate_tbx_result`` inspects the temp directory
    afterwards.
    """
    config = TestConfig()

    log_dir = make_temp_dir
    num_workers = 4

    def train_func():
        sgd.report(episode_reward_mean=4)
        sgd.report(episode_reward_mean=5)
        # Final report mixes a scalar with non-scalar values.
        sgd.report(
            episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    tbx_logger = TBXLoggerCallback(log_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[tbx_logger])

    _validate_tbx_result(log_dir)
コード例 #28
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_mismatch_checkpoint(ray_start_2_cpus):
    """Expect a RuntimeError when checkpoint counts differ between functions.

    ``train`` saves two checkpoints while ``train_mismatch`` saves only one.
    The latter is handed to ``gen_new_backend_executor``, and the class that
    helper returns replaces ``BackendExecutor`` in the trainer module.
    """
    test_config = TestConfig()

    def train():
        for epoch in range(2):
            sgd.save_checkpoint(epoch=epoch)

    def train_mismatch():
        # Saves a single checkpoint, one fewer than ``train``.
        sgd.save_checkpoint(epoch=0)

    patched_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
                      patched_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train)
コード例 #29
0
ファイル: test_trainer.py プロジェクト: enothereska/ray
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    """Check that workers reserve and release ``resources_per_worker``.

    While the trainer is running, the available amount of ``resource`` must
    drop by ``num_workers * num_requested``; after shutdown it must return
    to its original value.
    """
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)

    def currently_available():
        # A resource missing from the view is counted as zero.
        return ray.available_resources().get(resource, 0)

    trainer = Trainer(
        config,
        num_workers=num_workers,
        use_gpu=(resource == "GPU"),
        resources_per_worker={resource: num_requested})

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(lambda: currently_available() == expected)

    trainer.shutdown()
    wait_for_condition(lambda: currently_available() == original)
コード例 #30
0
def main():
    """Parse CLI arguments and run ``train_func`` locally or via a Trainer.

    A Torch ``Trainer`` is used when any of ``start_local``, ``address``,
    ``num_workers > 1`` or ``use_gpu`` is set on the parsed arguments;
    otherwise ``train_func`` is called directly in this process. In both
    cases the training function receives ``{"args": args}`` as its config.
    """
    args = parse_args()
    config = {"args": args}

    use_trainer = (args.start_local or args.address or args.num_workers > 1
                   or args.use_gpu)
    if not use_trainer:
        # Run training locally.
        train_func(config)
        return

    if args.start_local:
        # Start a local Ray runtime.
        ray.init(num_cpus=args.num_workers)
    else:
        # Connect to a Ray cluster for distributed training.
        ray.init(address=args.address)

    trainer = Trainer("torch",
                      num_workers=args.num_workers,
                      use_gpu=args.use_gpu)
    trainer.start()
    trainer.run(train_func, config)
    # Release the workers once training has finished. This is deliberately
    # not in a ``finally`` block: NOTE(review) a failed run may already have
    # shut the trainer down, and a second shutdown could mask the original
    # error - confirm against the Trainer implementation.
    trainer.shutdown()