def test_validation(ray_start_2_cpus):  # noqa: F811
    def bad_func(a, b, c):
        return 1

    t_cls = DistributedTrainableCreator(bad_func)
    with pytest.raises(ValueError):
        t_cls()
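The ValueError above comes from signature validation of the wrapped training function; bad_func takes three positional arguments. A minimal sketch of a function that would pass, assuming the usual Ray Tune function-trainable signature (a config dict plus an optional checkpoint_dir); the body is hypothetical:

def good_func(config, checkpoint_dir=None):
    from ray import tune
    for step in range(config.get("steps", 3)):
        # Report a metric so Tune records each training iteration.
        tune.report(mean_loss=1.0 / (step + 1))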
Example #2
def test_colocated(ray_4_node):  # noqa: F811
    assert ray.available_resources()["CPU"] == 4
    trainable_cls = DistributedTrainableCreator(
        _train_check_global, num_workers=4, num_workers_per_host=1)
    trainable = trainable_cls()
    assert ray.available_resources().get("CPU", 0) == 0
    trainable.train()
    trainable.stop()
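The ray_4_node fixture is not shown here; a hypothetical stand-in that produces the same starting condition (4 CPUs spread over 4 simulated nodes) could be built with ray.cluster_utils.Cluster:

import pytest
import ray
from ray.cluster_utils import Cluster

@pytest.fixture
def ray_4_node():
    # Simulate a 4-node cluster with 1 CPU per node.
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)
    yield
    ray.shutdown()
    cluster.shutdown()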
Example #3
def test_colocated_gpu_double(ray_4_node_gpu):  # noqa: F811
    assert ray.available_resources()["GPU"] == 8
    trainable_cls = DistributedTrainableCreator(_train_check_global,
                                                num_workers=8,
                                                num_gpus_per_worker=1,
                                                num_cpus_per_worker=1,
                                                num_workers_per_host=2)
    trainable = trainable_cls()
    assert ray.available_resources().get("GPU", 0) == 0
    trainable.train()
    trainable.stop()
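Here each of the 8 workers reserves 1 GPU and 1 CPU, with 2 workers colocated per host across the 4 GPU nodes, which is why all 8 GPUs are reported as consumed while the trainable is running.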
Example #4
        default=2,
        help="Sets number of workers for training.")
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="enables CUDA training")
    parser.add_argument(
        "--cluster",
        action="store_true",
        default=False,
        help="enables multi-node tuning")
    args = parser.parse_args()
    tf_trainable = DistributedTrainableCreator(
        train_mnist,
        use_gpu=args.use_gpu,
        num_workers=2,
    )

    sched = AsyncHyperBandScheduler(max_t=400, grace_period=20)

    analysis = tune.run(
        tf_trainable,
        name="exp",
        scheduler=sched,
        metric="mean_accuracy",
        mode="max",
        stop={
            "mean_accuracy": 0.99,
            "training_iteration": 10
        },
Example #5
                        default=False,
                        help="enables multi-node tuning")
    parser.add_argument("--smoke-test",
                        action="store_true",
                        default=False,
                        help="enables small scale testing")
    args = parser.parse_args()
    if args.cluster:
        options = dict(address="auto")
    else:
        options = dict(num_cpus=4)
    ray.init(**options)
    tf_trainable = DistributedTrainableCreator(
        train_mnist,
        num_workers=args.num_workers,
        num_workers_per_host=args.num_workers_per_host,
        num_cpus_per_worker=args.num_cpus_per_worker,
        num_gpus_per_worker=args.num_gpus_per_worker,
    )

    sched = AsyncHyperBandScheduler(max_t=400, grace_period=20)

    analysis = tune.run(tf_trainable,
                        name="exp",
                        scheduler=sched,
                        metric="mean_accuracy",
                        mode="max",
                        stop={
                            "mean_accuracy": 0.99,
                            "training_iteration": 10
                        },
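A self-contained sketch of the same pattern, assuming the TensorFlow integration module (ray.tune.integration.tensorflow) and a hypothetical train_mnist that simply reports mean_accuracy:

import ray
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.integration.tensorflow import DistributedTrainableCreator


def train_mnist(config, checkpoint_dir=None):
    # Stand-in training loop; a real example would build and fit a model here.
    for step in range(10):
        tune.report(mean_accuracy=min(0.1 * (step + 1), 1.0))


if __name__ == "__main__":
    ray.init(num_cpus=4)
    tf_trainable = DistributedTrainableCreator(train_mnist, num_workers=2)
    sched = AsyncHyperBandScheduler(max_t=400, grace_period=20)
    analysis = tune.run(
        tf_trainable,
        name="exp",
        scheduler=sched,
        metric="mean_accuracy",
        mode="max",
        stop={"mean_accuracy": 0.99, "training_iteration": 10},
        num_samples=2,
    )
    print("Best config:", analysis.best_config)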
Example #6
def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain,
            ntest, seeds):
    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    cfg = load_config(config)
    config_file_path = config

    if tune_result_dir is not None:
        os.environ["TUNE_RESULT_DIR"] = tune_result_dir
    else:
        if cfg["raytune"]["local_dir"] is None:
            raise TypeError(
                "Please specify a local_dir in the raytune section of the config file."
            )
        trd = cfg["raytune"]["local_dir"] + "/tune_result_dir"
        os.environ["TUNE_RESULT_DIR"] = trd

    expdir = Path(cfg["raytune"]["local_dir"]) / name
    expdir.mkdir(parents=True, exist_ok=True)
    # Copy the search space definition and the config file to the experiment dir for later reference
    shutil.copy("mlpf/raytune/search_space.py", str(expdir / "search_space.py"))
    shutil.copy(config_file_path, str(expdir / "config.yaml"))

    ray.tune.ray_trial_executor.DEFAULT_GET_TIMEOUT = 1 * 60 * 60  # Avoid timeout errors
    if not local:
        ray.init(address='auto')

    sched = get_raytune_schedule(cfg["raytune"])
    search_alg = get_raytune_search_alg(cfg["raytune"], seeds)

    distributed_trainable = DistributedTrainableCreator(
        partial(build_model_and_train,
                full_config=config_file_path,
                ntrain=ntrain,
                ntest=ntest,
                name=name,
                seeds=seeds),
        num_workers=1,  # Number of training workers per trial.
        num_cpus_per_worker=cpus,
        num_gpus_per_worker=gpus,
        num_workers_per_host=1,  # Number of workers to colocate per host (None if not specified).
        timeout_s=1 * 60 * 60,
    )

    sync_config = tune.SyncConfig(sync_to_driver=False)

    start = datetime.now()
    analysis = tune.run(
        distributed_trainable,
        config=search_space,
        name=name,
        scheduler=sched,
        search_alg=search_alg,
        num_samples=raytune_num_samples,
        local_dir=cfg["raytune"]["local_dir"],
        callbacks=[TBXLoggerCallback()],
        log_to_file=True,
        resume=resume,
        max_failures=2,
        sync_config=sync_config,
    )
    end = datetime.now()
    print("Total time of tune.run(...): {}".format(end - start))

    print(
        "Best hyperparameters found according to {} were: ".format(
            cfg["raytune"]["default_metric"]),
        analysis.get_best_config(cfg["raytune"]["default_metric"],
                                 cfg["raytune"]["default_mode"]))

    skip = 20
    if skip > cfg["setup"]["num_epochs"]:
        skip = 0
    analysis.default_metric = cfg["raytune"]["default_metric"]
    analysis.default_mode = cfg["raytune"]["default_mode"]
    plot_ray_analysis(analysis, save=True, skip=skip)
    topk_summary_plot_v2(analysis,
                         k=5,
                         save_dir=Path(analysis.get_best_logdir()).parent)
    summarize_top_k(analysis,
                    k=5,
                    save_dir=Path(analysis.get_best_logdir()).parent)

    best_params = analysis.get_best_config(cfg["raytune"]["default_metric"],
                                           cfg["raytune"]["default_mode"])
    with open(
            Path(analysis.get_best_logdir()).parent / "best_parameters.txt",
            "a") as best_params_file:
        best_params_file.write("Best hyperparameters according to {}\n".format(
            cfg["raytune"]["default_metric"]))
        for key, val in best_params.items():
            best_params_file.write("{}: {}\n".format(key, val))

    with open(Path(analysis.get_best_logdir()).parent / "time.txt",
              "a") as timefile:
        timefile.write(str(end - start) + "\n")

    num_skipped = count_skipped_configurations(analysis.get_best_logdir())
    print("Number of skipped configurations: {}".format(num_skipped))
Example #7
def test_step_after_completion(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    with pytest.raises(RuntimeError):
        for i in range(10):
            trainer.train()
Example #8
def test_single_step(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    trainer = trainable_cls()
    trainer.train()
    trainer.stop()
Example #9
def test_validate_session(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_validate_session)
    tune.run(trainable_cls)