Example #1
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    # Configure the search space (random search)
    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    # ASHAScheduler terminates poorly performing trials early.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )

    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss","accuracy", "training_iteration"]
    )
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )

    best_trial = result.get_best_trial("loss","min","last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]
    ))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    # Build the model with the best trial's hyperparameters
    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)

    best_trained_model.to(device)


    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"
    ))
    # Load the model parameters
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #2
def run_exp(design_or_test,
            config,
            n_samples,
            p_early,
            p_scheduler,
            exp_dir,
            chk_score_attr,
            log_params,
            gpus=[],
            gpu_threshold=None):
    
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    
    config['wdir'] = os.getcwd()
    config['gpu_ids'] = gpus
    config['gpu_threshold'] = gpu_threshold
    early_stopping = TrialNoImprovementStopper(metric=p_early['metric'], 
                                               mode=p_early['mode'], 
                                               patience_threshold=p_early['patience'])
    if p_scheduler is not None:
        scheduler = ASHAScheduler(
            metric=p_scheduler['metric'],
            mode=p_scheduler['mode'],
            max_t=p_scheduler['max_t'],
            grace_period=p_scheduler['grace'],
            reduction_factor=p_scheduler['reduction']
        ) 
    else:
        scheduler = None
    
    resources = {'cpu': 1, 'gpu': 0.0001}  # tiny GPU fraction so many trials can share one GPU
    reporter = tune.CLIReporter(metric_columns={
                                    'training_iteration': '#Iter',
                                    'tr_loss': 'TR-Loss',
                                    'tr_score': 'TR-Score',
                                    'vl_loss': 'VL-Loss', 
                                    'vl_score': 'VL-Score', 
                                    'rank_score': 'Rank Score',
                                },
                                parameter_columns=log_params,
                                infer_limit=3,
                                metric='rank_score',
                                mode='max')
    return tune.run(
        GPUTrainable,
        name=design_or_test,
        stop=early_stopping,
        local_dir=exp_dir,
        config=config,
        num_samples=n_samples,
        resources_per_trial=resources,
        keep_checkpoints_num=1,
        checkpoint_score_attr=chk_score_attr,
        checkpoint_freq=1,
        max_failures=5,
        progress_reporter=reporter,
        scheduler=scheduler,
        verbose=1
    )
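`TrialNoImprovementStopper` is project-specific and not part of Ray; one plausible implementation against Ray's `tune.Stopper` interface (a sketch, the real class may differ):

from ray.tune import Stopper

class TrialNoImprovementStopper(Stopper):
    """Stop a trial once its metric has not improved for `patience_threshold` results."""

    def __init__(self, metric, mode="min", patience_threshold=10):
        self._metric = metric
        self._mode = mode
        self._patience = patience_threshold
        self._best = {}    # trial_id -> best metric value seen so far
        self._stale = {}   # trial_id -> results since last improvement

    def __call__(self, trial_id, result):
        value = result[self._metric]
        best = self._best.get(trial_id)
        improved = (best is None or
                    (value < best if self._mode == "min" else value > best))
        if improved:
            self._best[trial_id] = value
            self._stale[trial_id] = 0
        else:
            self._stale[trial_id] += 1
        return self._stale[trial_id] >= self._patience

    def stop_all(self):
        return False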
Example #3
def tune_xgboost(use_cv: bool = False):
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=10,  # 10 training iterations
        grace_period=1,
        reduction_factor=2
    )

    analysis = tune.run(
        train_breast_cancer if not use_cv else train_breast_cancer_cv,
        metric="test-logloss",
        mode="min",
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial={"cpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=scheduler,
    )

    return analysis
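`train_breast_cancer` follows Ray's XGBoost tutorial; a minimal sketch showing where the `test-logloss` metric comes from (an eval set named "test"):

import xgboost as xgb
from ray import tune
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def train_breast_cancer(config):
    data, labels = load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    results = {}
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "test")],  # metric keys become "test-logloss", "test-error"
        evals_result=results,
        verbose_eval=False,
    )
    tune.report(**{
        "test-logloss": results["test"]["logloss"][-1],
        "test-error": results["test"]["error"][-1],
    })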
Example #4
def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_mxnet, mnist=mnist_data, num_epochs=num_epochs
        ),
        resources_per_trial={
            "cpu": 1,
        },
        metric="mean_accuracy",
        mode="max",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        name="tune_mnist_mxnet",
    )
    return analysis
Example #5
def tune_mnist_asha():
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "data_dir": data_dir
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=10,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])
    tune.run(
        train_mnist_tune,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter)
    shutil.rmtree(data_dir)
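`train_mnist_tune` is assumed from Ray's PyTorch Lightning tutorial; the key point is that a `TuneReportCallback` maps Lightning's logged metrics onto the `loss`/`mean_accuracy` keys the scheduler and reporter expect. A hedged sketch:

import pytorch_lightning as pl
from ray.tune.integration.pytorch_lightning import TuneReportCallback

def train_mnist_tune(config):
    model = LightningMNISTClassifier(config, config["data_dir"])
    trainer = pl.Trainer(
        max_epochs=10,
        # Report Lightning's val_loss/val_accuracy to Tune as "loss"/"mean_accuracy".
        callbacks=[TuneReportCallback(
            {"loss": "val_loss", "mean_accuracy": "val_accuracy"},
            on="validation_end")],
    )
    trainer.fit(model)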
Example #6
def main(loss_function="L1", num_samples=25, max_num_epochs=25, gpus_per_trial=1, cpus_per_trial=10):

    experiment_name = loss_function + "_shuffle_validation"
    save_dir = '/data/results/vcpujol/transformers/single_deployment/predict_maxmax/pytorch_transformer/'

    config = {
        "lr": tune.loguniform(1e-4, 5e-1),
        "lr_step": tune.randint(1, 10),
        "gamma": tune.loguniform(0.85, 0.9999),
        "epochs": tune.choice([5, 10, 15, 20, 25]),
        "n_heads": tune.randint(2, 10),
        "dim_val": tune.choice([2, 4, 6]),  # FIXME: must be an even number
        "dim_att": tune.randint(2, 12),
        "encoder_layers": tune.randint(1, 7),
        "decoder_layers": tune.randint(1, 7),
        "batch_size": tune.randint(1, 10),
        "input_feat_enc": tune.choice([94]),
        "input_feat_dec": tune.choice([1]),
        "seq_len": tune.choice([16, 32, 64, 96, 128, 180, 220, 256, 312, 350, 420, 470, 512]),  # alternative: [16, 32, 64, 128, 256, 512, 1024, 2048]
        "prediction_step": tune.choice([1])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=4,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "lr_step", "gamma", "epochs", "n_heads", "dim_val", "dim_att", "encoder_layers",
                           "decoder_layers", "batch_size", "seq_len"],
        metric_columns=["loss", "training_iteration"])
    result = tune.run(
        partial(transformer_train, save_dir=save_dir, loss_function=loss_function),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        local_dir=save_dir,
        name=experiment_name)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    # print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))

    best_trained_model = Transformer(best_trial.config["dim_val"], best_trial.config["dim_att"],
                                     best_trial.config["input_feat_enc"], best_trial.config["input_feat_dec"],
                                     best_trial.config["seq_len"], best_trial.config["decoder_layers"],
                                     best_trial.config["encoder_layers"], best_trial.config["n_heads"])

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    local_dir = save_dir
    exp_name = experiment_name
    test_acc = test_transformer(best_trained_model, best_trial.config, local_dir, exp_name, loss_function)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #7
def main(num_samples=10, max_num_epochs=15):
    # data_dir = os.path.abspath("./data")
    configs = load_configs("configs/training/smg_configs_template.yaml")
    configs["processor"]["torch_model_dict"]["hidden_sizes"] = tune.choice(
        [[90, 45, 25, 10], [45, 25], [45, 25], [90, 45, 25, 10]])
    dataloaders, amplification, data_info_dict = load_data(
        configs)  # Download data for all trials before starting the run

    scheduler = ASHAScheduler(max_t=max_num_epochs,
                              grace_period=1,
                              reduction_factor=2)
    result = tune.run(
        tune.with_parameters(generate_surrogate_model, configs=configs),
        resources_per_trial={
            "cpu": 8,
            "gpu": 1
        },
        config=configs,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
Example #8
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0, data_dir="~/data"):
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    train_fn_with_parameters = tune.with_parameters(train_mnist_tune,
                                                    num_epochs=num_epochs,
                                                    num_gpus=gpus_per_trial,
                                                    data_dir=data_dir)
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    analysis = tune.run(train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #9
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    result = tune.run(
        tune.with_parameters(train_cifar),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    if ray.util.client.ray.is_connected():
        # If using Ray Client, we want to make sure checkpoint access
        # happens on the server. So we wrap `test_best_model` in a Ray task.
        ray.get(ray.remote(test_best_model).remote(best_trial))
    else:
        test_best_model(best_trial)
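`test_best_model` is defined elsewhere; a sketch of what it presumably does, reusing the checkpoint layout from Example #1 (`Net` and `test_accuracy` are assumed from the tutorial):

import os

import torch

def test_best_model(best_trial):
    # Rebuild the model from the winning config and score it on the test set.
    model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
    model_state, _optimizer_state = torch.load(checkpoint_path)
    model.load_state_dict(model_state)
    model.to(device)
    print("Best trial test set accuracy:", test_accuracy(model, device))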
Example #10
def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128])
    }

    scheduler = ASHAScheduler(metric="mean_accuracy",
                              mode="max",
                              max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(partial(train_mnist_mxnet,
                     mnist=mnist_data,
                     num_epochs=num_epochs),
             resources_per_trial={
                 "cpu": 1,
             },
             config=config,
             num_samples=num_samples,
             scheduler=scheduler,
             progress_reporter=reporter,
             name="tune_mnist_mxnet")
Example #11
def tune_mnist():
    sched = ASHAScheduler(time_attr="training_iteration")

    bayesopt = BayesOptSearch()

    metric = "mean_accuracy"

    analysis = tune.run(
        train_mnist,
        name="foo",
        scheduler=sched,
        search_alg=bayesopt,
        metric=metric,
        mode="max",
        #stop={
        #    "mean_accuracy": 0.99,
        #    "training_iteration": num_training_iterations
        #},
        num_samples=50,
        resources_per_trial={
            "cpu": 1,
            "gpu": 0
        },
        config={
            "dropout": tune.uniform(0.05, 0.5),
            "lr": tune.uniform(0.001, 0.1),
            "momentum": tune.uniform(0.1, 0.9),
            "hidden": tune.uniform(32, 512),
        })
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best value for", metric, ':', analysis.best_result[metric])
Example #12
def resolve_early_stopping(early_stopping, max_iters, metric_name):
    if isinstance(early_stopping, str):
        if early_stopping in TuneBaseSearchCV.defined_schedulers:
            if early_stopping == "PopulationBasedTraining":
                return PopulationBasedTraining(metric=metric_name, mode="max")
            elif early_stopping == "AsyncHyperBandScheduler":
                return AsyncHyperBandScheduler(metric=metric_name,
                                               mode="max",
                                               max_t=max_iters)
            elif early_stopping == "HyperBandScheduler":
                return HyperBandScheduler(metric=metric_name,
                                          mode="max",
                                          max_t=max_iters)
            elif early_stopping == "MedianStoppingRule":
                return MedianStoppingRule(metric=metric_name, mode="max")
            elif early_stopping == "ASHAScheduler":
                return ASHAScheduler(metric=metric_name,
                                     mode="max",
                                     max_t=max_iters)
        raise ValueError(
            "{} is not a defined scheduler. "
            "Check the list of available schedulers.".format(early_stopping))
    elif isinstance(early_stopping, TrialScheduler):
        early_stopping._metric = metric_name
        early_stopping._mode = "max"
        return early_stopping
    else:
        raise TypeError("`early_stopping` must be a str, boolean, "
                        f"or tune scheduler. Got {type(early_stopping)}.")
Example #13
def grid_search(hparams):
    scheduler = ASHAScheduler(max_t=hparams['n_epochs'],
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=hparams['param_cols'],
        metric_columns=['valid_acc', 'valid_f1', 'valid_loss'])

    rdm = RetinalDataModule()

    analysis = tune.run(tune.with_parameters(train_tune, rdm=rdm),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": 1
                        },
                        metric="valid_loss",
                        mode="min",
                        config=hparams,
                        local_dir=Path(hparams['output_dir'], 'ray_tune'),
                        num_samples=5,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=f"tune_{hparams['model']}_DRIVE")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #14
def main_tune(base_args):
    # ray.init(log_to_driver=False)
    tune_config = {
        "learning_rate": tune.loguniform(5e-6, 1e-3),
        "weight_decay": tune.choice([0.0, 1e-3, 1e-2, 0.1]),
        "batch_size": tune.choice([16, 32, 64, 128]),
        "latent_dim": tune.choice([2, 3, 8, 16, 32, 128, 256, 512])
    }

    scheduler = ASHAScheduler(max_t=base_args.max_tune_epoches,
                              grace_period=3,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "learning_rate", "weight_decay", "batch_size", "latent_dim"
        ],
        metric_columns=[
            "val_lossR", "loss", "Reconstruction_Loss", "training_iteration"
        ])

    analysis = tune.run(tune.with_parameters(tune_train, base_arg=base_args),
                        resources_per_trial={
                            "cpu": 12,
                            "gpu": 1.0,
                        },
                        metric="val_lossR",
                        mode="min",
                        config=tune_config,
                        num_samples=10,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name="tune_vae_chol")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #15
def tune_from_existing(start_model,
                       start_config,
                       num_samples=10,
                       num_epochs=10,
                       gpus_per_trial=0.0,
                       day=0):
    data_interface = MNISTDataInterface("/tmp/mnist_data", max_days=10)
    num_examples = data_interface._get_day_slice(
        day) - data_interface._get_day_slice(day - 1)

    config = start_config.copy()
    config.update({
        "batch_size": tune.choice([16, 32, 64]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    })

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=["lr", "momentum", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"],
    )

    analysis = tune.run(
        partial(
            train_mnist,
            start_model=start_model,
            data_fn=data_interface.get_incremental_data,
            num_epochs=num_epochs,
            use_gpus=gpus_per_trial > 0,
            day=day,
        ),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=0,
        name="tune_serve_mnist_fromsexisting",
    )

    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_accuracy = best_trial.metric_analysis["mean_accuracy"]["last"]
    best_trial_config = best_trial.config
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
Example #16
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--X_dir", type=str)
    parser.add_argument("--y_dir", type=str)
    parser.add_argument("--epoch", type=int)
    parser.add_argument("--config_dir", type=str)
    parser.add_argument("--model", type=str)
    parser.add_argument("--n_sample", type=int)

    args = parser.parse_args()

    X_train = torch.load(args.X_dir)
    y_train = torch.load(args.y_dir)

    config = {
        "n_hidden": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([16, 32, 64, 128]),
    }

    CL = CustomLoss(1, 2)
    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=args.epoch,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    def train_func(config):
        train_model(
            X=X_train,
            y=y_train,
            num_epochs=args.epoch,
            loss_func=CL.custom_loss_1,
            model_name=args.model,
            config=config,
        )

    result = tune.run(
        train_func,
        resources_per_trial={
            "cpu": 2,
            "gpu": 2
        },
        config=config,
        num_samples=args.n_sample,
        scheduler=scheduler,
        progress_reporter=reporter,
    )
    best_trial = result.get_best_trial("loss", "min", "last")

    with open(args.config_dir, "w") as json_file:
        json.dump(best_trial.last_result["config"], json_file)

    last_loss = best_trial.last_result["loss"]
    print(f"Validation Loss of best model was {last_loss}.")
Example #17
def tune4_withLabel(
    model,
    train_set: Dataset,
    val_set: Dataset,
    dims: list,
    config: dict,
    EPOCHS: int = 300,
    extra_feature_len: int = 0,
    extra_feature_len2: int = 0,
    n_gpu=1,
    n_samples=20,
    model_name="model",
):

    dim1, dim2, dim3, dim4 = dims[0], dims[1], dims[2], dims[3]

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "hidden_dim"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(tune.with_parameters(
        train4_withLabel,
        model=model,
        dim1=dim1,
        dim2=dim2,
        dim3=dim3,
        dim4=dim4,
        extra_feature_len=extra_feature_len,
        extra_feature_len2=extra_feature_len2,
        train_set=train_set,
        val_set=val_set,
        num_epochs=EPOCHS,
        num_gpus=n_gpu,
        model_name=model_name),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": n_gpu
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=n_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=model_name,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
Example #18
def hyperparameter_tuning_initializer(loss_type='SL', learning_rate_scheduler='CARM'):
    
    # defining the hyperparameters
    if loss_type == 'FL':
        config = {
            'gamma': tune.choice([0.5, 1, 2]),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDIL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'inverse_dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'SL':
        config = {
            'lambda': tune.uniform(0, 1),
            'tau': tune.uniform(0.02, 0.04),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'VCE':
        config = {
            'var_loss': tune.uniform(0.5, 5.5),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")

    # hyperparameters for learning rate scheduler
    if learning_rate_scheduler == 'CARM':
        config['T_0'] = tune.choice([5, 10, 20, 40, 50])
        config['eta_min_factor'] = tune.loguniform(1e2, 1e4)
    if learning_rate_scheduler == 'SLR':
        config['step_size'] = tune.choice([5, 10, 20, 40, 50])
    if learning_rate_scheduler == 'MLR':
        config['lr_lambda'] = tune.uniform(0.8, 0.99)

        
    # defining the scheduler
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=conf['max_epochs'] // 20,
        grace_period=1,
        reduction_factor=2
    )
    
    # defining the reporter
    reporter = CLIReporter(metric_columns=['loss', 'avg_dice_coefficient', 'epoch'])
    
    return config, scheduler, reporter
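A possible call site, assuming a trainable `train_fn` that reports the `loss` and `avg_dice_coefficient` metrics named above (the trainable itself is hypothetical):

config, scheduler, reporter = hyperparameter_tuning_initializer(
    loss_type='SL', learning_rate_scheduler='CARM')
analysis = tune.run(
    train_fn,
    config=config,
    scheduler=scheduler,
    progress_reporter=reporter,
    num_samples=20,
)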
Example #19
def ray_tune_interface(data_settings, cross_validation_settings,
                       parameter_settings, model_settings):
    cat_features = data_settings['cat_features']
    weights = data_settings['weights']
    data = data_settings['data']
    target = data_settings['target']

    model = model_settings.pop('model', None)

    n_folds = cross_validation_settings['n_folds']
    metric_to_use = cross_validation_settings['metric_to_use']
    mode = cross_validation_settings['mode']

    n_samples = parameter_settings['tune_parameters']['n_samples']
    num_threads = parameter_settings['tune_parameters']['num_threads']
    max_concurrent = parameter_settings['tune_parameters']['max_concurrent']
    space = ({
        k: var_type_sample(k, v, parameter_settings['parameters_types'])
        for k, v in parameter_settings['parameter_range'].items()
    })
    algo = HyperOptSearch(
        space,
        max_concurrent=max_concurrent,
        metric="metric_ave",
        mode=mode,
    )
    scheduler = ASHAScheduler(metric="metric_ave", mode=mode)
    config = {
        'num_samples': n_samples,
        'config': {
            'non_hp': {
                'data': {
                    'X': data,
                    'y': target,
                    'cat_features': cat_features,
                    'weights': weights,
                },
                'modelIns': model,
                'n_folds': n_folds,
                'metric_to_use': metric_to_use,
            }
        }
    }
    config['config'].update(parameter_settings)
    config['config']['model_settings'] = model_settings
    ray_experiment = tune.run(cross_validation_catboost_ray,
                              resources_per_trial={"gpu": 1},
                              search_alg=algo,
                              scheduler=scheduler,
                              keep_checkpoints_num=0,
                              verbose=1,
                              **config)
    results = ray_experiment.dataframe(metric="metric_ave", mode=mode)
    results.to_csv('results_experiment.csv')
Example #20
def start_training(name):
    Epochs = 1000
    Samples = 50
    ModelName = name

    pose_autoencoder = MLP_withLabel.load_checkpoint(
        "/home/nuoc/Documents/MEX/models/MLP4_withLabel_best/M3/0.00324857.512.pbz2"
    )
    # pose_autoencoder = MLP_withLabel.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP_withLabel/0.0013522337.512.pbz2")

    pose_encoder_out_dim = pose_autoencoder.dimensions[-1]

    scheduler = ASHAScheduler(max_t=Epochs,
                              grace_period=15,
                              reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=1)

    analysis = tune.run(tune.with_parameters(
        tuning,
        MODEL=MotionGenerationModel,
        pose_autoencoder=pose_autoencoder,
        cost_dim=cost_dim,
        phase_dim=phase_dim,
        input_slices=[phase_dim, pose_dim, cost_dim],
        output_slices=[phase_dim, phase_dim, pose_encoder_out_dim],
        train_set=train_set,
        val_set=val_set,
        num_epochs=Epochs,
        model_name=ModelName),
                        resources_per_trial={
                            "cpu": 2,
                            "gpu": 1
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=Samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=ModelName,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)

    ray.shutdown()
Example #21
    def start(self, config: dict = None, **kwargs):
        if config is None:
            raise ValueError("Argument 'config' must not be None")

        # Save results under the home directory: Tune trial paths can become very
        # long and may otherwise exceed the OS path-length limit.
        results_dir = (os.environ.get("TEST_TMPDIR")
                       or os.environ.get("TUNE_RESULT_DIR")
                       or os.path.expanduser("~/ray_results"))

        tune_log_path = os.path.join(results_dir, self.session_id)
        os.makedirs(tune_log_path, exist_ok=True)
        os.makedirs(self.out_path, exist_ok=True)
        metric = kwargs.get("metric", "mean_accuracy")
        mode = kwargs.get("mode", "max")

        num_gpus = kwargs.get("num_gpus", 1)
        num_cpus = kwargs.get("num_cpus", 1)

        ray.init(include_dashboard=False,
                 local_mode=True,
                 num_gpus=num_gpus,
                 num_cpus=num_cpus)

        tensorboard_url = launch_tensorboard(tune_log_path)
        print(f"TensorBoard launched: {tensorboard_url}.")

        scheduler = kwargs.get("scheduler", None)
        if scheduler is None:
            scheduler = ASHAScheduler(metric=metric, mode=mode)
        analysis = tune.run(self._start,
                            name=self.session_id,
                            config=config,
                            scheduler=scheduler,
                            checkpoint_freq=20,
                            local_dir=results_dir)
        result = analysis.get_best_trial(metric, mode)
        print("Best trial config: {}".format(result.config))
        print("Best trial final validation loss: {}".format(
            result.last_result["mean_loss"]))
        print("Best trial final validation accuracy: {}".format(
            result.last_result["mean_accuracy"]))

        # TODO search algorithm?
        df = analysis.dataframe(metric, mode)
        df.to_pickle(os.path.join(self.out_path, "results.pkl"))

        try:
            with pd.ExcelWriter(os.path.join(self.out_path,
                                             "results.xlsx")) as writer:
                df.to_excel(writer)
        except ModuleNotFoundError as e:
            print("Failed to write tuning result:", e)
Example #22
def test_custom_scheduler_default_time_attr():
    try:
        from ray.tune.schedulers import ASHAScheduler
    except ImportError:
        print("skip the test as ray tune cannot be imported.")
        return
    my_scheduler = ASHAScheduler(max_t=10)
    best_config = test_scheduler(scheduler=my_scheduler)
    print(
        "Custom ASHA scheduler (with ASHA default time attr), test error:",
        abs(10 / 2 - best_config["z"] / 2),
    )
Example #23
def main(cmdl):
    base_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "default.yaml"))
    search_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "search.yaml"))

    print(config_to_string(cmdl))
    print(config_to_string(dict_to_namespace(search_cfg)))

    # the search space
    good_init, search_space = get_search_space(search_cfg)

    search_name = "{timestep}_tune_{experiment_name}{dev}".format(
        timestep="{:%Y%b%d-%H%M%S}".format(datetime.now()),
        experiment_name=base_cfg["experiment"],
        dev="_dev" if cmdl.dev else "",
    )

    # search algorithm
    hyperopt_search = HyperOptSearch(
        search_space,
        metric="criterion",
        mode="max",
        max_concurrent=cmdl.workers,
        points_to_evaluate=good_init,
    )

    # early stopping
    scheduler = ASHAScheduler(
        time_attr="train_step",
        metric="criterion",
        mode="max",
        max_t=base_cfg["training_steps"],  # max length of the experiment
        grace_period=cmdl.grace_steps,  # minimum logged steps before a trial can be stopped
        brackets=3,  # number of ASHA brackets (each bracket uses a different halving rate)
    )

    analysis = tune.run(
        lambda x: tune_trial(x, base_cfg=base_cfg, get_objective=None),
        name=search_name,
        # config=search_space,
        search_alg=hyperopt_search,
        scheduler=scheduler,
        local_dir="./results",
        num_samples=cmdl.trials,
        trial_name_creator=trial2string,
        resources_per_trial={"cpu": 3},
    )

    dfs = analysis.trial_dataframes
    for i, (key, df) in enumerate(dfs.items()):
        print("saving: ", key)
        df.to_pickle(f"{key}/trial_df.pkl")
Example #24
def main(num_samples=10, max_num_epochs=50, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    checkpoint_dir = os.path.abspath("./checkpoints")
    config = {
        "l1": tune.choice([32, 64, 128, 256]),
        "l2": tune.choice([32, 64, 128, 256]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256])
    }

    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=5,
                              reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    st_time = time.time()
    result = tune.run(partial(train_regression,
                              checkpoint_dir=checkpoint_dir,
                              data_dir=data_dir),
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config=config,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print("Total Time", st_time - time.time())
Example #25
def main():
    logging.basicConfig(level=logging.INFO)

    # Raylib parameters
    num_samples = 10
    envname = 'AdversarialAntBulletEnv-v0'
    trainingconfig = Path.cwd() / 'trainingconfig.json'
    evaluate_mean_n = 1000  # Number of timesteps over which to evaluate the mean reward
    name_fmt = 'million-bucks_{adv_force}'

    config = {
        # TODO: sample from control once, then different adversarial strengths
        # Range is centered on the force that achieves the closest reward to the control (7.5)
        "adv_force": tune.qrandn(7.5, 2.5, 0.1),
    }

    # https://docs.ray.io/en/master/tune/tutorials/overview.html#which-search-algorithm-scheduler-should-i-choose
    # Use BOHB for larger problems with a small number of hyperparameters
    # search = TuneBOHB(max_concurrent=4, metric="mean_loss", mode="min")
    # sched = HyperBandForBOHB(
    #     time_attr="training_iteration",
    #     max_t=100,
    # )

    # Implicitly use random search if search algo is not specified
    sched = ASHAScheduler(
        time_attr='training_iteration',
        max_t=100,
        grace_period=1,  # Unit is iterations, not timesteps. TODO configure
    )

    # Pass in a Trainable class or function to tune.run.
    local_dir = str(Path.cwd() / "ray")
    logging.info(f'{local_dir=}')
    anal = tune.run(tune.with_parameters(trainable,
                                         envname=envname,
                                         trainingconfig=trainingconfig,
                                         evaluate_mean_n=evaluate_mean_n,
                                         name_fmt=name_fmt),
                    config=config,
                    num_samples=num_samples,
                    scheduler=sched,
                    local_dir=local_dir,
                    metric="robustness",
                    mode="max",
                    log_to_file=True)
    logging.info(f'best config: {anal.best_config}')
    logging.info(f'best result: {anal.best_result}')
Example #26
def main(args):
    device = config_cuda(args.use_cuda)

    data_pth = Path(Path.cwd() / args.data_path)
    out_pth = Path(Path.cwd() / args.out_path)

    train_set, valid_set, test_set = get_datasets(data_pth)

    config = {
        'eta': tune.loguniform(1e-5, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2
    )
    reporter = CLIReporter(
        parameter_columns=['eta', 'batch_size'],
        metric_columns=['loss', 'accuracy', 'training_iteration']
    )
    num_gpus = 1 if device == 'cuda' else 0
    result = tune.run(
        partial(train_model, model_name=args.model, device=device,
                max_epochs=args.max_epochs, num_workers=args.num_workers,
                data_pth=data_pth),
        resources_per_trial={'cpu': args.num_workers, 'gpu': num_gpus},
        config=config,
        num_samples=args.num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )
    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f'Best trial config: {best_trial.config}')
    print(f'Best trial final validation loss: {best_trial.last_result["loss"]}')
    model = load_model(args.model, len(train_set.classes))

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, 'checkpoint'))
    model.load_state_dict(model_state)

    test_acc = test_model(model, device, test_set, out_pth, args.num_workers)
    print(f'Best trial test set accuracy: {test_acc}')
Example #27
def main(num_samples=10, max_num_epochs=10):
    data_dir = os.path.abspath("./data")
    trainset, _ = load_data(data_dir)
    class_names = trainset.classes
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-2),
        "batch_size": tune.choice([8, 16, 32])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )
    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"]
    )
    result = tune.run(
        partial(train_net, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": 1},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(len(class_names), best_trial.config["l1"], best_trial.config["l2"])
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #28
def setup_tune_scheduler():
    search_space = workload.create_search_space()

    scheduler = ASHAScheduler(
        # Set a large max_t so that ASHA always promotes to the next rung,
        # until something reaches the target accuracy.
        max_t=int(1000),
        reduction_factor=3,
        **workload.exp_metric(),
    )
    return dict(
        search_alg=VariantGenerator(),
        scheduler=scheduler,
        config=search_space,
        resources_per_trial=com.detect_baseline_resource(),
    )
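The returned dict is meant to be splatted straight into `tune.run`; a hypothetical call (the trainable name is assumed):

analysis = tune.run(my_trainable, num_samples=20, **setup_tune_scheduler())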
Example #29
def tune_model(data):
    logfile = open("/tmp/ray/session_latest/custom.log", "w")

    def write(msg):
        logfile.write(f"{msg}\n")
        logfile.flush()

    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    print("enabling aggressive early stopping of bad trials")
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=4,  # 4 training iterations
        grace_period=1,
        reduction_factor=2
    )

    print("Tuning")

    analysis = tune.run(
        tune.with_parameters(
            train_model,
            data=data
            # checkpoint_dir=f"{LOCAL_DIR}/checkpoints",
            # data_dir=f"{LOCAL_DIR}/data",
        ),
        metric="eval-logloss",
        mode="min",
        local_dir=LOCAL_DIR,
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial=RAY_PARAMS.get_tune_resources(),
        config=search_space,
        num_samples=4,
        scheduler=scheduler,
    )

    print("Done Tuning")

    return analysis
Example #30
def tunerTrain():
    ray.init(_memory=4000000000, num_cpus=5)
    searchSpace = {
        'lr': tune.loguniform(1e-4, 9e-1),
        'finalOutput': tune.randint(2, 50),  # minimum of 2; otherwise 1 // 2 = 0 activation maps
        'stride1': tune.grid_search(np.arange(1, 4).tolist()),
        'stride2': tune.grid_search(np.arange(1, 4).tolist()),
        'batchSize': tune.grid_search([2, 4, 8, 16, 32, 64, 128, 256]),
        'finalChannel': tune.randint(1, 50),
    }

    analysis = tune.run(train, num_samples=1, scheduler=ASHAScheduler(metric='score', mode='max'),
                        config=searchSpace)
    print(f"Best Config: {analysis.get_all_configs(metric='score', mode='max')}")
    df = analysis.results_df
    logdir = analysis.get_best_logdir("mean_accuracy", mode="max")
    print(f"dir of best: {logdir}")