Example #1
def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128])
    }

    scheduler = ASHAScheduler(metric="mean_accuracy",
                              mode="max",
                              max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(partial(train_mnist_mxnet,
                     mnist=mnist_data,
                     num_epochs=num_epochs),
             resources_per_trial={
                 "cpu": 1,
             },
             config=config,
             num_samples=num_samples,
             scheduler=scheduler,
             progress_reporter=reporter,
             name="tune_mnist_mxnet")
Example #2
def grid_search(hparams):
    scheduler = ASHAScheduler(max_t=hparams['n_epochs'],
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=hparams['param_cols'],
        metric_columns=['valid_acc', 'valid_f1', 'valid_loss'])

    rdm = RetinalDataModule()

    analysis = tune.run(tune.with_parameters(train_tune, rdm=rdm),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": 1
                        },
                        metric="valid_loss",
                        mode="min",
                        config=hparams,
                        local_dir=Path(hparams['output_dir'], 'ray_tune'),
                        num_samples=5,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=f"tune_{hparams['model']}_DRIVE")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #3
def tune_mnist_pbt():
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": 1e-3,
        "batch_size": 64,
        "data_dir": data_dir
    }
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=4,
        hyperparam_mutations={
            "lr": lambda: tune.loguniform(1e-4, 1e-1).func(None),
            "batch_size": [32, 64, 128]
        })
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])
    tune.run(
        train_mnist_tune_checkpoint,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter)
    shutil.rmtree(data_dir)
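For PopulationBasedTraining to clone and perturb trials, train_mnist_tune_checkpoint must save checkpoints and be able to resume from one. The real trainable is not shown; below is a hedged sketch of that contract using PyTorch Lightning and Ray's Lightning integration (the "ptl/val_loss" / "ptl/val_accuracy" metric names and the LightningMNISTClassifier constructor are assumptions):

import os
import pytorch_lightning as pl
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback

def train_mnist_tune_checkpoint(config, checkpoint_dir=None, num_epochs=10):
    trainer_kwargs = {
        "max_epochs": num_epochs,
        "progress_bar_refresh_rate": 0,
        "callbacks": [
            # Report "loss"/"mean_accuracy" to Tune and write a checkpoint at the
            # end of every validation epoch.
            TuneReportCheckpointCallback(
                metrics={"loss": "ptl/val_loss", "mean_accuracy": "ptl/val_accuracy"},
                filename="checkpoint",
                on="validation_end")
        ],
    }
    if checkpoint_dir:
        # Resume from the checkpoint that PBT hands to an exploited trial.
        trainer_kwargs["resume_from_checkpoint"] = os.path.join(checkpoint_dir, "checkpoint")
    model = LightningMNISTClassifier(config, config["data_dir"])
    pl.Trainer(**trainer_kwargs).fit(model)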
Example #4
def tune_mnist_asha():
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "data_dir": data_dir
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=10,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])
    tune.run(
        train_mnist_tune,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter)
    shutil.rmtree(data_dir)
Example #5
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    # Configure the search space
    # Random search
    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    # ASHAScheduler will terminate poorly performing trials early.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )

    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss","accuracy", "training_iteration"]
    )
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )

    best_trial = result.get_best_trial("loss","min","last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]
    ))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    # Set the model hyperparameters
    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)

    best_trained_model.to(device)


    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"
    ))
    # Load the model parameters
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #6
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0, data_dir="~/data"):
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    train_fn_with_parameters = tune.with_parameters(train_mnist_tune,
                                                    num_epochs=num_epochs,
                                                    num_gpus=gpus_per_trial,
                                                    data_dir=data_dir)
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    analysis = tune.run(train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #7
def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    # add parameters to tune using grid or random search
    config["lr"] = tune.loguniform(0.0001, 0.01)
    # define scheduler
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            "lr": tune.loguniform(0.000001, 0.01),
        },
    )
    # ray init
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )
    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "act_lr": "act_lr",
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )
    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={
            "cpu": 8,
            "gpu": 1
        },
        config=config,
        stop={"epochs": 12},
        # time_budget_s=28200,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=8,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae",
                                 mode="min",
                                 scope="last"),
    )
Example #8
def main(loss_function="L1", num_samples=25, max_num_epochs=25, gpus_per_trial=1, cpus_per_trial=10):

    experiment_name = loss_function + "_shuffle_validation"
    save_dir = '/data/results/vcpujol/transformers/single_deployment/predict_maxmax/pytorch_transformer/'

    config = {
        "lr": tune.loguniform(1e-4, 5e-1),
        "lr_step": tune.randint(1,10),
        "gamma": tune.loguniform(0.85,0.9999),
        "epochs": tune.choice([5, 10, 15, 20, 25]),
        "n_heads": tune.randint(2,10),
        "dim_val": tune.choice([2,4,6]), # FIXME requires numero parell...
        "dim_att": tune.randint(2,12),
        "encoder_layers": tune.randint(1,7),
        "decoder_layers": tune.randint(1,7),
        "batch_size": tune.randint(1,10),
        "input_feat_enc": tune.choice([94]),
        "input_feat_dec": tune.choice([1]),
        "seq_len": tune.choice([16, 32, 64, 96, 128, 180, 220, 256, 312, 350, 420, 470, 512]),  #[16, 32, 64, 128, 256, 512, 1024, 2048]
        "prediction_step": tune.choice([1])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=4,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "lr_step", "gamma", "epochs", "n_heads", "dim_val", "dim_att", "encoder_layers",
                           "decoder_layers", "batch_size", "seq_len"],
        metric_columns=["loss", "training_iteration"])
    result = tune.run(
        partial(transformer_train, save_dir=save_dir, loss_function=loss_function),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        local_dir=save_dir,
        name=experiment_name)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    # print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))

    best_trained_model = Transformer(best_trial.config["dim_val"], best_trial.config["dim_att"],
                                     best_trial.config["input_feat_enc"], best_trial.config["input_feat_dec"],
                                     best_trial.config["seq_len"], best_trial.config["decoder_layers"],
                                     best_trial.config["encoder_layers"], best_trial.config["n_heads"])

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    local_dir = save_dir
    exp_name = experiment_name
    test_acc = test_transformer(best_trained_model, best_trial.config, local_dir, exp_name, loss_function)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #9
def main_tune(base_args):
    # ray.init(log_to_driver=False)
    tune_config = {
        "learning_rate": tune.loguniform(5e-6, 1e-3),
        "weight_decay": tune.choice([0.0, 1e-3, 1e-2, 0.1]),
        "batch_size": tune.choice([16, 32, 64, 128]),
        "latent_dim": tune.choice([2, 3, 8, 16, 32, 128, 256, 512])
    }

    scheduler = ASHAScheduler(max_t=base_args.max_tune_epoches,
                              grace_period=3,
                              reduction_factor=2)

    reporter = CLIReporter(parameter_columns=[
        "learning_rate", "weight_decay", "batch_size", "latent_dim"
    ],
                           metric_columns=[
                               "val_lossR", "loss", "Reconstruction_Loss",
                               "training_iteration"
                           ])

    analysis = tune.run(tune.with_parameters(tune_train, base_arg=base_args),
                        resources_per_trial={
                            "cpu": 12,
                            "gpu": 1.0,
                        },
                        metric="val_lossR",
                        mode="min",
                        config=tune_config,
                        num_samples=10,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name="tune_vae_chol")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #10
def run_best_params(opt):
    best_params_dir = get_best_params_dir(opt)
    with open(best_params_dir + '/params.json') as f:
        best_params = json.loads(f.read())
    # allow params specified at the cmd line to override
    best_params_ret = {**best_params, **opt}
    # the exception is number of epochs as we want to use more here than we would for hyperparameter tuning.
    best_params_ret['epoch'] = opt['epoch']

    print("Running with parameters {}".format(best_params_ret))

    data_dir = os.path.abspath("../data")
    reporter = CLIReporter(metric_columns=[
        "accuracy", "loss", "test_acc", "train_acc", "best_time", "best_epoch",
        "training_iteration"
    ])

    if opt['name'] is None:
        name = opt['folder'] + '_test'
    else:
        name = opt['name']

    result = tune.run(
        partial(train_ray_int, data_dir=data_dir),
        name=name,
        resources_per_trial={
            "cpu": opt['cpus'],
            "gpu": opt['gpus']
        },
        search_alg=None,
        keep_checkpoints_num=3,
        checkpoint_score_attr='accuracy',
        config=best_params_ret,
        num_samples=opt['reps'] if opt["num_splits"] == 0 else opt["num_splits"] * opt["reps"],
        scheduler=None,
        max_failures=1,  # early stop solver can't recover from failure as it doesn't own m2.
        local_dir='../ray_tune',
        progress_reporter=reporter,
        raise_on_failed_trial=False)

    df = result.dataframe(metric=opt['metric'],
                          mode="max").sort_values(opt['metric'],
                                                  ascending=False)
    try:
        df.to_csv('../ray_results/{}_{}.csv'.format(
            name, time.strftime("%Y%m%d-%H%M%S")))
    except Exception:
        pass

    print(df[['accuracy', 'test_acc', 'train_acc', 'best_time', 'best_epoch']])

    test_accs = df['test_acc'].values
    print("test accuracy {}".format(test_accs))
    log = "mean test {:04f}, test std {:04f}, test sem {:04f}, test 95% conf {:04f}"
    print(
        log.format(test_accs.mean(), np.std(test_accs), get_sem(test_accs),
                   mean_confidence_interval(test_accs)))
Example #11
def tune_from_existing(start_model,
                       start_config,
                       num_samples=10,
                       num_epochs=10,
                       gpus_per_trial=0.0,
                       day=0):
    data_interface = MNISTDataInterface("/tmp/mnist_data", max_days=10)
    num_examples = data_interface._get_day_slice(
        day) - data_interface._get_day_slice(day - 1)

    config = start_config.copy()
    config.update({
        "batch_size": tune.choice([16, 32, 64]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    })

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=["lr", "momentum", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"],
    )

    analysis = tune.run(
        partial(
            train_mnist,
            start_model=start_model,
            data_fn=data_interface.get_incremental_data,
            num_epochs=num_epochs,
            use_gpus=True if gpus_per_trial > 0 else False,
            day=day,
        ),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=0,
        name="tune_serve_mnist_fromsexisting",
    )

    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_accuracy = best_trial.metric_analysis["mean_accuracy"]["last"]
    best_trial_config = best_trial.config
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
Example #12
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--X_dir", type=str)
    parser.add_argument("--y_dir", type=str)
    parser.add_argument("--epoch", type=int)
    parser.add_argument("--config_dir", type=str)
    parser.add_argument("--model", type=str)
    parser.add_argument("--n_sample", type=int)

    args = parser.parse_args()

    X_train = torch.load(args.X_dir)
    y_train = torch.load(args.y_dir)

    config = {
        "n_hidden": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([16, 32, 64, 128]),
    }

    CL = CustomLoss(1, 2)
    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=args.epoch,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    def train_func(config):
        train_model(
            X=X_train,
            y=y_train,
            num_epochs=args.epoch,
            loss_func=CL.custom_loss_1,
            model_name=args.model,
            config=config,
        )

    result = tune.run(
        train_func,
        resources_per_trial={
            "cpu": 2,
            "gpu": 2
        },
        config=config,
        num_samples=args.n_sample,
        scheduler=scheduler,
        progress_reporter=reporter,
    )
    best_trial = result.get_best_trial("loss", "min", "last")

    with open(args.config_dir, "w") as json_file:
        json.dump(best_trial.last_result["config"], json_file)

    last_loss = best_trial.last_result["loss"]
    print(f"Validation Loss of best model was {last_loss}.")
Example #13
def tune_vl_bert(config_path,
                 pl_ckpt_path,
                 num_samples=10,
                 num_epochs=10,
                 gpus_per_trial=2):

    # scheduler = ASHAScheduler(
    #     metric="loss",
    #     mode="min",
    #     max_t=num_epochs,
    #     grace_period=1,
    #     reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "lr", "weight_decay", "warmup_factor", "max_epoch", "batch_size"
        ],
        metric_columns=["mean_accuracy", "training_iteration"])

    param_config = {
        "lr": 6.25e-7,
        "weight_decay": tune.loguniform(1e-5, 1e-2),
        "batch_size": 4,
        "max_epoch": tune.choice([4, 6, 8, 10]),
        "warmup_factor": tune.uniform(0, 1),
        "warmup_steps": tune.uniform(100, 800),
    }

    scheduler = PopulationBasedTraining(time_attr="training_iteration",
                                        metric="mean_accuracy",
                                        mode="max",
                                        perturbation_interval=2,
                                        hyperparam_mutations={
                                            "lr":
                                            tune.loguniform(6.25e-8, 6.25e-6),
                                            "batch_size": [1, 2, 3, 4],
                                        })

    update_config(config_path)
    model_base_cfg = copy.deepcopy(config)

    tune.run(partial(
        _tune,
        vl_bert_config=model_base_cfg,
        pl_ckpt_path=pl_ckpt_path,
        num_gpus=gpus_per_trial,
    ),
             resources_per_trial={
                 "cpu": 4,
                 "gpu": gpus_per_trial,
             },
             config=param_config,
             num_samples=num_samples,
             scheduler=scheduler,
             progress_reporter=reporter,
             name="tune_vl_bert")
Example #14
def tune4_withLabel(
    model,
    train_set: Dataset,
    val_set: Dataset,
    dims: list,
    config: dict,
    EPOCHS: int = 300,
    extra_feature_len: int = 0,
    extra_feature_len2: int = 0,
    n_gpu=1,
    n_samples=20,
    model_name="model",
):

    dim1, dim2, dim3, dim4 = dims[0], dims[1], dims[2], dims[3]

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "hidden_dim"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(tune.with_parameters(
        train4_withLabel,
        model=model,
        dim1=dim1,
        dim2=dim2,
        dim3=dim3,
        dim4=dim4,
        extra_feature_len=extra_feature_len,
        extra_feature_len2=extra_feature_len2,
        train_set=train_set,
        val_set=val_set,
        num_epochs=EPOCHS,
        num_gpus=n_gpu,
        model_name=model_name),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": n_gpu
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=n_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=model_name,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
Example #15
def start_training(name):
    Epochs = 1000
    Samples = 50
    ModelName = name

    pose_autoencoder = MLP_withLabel.load_checkpoint(
        "/home/nuoc/Documents/MEX/models/MLP4_withLabel_best/M3/0.00324857.512.pbz2"
    )
    # pose_autoencoder = MLP_withLabel.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP_withLabel/0.0013522337.512.pbz2")

    pose_encoder_out_dim = pose_autoencoder.dimensions[-1]

    scheduler = ASHAScheduler(max_t=Epochs,
                              grace_period=15,
                              reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=1)

    analysis = tune.run(tune.with_parameters(
        tuning,
        MODEL=MotionGenerationModel,
        pose_autoencoder=pose_autoencoder,
        cost_dim=cost_dim,
        phase_dim=phase_dim,
        input_slices=[phase_dim, pose_dim, cost_dim],
        output_slices=[phase_dim, phase_dim, pose_encoder_out_dim],
        train_set=train_set,
        val_set=val_set,
        num_epochs=Epochs,
        model_name=ModelName),
                        resources_per_trial={
                            "cpu": 2,
                            "gpu": 1
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=Samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=ModelName,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)

    ray.shutdown()
Example #16
def main(args):
    cfg = setup(args)

    search_space = CS.ConfigurationSpace()
    search_space.add_hyperparameters([
        CS.UniformFloatHyperparameter(name="lr", lower=1e-6, upper=1e-3),
        CS.UniformFloatHyperparameter(name="wd", lower=0, upper=1e-3),
        CS.UniformFloatHyperparameter(name="wd_bias", lower=0, upper=1e-3),
        CS.CategoricalHyperparameter(name="bsz",
                                     choices=[64, 96, 128, 160, 224, 256]),
        CS.CategoricalHyperparameter(name="num_inst",
                                     choices=[2, 4, 8, 16, 32]),
        CS.UniformIntegerHyperparameter(name="delay_iters", lower=20,
                                        upper=60),
        CS.UniformFloatHyperparameter(name="ce_scale", lower=0.1, upper=1.0),
        CS.UniformIntegerHyperparameter(name="circle_scale",
                                        lower=8,
                                        upper=256),
        CS.UniformFloatHyperparameter(name="circle_margin",
                                      lower=0.1,
                                      upper=0.5),
        CS.CategoricalHyperparameter(name="autoaug_enabled",
                                     choices=[True, False]),
        CS.CategoricalHyperparameter(name="cj_enabled", choices=[True, False]),
    ])

    exp_metrics = dict(metric="score", mode="max")
    bohb_hyperband = HyperBandForBOHB(
        time_attr="training_iteration",
        max_t=7,
        **exp_metrics,
    )
    bohb_search = TuneBOHB(search_space, max_concurrent=4, **exp_metrics)

    reporter = CLIReporter(parameter_columns=["bsz", "num_inst", "lr"],
                           metric_columns=["r1", "map", "training_iteration"])

    analysis = tune.run(partial(train_reid_tune, cfg),
                        resources_per_trial={
                            "cpu": 10,
                            "gpu": 1
                        },
                        search_alg=bohb_search,
                        num_samples=args.num_samples,
                        scheduler=bohb_hyperband,
                        progress_reporter=reporter,
                        local_dir=cfg.OUTPUT_DIR,
                        keep_checkpoints_num=4,
                        name="bohb")

    best_trial = analysis.get_best_trial("map", "max", "last")
    logger.info("Best trial config: {}".format(best_trial.config))
    logger.info("Best trial final validation mAP: {}, Rank-1: {}".format(
        best_trial.last_result["map"], best_trial.last_result["r1"]))
Example #17
def hyperparameter_tuning_initializer(loss_type='SL', learning_rate_scheduler='CARM'):
    
    # defining the hyperparameters
    if loss_type == 'FL':
        config = {
            'gamma': tune.choice([0.5, 1, 2]),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDIL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'inverse_dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'SL':
        config = {
            'lambda': tune.uniform(0, 1),
            'tau': tune.uniform(0.02, 0.04),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'VCE':
        config = {
            'var_loss':  tune.uniform(0.5, 5.5),
            'lr': tune.loguniform(1e-4, 1e-3)
        }

    # hyperparameters for learning rate scheduler
    if learning_rate_scheduler == 'CARM':
        config['T_0'] = tune.choice([5, 10, 20, 40, 50])
        config['eta_min_factor'] = tune.loguniform(1e2, 1e4)
    if learning_rate_scheduler == 'SLR':
        config['step_size'] = tune.choice([5, 10, 20, 40, 50])
    if learning_rate_scheduler == 'MLR':
        config['lr_lambda'] = tune.uniform(0.8, 0.99)

        
    # defining the scheduler
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=conf['max_epochs'] // 20,
        grace_period=1,
        reduction_factor=2
    )
    
    # defining the reporter
    reporter = CLIReporter(metric_columns=['loss', 'avg_dice_coefficient', 'epoch'])
    
    return config, scheduler, reporter
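The function above only builds the pieces; a hypothetical call site (the trainable name, resources, and sample count below are illustrative, not from the original project) would pass the returned triple straight to tune.run:

config, scheduler, reporter = hyperparameter_tuning_initializer(loss_type='SL',
                                                                learning_rate_scheduler='CARM')
analysis = tune.run(
    train_segmentation,  # hypothetical trainable reporting 'loss', 'avg_dice_coefficient', 'epoch'
    resources_per_trial={'cpu': 2, 'gpu': 1},
    config=config,
    num_samples=20,
    scheduler=scheduler,
    progress_reporter=reporter)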
Example #18
def run_ray_single_instance(trainer, logger, **kwargs):
    """Run with ray in a single instance. Tested."""

    # adapted from HF integrations
    prefix_checkpoint_dir = "checkpoint"

    def _objective(trial, checkpoint_dir=None):
        model_path = None
        if checkpoint_dir:
            for subdir in os.listdir(checkpoint_dir):
                if subdir.startswith(prefix_checkpoint_dir):
                    model_path = os.path.join(checkpoint_dir, subdir)

        trainer.objective = None
        trainer.train(model_path=model_path, trial=trial)
        # If there hasn't been any evaluation during the training loop.
        if getattr(trainer, "objective", None) is None:
            metrics = trainer.evaluate()
            # Logs at the end of the objective function, useful for hp search only
            trainer._tune_save_checkpoint()
            # Q: what else is reporting, when not reporting hp search metrics?
            tune.report(**metrics, done=True)

    # The TensorBoard callback does not pickle, so remove it while the search is
    # running; it is re-added after `tune.run` below.
    _tb_writer = trainer.pop_callback(TensorBoardCallback)

    # Setup default `resources_per_trial` and `reporter`.
    if "resources_per_trial" not in kwargs and trainer.args.n_gpu > 0:
        # `args.n_gpu` is considered the total number of GPUs that will be split
        # among the `n_jobs`
        n_jobs = int(kwargs.pop("n_jobs", 1))
        num_gpus_per_trial = trainer.args.n_gpu
        if num_gpus_per_trial / n_jobs >= 1:
            num_gpus_per_trial = int(math.ceil(num_gpus_per_trial / n_jobs))
        kwargs["resources_per_trial"] = {"gpu": num_gpus_per_trial}

    if "progress_reporter" not in kwargs:
        from ray.tune import CLIReporter

        kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"])

    if "keep_checkpoints_num" in kwargs and kwargs["keep_checkpoints_num"] > 0:
        # `keep_checkpoints_num=0` would disable checkpointing
        trainer.use_tune_checkpoints = True
        if kwargs["keep_checkpoints_num"] > 1:
            logger.warning(
                "Currently keeping {} checkpoints for each trial. "
                "Checkpoints are large, consider setting `keep_checkpoints_num=1`.".format(
                    kwargs["keep_checkpoints_num"]))

    # run tune
    tune.run(_objective, **kwargs)

    # Re-attach the TensorBoard callback that was removed before the search.
    if _tb_writer is not None:
        trainer.add_callback(_tb_writer)
Example #19
def main(num_samples=10, max_num_epochs=50, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    checkpoint_dir = os.path.abspath("./checkpoints")
    config = {
        "l1": tune.choice([32, 64, 128, 256]),
        "l2": tune.choice([32, 64, 128, 256]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256])
    }

    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=5,
                              reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    st_time = time.time()
    result = tune.run(partial(train_regression,
                              checkpoint_dir=checkpoint_dir,
                              data_dir=data_dir),
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config=config,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print("Total Time", st_time - time.time())
Example #20
def main(args):
    device = config_cuda(args.use_cuda)

    data_pth = Path(Path.cwd() / args.data_path)
    out_pth = Path(Path.cwd() / args.out_path)

    train_set, valid_set, test_set = get_datasets(data_pth)

    config = {
        'eta': tune.loguniform(1e-5, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2
    )
    reporter = CLIReporter(
        parameter_columns=['eta', 'batch_size'],
        metric_columns=['loss', 'accuracy', 'training_iteration']
    )
    num_gpus = 1 if device == 'cuda' else 0
    result = tune.run(
        partial(train_model, model_name=args.model, device=device,
                max_epochs=args.max_epochs, num_workers=args.num_workers,
                data_pth=data_pth),
        resources_per_trial={'cpu': args.num_workers, 'gpu': num_gpus},
        config=config,
        num_samples=args.num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )
    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f'Best trial config {best_trial.config}')
    print(f'Best trial final validation loss: {best_trial.last_result["loss"]}')
    model = load_model(args.model, len(train_set.classes))

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, 'checkpoint'))
    model.load_state_dict(model_state)

    test_acc = test_model(model, device, test_set, out_pth, args.num_workers)
    print(f'Best trial test set accuracy: {test_acc}')
Example #21
def main(num_samples=10, max_num_epochs=10):
    data_dir = os.path.abspath("./data")
    trainset, _ = load_data(data_dir)
    class_names = trainset.classes
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-2),
        "batch_size": tune.choice([8, 16, 32])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )
    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"]
    )
    result = tune.run(
        partial(train_net, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": 1},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(len(class_names), best_trial.config["l1"], best_trial.config["l2"])
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #22
def run_learning(env_config):
    config['env_config'] = env_config
    stop = {
        #"episodes_total": 10
    }
    tune.run("PPO",
             name="Herding",
             stop=stop,
             config=config,
             trial_name_creator=lambda trial: 'Herding',
             trial_dirname_creator=lambda trial: 'Herding',
             keep_checkpoints_num=1,
             checkpoint_freq=5,
             progress_reporter=CLIReporter(max_report_frequency=60)
             #resume=True
             )
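The config dict passed to tune.run("PPO", ...) above is a module-level RLlib trainer configuration that is not shown here. A minimal hypothetical stand-in using standard RLlib keys (the environment and worker counts are illustrative only):

config = {
    "env": "CartPole-v1",
    "framework": "torch",
    "num_workers": 2,
    "num_gpus": 0,
    # env_config is filled in by run_learning() above.
    "env_config": {},
}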
Example #23
def tuning(args):
    activation = nn.PReLU if args.actv == 'prelu' else nn.SELU
    config = {
        "l1_units": tune.choice([480, 512, 544]),
        "l2_units": tune.choice([224, 256, 288]),
        "l3_units": tune.choice([96, 128, 160]),
        "lambda": tune.choice([1e-3, 1e-4, 1e-5]),
        "actv": tune.choice([activation])
    }
    scheduler = PopulationBasedTraining(time_attr='training_iteration',
                                        perturbation_interval=4,
                                        hyperparam_mutations={
                                            "l1_units":
                                            [464, 496, 528, 560, 576],
                                            "l2_units":
                                            [208, 240, 272, 304, 328],
                                            "l3_units":
                                            [80, 112, 144, 176, 208]
                                        })

    reporter = CLIReporter(parameter_columns=[
        "l1_units",
        "l2_units",
        "l3_units",
        "lambda",
    ],
                           metric_columns=["loss", "training_iteration"])

    analysis = tune.run(tune.with_parameters(train,
                                             batch_size=args.batch_size,
                                             num_epochs=args.num_epochs,
                                             num_gpus=args.num_gpus),
                        resources_per_trial={
                            "cpu": args.num_cpus,
                            "gpu": args.num_gpus
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=args.num_trials,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        max_failures=3,
                        stop={"training_iteration": 10},
                        name="tune_cae")

    print(f"Found best hyperparameters: {analysis.best_config}")
Example #24
def tune_ecg(data_dir,
             num_epochs=1,
             normalised=True,
             num_samples=10,
             gpus_per_trial=1):
    config = {
        "lstm_size": tune.choice([2, 3, 4, 5, 32, 64, 128]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128, 256, 512])
    }
    #    scheduler = PopulationBasedTraining(
    #        time_attr="training_iteration",
    #        perturbation_interval=5,
    #        hyperparam_mutations={
    #            # distribution for resampling
    #            "lr": lambda: np.random.uniform(0.0001, 1),
    #            # allow perturbations within this set of categorical values
    #            "momentum": [0.8, 0.9, 0.99],
    #        })

    reporter = CLIReporter(
        parameter_columns=["lstm_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    trainable = tune.with_parameters(train_ecg,
                                     data_dir=data_dir,
                                     normalised=normalised,
                                     num_epochs=num_epochs,
                                     num_gpus=gpus_per_trial)

    analysis = tune.run(
        trainable,
        resources_per_trial={
            "cpu": 16,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        local_dir="./results",
        #        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_ecg")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #25
def main(args, reproducible: bool):
    if reproducible:
        seed_everything(42)

    datamodule = TwoDomainMMEDM(dataPath=args.dataPath,
                                augment=True,
                                batch_size=32,
                                num_workers=8)

    config = {
        "log_lr": tune.uniform(-4, -2),
        "log_lrRatio": tune.uniform(-3, 0),
        "log_decay": tune.uniform(-8, -1),
    }

    search_alg = BayesOptSearch(
        metric='mean_iou',
        mode='max',
    )

    scheduler = ASHAScheduler(grace_period=25)

    reporter = CLIReporter(
        parameter_columns=["log_lr", "log_lrRatio", "log_decay"],
        metric_columns=["loss", "mean_iou", "training_iteration"])

    analysis = tune.run(tune.with_parameters(
        trainWithTune,
        datamodule=datamodule,
        num_epochs=175,
        num_gpus=1,
    ),
                        resources_per_trial={
                            "cpu": 5,
                            "gpu": 0.5,
                        },
                        metric="mean_iou",
                        mode="max",
                        config=config,
                        num_samples=20,
                        scheduler=scheduler,
                        search_alg=search_alg,
                        progress_reporter=reporter,
                        name="tune_minimax_segmenter")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #26
def optimize():
    name_dir = os.path.join('saved', 'hyper-lstm')
    hyperparam_config = {
        'name': 'lstm',
        'num_hidden': tune.sample_from(lambda _: np.random.randint(1, 10)),
        'num_layers': tune.sample_from(lambda _: np.random.randint(1, 5)),
        'opt': tune.choice(['adam', 'sgd', 'adamw', 'lbfgs']),
        'lr': tune.loguniform(1e-10, 1),
        'epoch': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'beta_1': tune.loguniform(1e-8, 1e-2),
        'beta_2': tune.loguniform(1e-8, 1e-2),
        'weight_decay': tune.loguniform(1e-8, 1e-2),
        'max_iter': tune.sample_from(lambda _: np.random.randint(10, 100)),
        'momentum': tune.uniform(0.5, 0.9),
        'patience': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'batch_size': 16
    }
    if not os.path.isdir(name_dir):
        os.mkdir(name_dir)
    scheduler = ASHAScheduler(metric='accuracy',
                              mode='max',
                              max_t=25,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy"])
    result = tune.run(partial(train,
                              checkpoint_dir=name_dir,
                              cwd=os.getcwd(),
                              tuning=True),
                      resources_per_trial={
                          "cpu": 1,
                          "gpu": 0.5
                      },
                      config=hyperparam_config,
                      num_samples=200,
                      scheduler=scheduler,
                      progress_reporter=reporter)
    best_trial = result.get_best_trial("accuracy", "max", "last")

    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    print("Best Checkpoint Dir: " + str(best_trial.checkpoint.value))
    return best_trial.config
Example #27
def _tune(
    model,
    train_set: Dataset,
    val_set: Dataset,
    dim: int,
    config: dict,
    EPOCHS: int = 300,
    n_gpu=1,
    n_samples=20,
    model_name="model",
):

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)
    analysis = tune.run(tune.with_parameters(train,
                                             model=model,
                                             dim=dim,
                                             train_set=train_set,
                                             val_set=val_set,
                                             num_epochs=EPOCHS,
                                             num_gpus=n_gpu,
                                             model_name=model_name),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": n_gpu
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=n_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=model_name,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
Example #28
def run(smoke_test=False):
    stop = {"training_iteration": 1 if smoke_test else 50}
    num_workers = 1 if smoke_test else 20
    num_gpus = 0 if smoke_test else 1

    config = {
        "env": "PongNoFrameskip-v4",
        "framework": tune.grid_search(["tf", "torch"]),
        "num_gpus": num_gpus,
        "rollout_fragment_length": 50,
        "train_batch_size": 750,
        "num_workers": num_workers,
        "num_envs_per_worker": 1,
        "clip_rewards": True,
        "num_sgd_iter": 2,
        "vf_loss_coeff": 1.0,
        "clip_param": 0.3,
        "grad_clip": 10,
        "vtrace": True,
        "use_kl_loss": False,
    }
    logger.info("Configuration: \n %s", pformat(config))

    # Run the experiment.
    # TODO(jungong) : maybe add checkpointing.
    return tune.run(
        "APPO",
        config=config,
        stop=stop,
        verbose=1,
        num_samples=1,
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "snapshots": "snapshots",
                "episodes_this_iter": "train_episodes",
                "episode_reward_mean": "reward_mean",
            },
            sort_by_metric=True,
            max_report_frequency=30,
        ),
    )
Example #29
def tune_mnist_asha(num_samples=10,
                    num_epochs=50,
                    gpus_per_trial=0,
                    cpus_per_trial=4):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "layer_3_size": tune.choice([128, 256, 512]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "layer_1_size", "layer_2_size", "layer_3_size", "lr", "batch_size"
        ],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(tune.with_parameters(train_mnist_tune,
                                             data_dir=data_dir,
                                             num_epochs=num_epochs,
                                             num_gpus=gpus_per_trial),
                        resources_per_trial={
                            "cpu": cpus_per_trial,
                            "gpu": gpus_per_trial
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=num_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)
Example #30
def tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": 1e-3,
        "batch_size": 64,
    }

    scheduler = PopulationBasedTraining(
        perturbation_interval=4,
        hyperparam_mutations={
            "lr": tune.loguniform(1e-4, 1e-1),
            "batch_size": [32, 64, 128]
        })

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune_checkpoint,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_pbt")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)