Example 1
config['comet_ml_workspace'] = args.comet_ml_workspace
config['comet_ml_project_name'] = args.comet_ml_project_name
config['comet_ml_save_model'] = args.comet_ml_save_model
# Make sure that all None configuration values are correctly formatted as None, not as the string 'None'
for key, val in config.items():
    if str(val).lower() == 'none':
        config[key] = None
# Start ray
# ray.init(address='auto', resources=dict(CPU=120, GPU=120))
ray.init(address='auto')
print('DEBUG: Started Ray.')
# NOTE: These could actually just be the current VM's resources. If it's the head node,
# we might need some extra resources just to add new nodes.
print(f'DEBUG: The cluster\'s total resources: \n{ray.cluster_resources()}')
print(f'DEBUG: The cluster\'s currently available resources: \n{ray.available_resources()}')
# Create the trainer
trainer = TorchTrainer(
        model_creator=utils.eICU_model_creator,
        data_creator=utils.eICU_data_creator,
        optimizer_creator=utils.eICU_optimizer_creator,
        training_operator_cls=utils.eICU_Operator,
        num_workers=config.get('num_workers', 1),
        config=config,
        use_gpu=True,
        use_fp16=config.get('use_fp16', False),
        use_tqdm=True)
print('DEBUG: Created the TorchTrainer object.')
# Train the model
for epoch in du.utils.iterations_loop(range(config.get('n_epochs', 1)), see_progress=config.get('see_progress', True), desc='Epochs'):
    stats = trainer.train(info=dict(epoch_idx=epoch))
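
# A minimal follow-up sketch, assuming the eICU_Operator above registers a
# validation loader: run a final validation pass, then release the Ray workers.
val_stats = trainer.validate()
print(f'DEBUG: Final validation stats: \n{val_stats}')
trainer.shutdown()
print('DEBUG: Shut down the TorchTrainer workers.')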
Example 2
                          schedulers=scheduler)
        self.register_data(train_loader=train_loader, validation_loader=val_loader)
# __torch_operator_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler is used
    config={"lr": 0.001, "batch_size": 64})

# __torch_trainer_end__
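
# A minimal sketch of exercising the trainer built above before shutting it down
# (assumes MyTrainingOperator registers its own train and validation loaders):
for _ in range(2):
    print(trainer.train())
    print(trainer.validate())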

trainer.shutdown()

# __torch_model_start__
import torch.nn as nn

def model_creator(config):
    """Constructor function for the model(s) to be optimized.

    You will also need to provide a custom training
    function to specify the optimization procedure for multiple models.
Example 3
if __name__ == "__main__":
    ray.init(address=None if args.local else "auto")
    num_workers = 2 if args.local else int(ray.cluster_resources().get(device))
    from ray.util.sgd.torch.examples.train_example import LinearDataset

    print(f"Model: {args.model}")
    print("Batch size: %d" % args.batch_size)
    print("Number of %ss: %d" % (device, num_workers))

    trainer = TorchTrainer(
        model_creator=lambda cfg: getattr(models, args.model)(),
        optimizer_creator=lambda model, cfg: optim.SGD(
            model.parameters(), lr=0.01 * cfg.get("lr_scaler")),
        data_creator=lambda cfg: LinearDataset(4, 2),  # Mock dataset.
        initialization_hook=init_hook,
        config=dict(
            lr_scaler=num_workers),
        training_operator_cls=Training,
        num_workers=num_workers,
        use_gpu=args.cuda,
        use_fp16=args.fp16,
    )

    img_secs = []
    for x in range(args.num_iters):
        result = trainer.train()
        # print(result)
        img_sec = result["img_sec"]
        print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device))
        img_secs.append(img_sec)
Example 4
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address,
                 num_cpus=args.num_workers,
                 log_to_driver=True)

    # Trainer Initialization
    trainer = TorchTrainer(training_operator_cls=CIFAR10Module,
                           num_workers=args.num_workers,
                           config={
                               "lr": args.learning_rate,
                               "lr_decay": args.lr_decay,
                               "eps": args.eps,
                               "momentum": args.momentum,
                               "wd": args.wd,
                               "data_dir": args.data_dir,
                               "batch_size": args.batch_size,
                               "num_workers": args.num_workers,
                               "smoke_test": args.smoke_test
                           },
                           use_gpu=args.use_gpu,
                           scheduler_step_freq="epoch",
                           use_fp16=args.fp16,
                           use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []

    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    if not os.path.exists(path):
        os.mkdir(path)

    from tabulate import tabulate
    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(max_retries=1,
                              info=dict(epoch_idx=it,
                                        num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])
        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

        trainer.save(
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.ray".
            format(args.model_name, args.trial, it))
        torch.save(
            [train_loss, val_loss],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.loss"
            .format(args.model_name, args.trial, it))
        torch.save(
            [val_acc],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.acc".
            format(args.model_name, args.trial, it))

    print(val_stats)
    trainer.shutdown()
    print("success!")
Example 5
        self.model = self.model[0]
        self.optimizer = self.optimizer[0]
        # Get the corresponding shard
        train_shard = train_dataset.get_shard(self.world_rank)
        train_loader = DataLoader(train_shard, batch_size=64)
        test_shard = test_dataset.get_shard(self.world_rank)
        val_loader = DataLoader(test_shard, batch_size=64)
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)


# You can either train the model like this

trainer = TorchTrainer(training_operator_cls=CustomOperator,
                       num_workers=num_executors,
                       add_dist_sampler=False,
                       num_cpus_per_worker=1,
                       config={"lr": 0.01})
for i in range(10):
    stats = trainer.train()
    print(stats)
    val_stats = trainer.validate()
    print(val_stats)
trainer.shutdown()

# Or you can perform a hyperparameter search using Ray Tune

# TorchTrainable = TorchTrainer.as_trainable(
#                     training_operator_cls=CustomOperator,
#                     num_workers=num_executors,
#                     add_dist_sampler=False,
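
# A hedged sketch of that Ray Tune route: TorchTrainer.as_trainable returns a
# Tune-compatible Trainable class. The "lr" search space and the stopping
# criterion below are illustrative, not taken from the snippet above.
from ray import tune

TorchTrainable = TorchTrainer.as_trainable(
    training_operator_cls=CustomOperator,
    num_workers=num_executors,
    add_dist_sampler=False,
    num_cpus_per_worker=1)
analysis = tune.run(
    TorchTrainable,
    config={"lr": tune.grid_search([0.01, 0.001])},
    stop={"training_iteration": 10})
print(analysis.get_best_config(metric="val_loss", mode="min"))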
Example 6
        optimizers: The return values from ``optimizer_creator``.
            This can be one or more torch optimizer objects.
        config: Configuration dictionary passed into ``TorchTrainer``

    Returns:
        One or more Torch scheduler objects.
    """
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator,
    data_creator,
    optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    config={"lr": 0.001})

# __torch_trainer_end__
Example 7
def main():
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments,
                               TrainingArguments, RayArguments))
    all_args = parser.parse_args_into_dataclasses()
    model_args, dataprocessing_args, training_args, ray_args = all_args

    # For now, let's merge all the sets of args into one,
    # but soon, we'll keep distinct sets of args, with a
    # cleaner separation of concerns.
    args = argparse.Namespace(
        **vars(model_args),
        **vars(dataprocessing_args),
        **vars(training_args),
        **vars(ray_args),
    )

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    use_gpu = torch.cuda.is_available() and not args.no_cuda

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError(f"Task not found: {args.task_name}")
    args.output_mode = output_modes[args.task_name]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info("Training/evaluation parameters %s", args)
    ray.init(address=args.address)
    # Training

    trainer = TorchTrainer(
        training_operator_cls=TransformerOperator,
        use_fp16=args.fp16,
        num_workers=args.num_workers,
        use_gpu=use_gpu,
        use_tqdm=True,
        config={"args": args},
    )

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = trainer.get_local_operator().tokenizer
    local_model = trainer.get_model()

    epochs_trained = 0
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )

    trainer.apply_all_workers(lambda: set_seed(args))
    if args.do_train:
        for _ in train_iterator:
            stats = trainer.train()
            print("Training stats:", stats)
            logs = evaluate(args, local_model, tokenizer)
            print(json.dumps(logs))

    # Post-training validation
    save_and_evaluate_checkpoints(args, local_model, tokenizer)
Example 8
        ray.init(num_cpus=2)
    else:
        ray.init(address="auto")
    num_workers = 2 if args.local else int(ray.cluster_resources().get(device))
    from ray.util.sgd.torch.examples.train_example import LinearDataset

    print(f"Model: {args.model}")
    print("Batch size: %d" % args.batch_size)
    print("Number of %ss: %d" % (device, num_workers))

    trainer = TorchTrainer(
        training_operator_cls=Training,
        initialization_hook=init_hook,
        config={
            "lr_scaler": num_workers,
            "model": args.model
        },
        num_workers=num_workers,
        use_gpu=args.cuda,
        use_fp16=args.fp16,
    )

    img_secs = []
    for x in range(args.num_iters):
        result = trainer.train()
        # print(result)
        img_sec = result["img_sec"]
        print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
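    # One plausible way to summarize the collected throughput numbers
    # (an illustrative sketch; assumes numpy is available):
    import numpy as np
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf))
    print("Total img/sec on %d %s(s): %.1f +-%.1f" %
          (num_workers, device, num_workers * img_sec_mean,
           num_workers * img_sec_conf))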
Example 9
            This can be one or more torch optimizer objects.
        config: Configuration dictionary passed into ``TorchTrainer``

    Returns:
        One or more Torch scheduler objects.
    """
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    scheduler_step_freq="epoch",  # if scheduler_creator is set
    config={"lr": 0.001, "batch_size": 64})

# __torch_trainer_end__
Example 10
path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58"
# path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_c1c7e_00005_5_cost_fn=0,epsilon_input=0.1_2021-01-17_12-41-27/checkpoint_10/checkpoint-10"
val_data = TrainedPolicyDataset(path1, size=(0, 0), seed=4567, traces=False)
config = get_PPO_config(1234, use_gpu=0)
trainer = ppo.PPOTrainer(config=config)
trainer.restore(path1)
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()

if enable_training:
    trainer1 = TorchTrainer(
        training_operator_cls=SafetyTrainingOperator,
        num_workers=1,
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn
        },
        backend="auto",
        scheduler_step_freq="epoch")
    for i in range(100):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    torch.save(trainer1.state_dict(), "checkpoint.pt")
    torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt")
    m = trainer1.get_model()
    w = m[0].weight.data.cpu().numpy()
    b = m[0].bias.data.cpu().numpy()
    print(f"trained weight: torch.tensor([[{w[0][0]},{w[0][1]}]]), bias: torch.tensor({b})")
    # trainer1.shutdown()
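
    # A hedged sketch of restoring the checkpoints saved above: TorchTrainer.load
    # restores the full trainer state, while the plain state_dict can be loaded
    # back into a model of the same architecture.
    trainer1.load("checkpoint.pt")
    state = torch.load("invariant_checkpoint.pt")
    trainer1.get_model().load_state_dict(state)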
Example 11
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=True,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    ray.init(address='auto')

    trainer1 = TorchTrainer(training_operator_cls=MyTrainingOperator,
                            num_workers=2,
                            use_gpu=False,
                            config=vars(args))

    stats = trainer1.train()

    # Alternatively, lift the trainer onto a remote actor
    # RemoteTrainer = ray.remote(num_gpus=0.5)(TorchTrainer)
    # remote_trainer = RemoteTrainer.remote(
    #     training_operator_cls=MyTrainingOperator, num_workers=1, use_gpu=True, config=vars(args))
    # # remote_trainer.train.remote()
    # stats = ray.get([remote_trainer.train.remote()])

    # ray.tune.run(TorchTrainer.as_trainable(
    #     training_operator_cls=MyTrainingOperator, num_workers=1, use_gpu=True, config=vars(args)))
    print(stats)
    print("success!")
Example 12
        # Create model.
        model = torch.nn.Linear(1, 1)

        # Create optimizer.
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Create loss.
        loss = torch.nn.MSELoss()

        # Register model, optimizer, and loss.
        self.model, self.optimizer, self.criterion = self.register(
            models=model,
            optimizers=optimizer,
            criterion=loss)

        # Register data loaders.
        self.register_data(train_loader=train_loader, validation_loader=val_loader)


ray.init()

trainer1 = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    num_workers=2,
    use_gpu=False,
    config={"batch_size": 64})

stats = trainer1.train()
print(stats)
trainer1.shutdown()
print("success!")
Example 13
        'worker': ["localhost:12345", "localhost:23456"]
    },
    'task': {'type': 'worker', 'index': 0}
})
...
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with strategy.scope():
  multi_worker_model = model_creator()

'''

# PyTorch support
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    scheduler_step_freq="epoch",
    initialization_hook=init_hook,  # set up the worker environment
    config={"lr": 0.001, "batch_size": 64},
    num_workers=100,
    # num_replicas=400,  # older, deprecated alias of num_workers; don't pass both
    use_fp16=True  # mixed-precision (FP16) training
)
trainer.train(max_retries=5, checkpoint="auto")
trainer.validate()
trainer.shutdown()