# Example #1
def test_save_and_restore(ray_start_2_cpus, num_workers, use_local,
                          tmp_path):  # noqa: F811
    """Round-trip a checkpoint through save/load and verify state survives.

    Trains one trainer, saves it, restores the checkpoint into a brand-new
    trainer, then checks that the model weights match exactly and that every
    worker operator reports the same ``rand_int`` that was checkpointed.
    """
    first = TorchTrainer(training_operator_cls=Operator,
                         num_workers=num_workers,
                         use_local=use_local)
    first.train()
    ckpt = os.path.join(tmp_path, "checkpoint")
    first.save(ckpt)

    saved_model = first.get_model()
    # rand_int from worker 0 is the reference value for all workers.
    saved_int = first.apply_all_operators(
        lambda op: op.get_model().rand_int)[0]

    first.shutdown()

    second = TorchTrainer(training_operator_cls=Operator,
                          num_workers=num_workers,
                          use_local=use_local)
    second.load(ckpt)

    restored_model = second.get_model()
    restored_ints = second.apply_all_operators(
        lambda op: op.get_model().rand_int)

    sd_before = saved_model.state_dict()
    sd_after = restored_model.state_dict()

    assert set(sd_before.keys()) == set(sd_after.keys())

    # Every tensor must be bitwise-equal after the restore.
    assert all(torch.equal(sd_before[k], sd_after[k]) for k in sd_before)
    assert all(value == saved_int for value in restored_ints)
    second.shutdown()
def train_example(num_workers=1, use_gpu=False):
    """Train a tiny linear regressor with TorchTrainer for 5 epochs.

    Builds the trainer from the module-level creator functions, prints the
    per-epoch training stats and a final validation result, then reports the
    learned weight/bias and shuts the trainer down.
    """
    hyperparams = {
        "lr": 1e-2,  # used in optimizer_creator
        "hidden_size": 1,  # used in model_creator
        "batch_size": 4,  # used in data_creator
    }
    trainer = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config=hyperparams,
        backend="gloo",
        scheduler_step_freq="epoch")

    for _ in range(5):
        print(trainer.train())

    print(trainer.validate())

    model = trainer.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        model.weight.item(), model.bias.item()))
    trainer.shutdown()
    print("success!")
# Example #3
def train_example(num_workers=1, use_gpu=False):
    """Train a tiny linear regressor via a TrainingOperator built from creators.

    Same example as above, but assembles a custom ``TrainingOperator`` from
    the module-level creator functions and hands it to ``TorchTrainer``.
    Prints per-epoch stats, validation results, and the learned parameters.
    """
    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)

    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")

    for _ in range(5):
        print(trainer.train())

    print(trainer.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    model = trainer.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" %
          (model.weight.item(), model.bias.item()))
    trainer.shutdown()
    print("success!")
# Example #4
def main():
    """Fine-tune a transformer on a GLUE task with Ray's TorchTrainer.

    Parses the four argument dataclasses from the CLI, merges them into a
    single namespace, sets up logging and Ray, trains for the requested
    number of epochs (evaluating after each), and finally runs the
    post-training checkpoint evaluation.
    """
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments,
                               TrainingArguments, RayArguments))
    model_args, dataprocessing_args, training_args, ray_args = \
        parser.parse_args_into_dataclasses()

    # For now, flatten all four argument sets into one namespace; a cleaner
    # separation of concerns is planned for later.
    args = argparse.Namespace(**vars(model_args), **vars(dataprocessing_args),
                              **vars(training_args), **vars(ray_args))

    output_dir_in_use = (os.path.exists(args.output_dir)
                         and os.listdir(args.output_dir))
    if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    use_gpu = torch.cuda.is_available() and not args.no_cuda

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.output_mode = output_modes[args.task_name]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger.info("Training/evaluation parameters %s", args)
    ray.init(address=args.address)

    # Training
    trainer = TorchTrainer(model_creator=model_creator,
                           data_creator=data_creator,
                           optimizer_creator=optimizer_creator,
                           training_operator_cls=TransformerOperator,
                           use_fp16=args.fp16,
                           apex_args={"opt_level": args.fp16_opt_level},
                           num_workers=args.num_workers,
                           use_gpu=use_gpu,
                           use_tqdm=True,
                           config={"args": args})

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = trainer.get_local_operator().tokenizer
    local_model = trainer.get_model()

    # No resume support here, so training always starts from epoch 0.
    epochs_trained = 0
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )

    trainer.apply_all_workers(lambda: set_seed(args))
    if args.do_train:
        for _ in train_iterator:
            print("Training stats:", trainer.train())
            # Evaluate on the driver after every epoch.
            print(json.dumps(evaluate(args, local_model, tokenizer)))

    # Post-training validation
    save_and_evaluate_checkpoints(args, local_model, tokenizer)
# Example #5
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn
        },
        backend="auto",
        scheduler_step_freq="epoch")
    for i in range(100):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    torch.save(trainer1.state_dict(), "checkpoint.pt")
    torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt")
    m = trainer1.get_model()
    print(f"trained weight: torch.tensor([[{m[0].weight.data.cpu().numpy()[0][0]},{m[0].weight.data.cpu().numpy()[0][1]}]]), bias: torch.tensor({m[0].bias.data.cpu().numpy()})")
    # trainer1.shutdown()
    print("success!")
else:
    m = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh())
    checkpoint = torch.load("invariant_checkpoint.pt", torch.device("cpu"))
    m.load_state_dict(checkpoint)
    # trained weight:  [[0.0018693  0.05228069]], bias: [-0.5533147] , train_loss = 0.0
    # trained weight:  [[-0.01369903  0.03511396]], bias: [-0.6535952] , train_loss = 0.0
    # trained weight:  [[0.00687088  0.26634103]], bias: [-0.6658108] , train_loss = 0.0
    # trained weight: torch.tensor([[0.038166143000125885,0.16197167336940765]]), bias: torch.tensor([-2.3122551])
# %%
m.cpu()
random.seed(0)
# Example #6
                "hidden_size": 1,  # used in model_creator
                "batch_size": 1024,  # used in data_creator
                "path": path1,  # path to load the agent nn
                "path_invariant":
                path_invariant,  # the path to the invariant network
            },
            backend="auto",
            scheduler_step_freq="epoch")
        for i in range(50):
            stats = trainer1.train()
            print(stats)

        print(trainer1.validate())
        torch.save(trainer1.state_dict(),
                   os.path.join(utils.get_save_dir(), "checkpoint.pt"))
        torch.save(trainer1.get_model()[0].state_dict(),
                   os.path.join(utils.get_save_dir(), "retrained_agent.pt"))
        agent_model, invariant_model = trainer1.get_model()
    else:
        sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
        sequential_nn.load_state_dict(
            torch.load(os.path.join(utils.get_save_dir(),
                                    "retrained_agent.pt")))
        agent_model = sequential_nn
        invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50),
                                              torch.nn.ReLU(),
                                              torch.nn.Linear(50, 1),
                                              torch.nn.Tanh())
        invariant_model.load_state_dict(
            torch.load(
                path_invariant,