The following examples use the legacy Ray Train function API (Ray 1.x ray.train); each snippet assumes `from ray import train` plus the imports of its original surrounding module.

Example #1

def train_func():
    train.torch.accelerate(amp=True)

    model = torchvision.models.resnet101()
    model = train.torch.prepare_model(model)

    train.save_checkpoint(model=model)
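
For context, a minimal driver sketch for running a function like this, assuming the legacy Ray 1.x ray.train.Trainer (the backend string and worker count are illustrative):

from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(train_func)  # Executes train_func on each worker.
trainer.shutdown()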

Example #2

def train_func():
    checkpoint = train.load_checkpoint()
    if checkpoint:
        epoch = checkpoint["epoch"]
    else:
        epoch = 0
    for i in range(epoch, epoch + 2):
        train.save_checkpoint(epoch=i)
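
Because this function resumes from checkpoint["epoch"] when a checkpoint exists, a driver can continue it across runs. A hedged sketch, assuming the legacy Trainer.run checkpoint argument and latest_checkpoint attribute:

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(train_func)  # First run saves checkpoints for epochs 0 and 1.
# Resume: the loaded checkpoint has epoch=1, so this run covers epochs 1 and 2.
trainer.run(train_func, checkpoint=trainer.latest_checkpoint)
trainer.shutdown()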

Example #3

def train_func_checkpoint():
    checkpoint = train.load_checkpoint()
    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    for i in range(checkpoint["epoch"], 5):
        train.save_checkpoint(epoch=i)
    return 1

Example #4

def train_func(config):
    itr = 0
    ckpt = train.load_checkpoint()
    if ckpt is not None:
        itr = ckpt["iter"] + 1

    for i in range(itr, config["max_iter"]):
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

Example #5

def train_func():
    checkpoint = train.load_checkpoint()
    if checkpoint:
        epoch = checkpoint["epoch"]
    else:
        epoch = 0
    print("Epoch: ", epoch)
    for i in range(epoch, 2):
        train.report(loss=1, iter=i)
        train.save_checkpoint(epoch=i + 1)

Example #6

def train_fn():
    model = torch.nn.Linear(1, 1)

    # Wrap in DDP.
    model = train.torch.prepare_model(model)

    # Save DDP-wrapped model.
    train.save_checkpoint(model=model)

    # Report DDP-wrapped model.
    train.report(model=model)
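
Note that this snippet deliberately checkpoints and reports the DDP-wrapped model object itself; the more conventional pattern is to save the unwrapped weights instead, as Examples #10 and #12 below do with model.module.state_dict() and consume_prefix_in_state_dict_if_present.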

Example #7

def on_save(self, args, state, control, **kwargs):
    # Save is called after evaluation.
    # NODE_IP_KEY and CHECKPOINT_PATH_ON_NODE_KEY are assumed to be constants
    # defined in the enclosing module.
    checkpoint_path = Path(
        transformers.trainer.get_last_checkpoint(
            args.output_dir)).absolute()
    if checkpoint_path:
        train.save_checkpoint(
            **{
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
            })

Example #8

def train_func():
    ckpt = train.load_checkpoint()
    restored = bool(ckpt)  # Does a previous checkpoint exist?
    itr = 0
    if ckpt:
        itr = ckpt["iter"] + 1

    for i in range(itr, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

Example #9

def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = train.get_dataset_shard("train")
    validation_dataset = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if train.world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        train.report(**result)
        results.append(result)
        train.save_checkpoint(model=model)

    return results
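
Example #9 reads its data through train.get_dataset_shard, so the driver has to supply named Ray Datasets. A rough sketch, assuming the legacy dataset argument to Trainer.run (the toy datasets are illustrative):

import ray
from ray.train import Trainer

train_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(128)])
val_ds = ray.data.from_items([{"x": float(i), "y": 2.0 * i} for i in range(32)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 32, "epochs": 3},
    dataset={"train": train_ds, "validation": val_ds},
)
trainer.shutdown()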

Example #10

def training_loop(config):
    # Create model.
    model = ResNet18(config)
    model.conv1 = nn.Conv2d(1,
                            64,
                            kernel_size=7,
                            stride=1,
                            padding=3,
                            bias=False)
    model = train.torch.prepare_model(model)

    # Create optimizer.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Load in training and validation data.
    train_dataset = load_mnist_data(True, True)
    validation_dataset = load_mnist_data(False, False)

    if config["test_mode"]:
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset,
                              batch_size=config["batch_size"],
                              num_workers=2)
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=config["batch_size"],
                                   num_workers=2)

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    for epoch_idx in range(2):
        train_epoch(train_loader, model, criterion, optimizer)
        validation_loss = validate_epoch(validation_loader, model, criterion)

        train.save_checkpoint(model_state_dict=model.module.state_dict())
        train.report(**validation_loss)

Example #11

def train_func(config):
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)
    use_cuda = config.get("use_cuda", False)
    save_model_as_dict = config.get("save_model_as_dict", False)

    model, optimizer, train_loader, train_sampler = setup(config)

    results = []
    for epoch in range(num_epochs):
        loss = train_epoch(model, optimizer, train_sampler, train_loader,
                           epoch, log_interval, use_cuda)
        results.append(loss)
    if save_model_as_dict:
        train.save_checkpoint(model=model.state_dict())
    else:
        train.save_checkpoint(model=model)
    print("losses of each epoch:")
    print(results)
    return results

Example #12

def train_func():
    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:

        # Setup model.
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)
        loss_fn = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Setup data.
        inputs = torch.randn(1000, 1)  # Avoid shadowing the `input` builtin.
        labels = inputs * 2
        dataset = torch.utils.data.TensorDataset(inputs, labels)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        dataloader = train.torch.prepare_data_loader(dataloader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in dataloader:
                    pred = model(X)
                    loss = loss_fn(pred, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            with record_function("train_checkpoint"):
                state_dict = model.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)

            p.step()

            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)

Example #13

def train_func():
    for i in range(10):
        train.report(test=i)
    train.save_checkpoint(hello="world")

Example #14

def train_func():
    train.save_checkpoint(epoch=0)

Example #15

def train_func():
    train.save_checkpoint(loss=3)  # best
    train.save_checkpoint(loss=7)  # worst, deleted
    train.save_checkpoint(loss=5)
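
The best/worst comments only make sense when checkpoint retention is capped and scored. A sketch of how that was configured with the legacy API, assuming ray.train.CheckpointStrategy:

from ray.train import CheckpointStrategy, Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(
    train_func,
    checkpoint_strategy=CheckpointStrategy(
        num_to_keep=2,                      # Keep only the two best checkpoints.
        checkpoint_score_attribute="loss",  # Rank checkpoints by their reported "loss".
        checkpoint_score_order="min",       # Lower loss is better.
    ),
)
trainer.shutdown()

With these settings the three checkpoints above score 3, 7, and 5, so the loss=7 checkpoint is the one evicted.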

Example #16

def train_func():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        time.sleep(1)

Example #17

def train_mismatch():
    train.save_checkpoint(epoch=0)
    train.report(index=0)
    # skip checkpoint
    train.report(index=1)

Example #18

def on_epoch_end(self, epoch, logs=None):
    train.save_checkpoint(**{"model": self.model.get_weights()})
    train.report(**logs)

Example #19

def train_func():
    checkpoint = train.load_checkpoint()
    train.report(**checkpoint)
    train.save_checkpoint(**checkpoint)
    # `key` is assumed to be defined in the enclosing scope of the original source.
    return checkpoint[key]

Example #20

def train_loop_per_worker(config):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    checkpoint = train.load_checkpoint()
    if checkpoint:
        model_state = checkpoint["model_state"]
        optimizer_state = checkpoint["optimizer_state"]
        epoch = checkpoint["epoch"]

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # Ensure consistent initialization across workers by broadcasting from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(trainset,
                             batch_size=int(config["batch_size"]),
                             shuffle=True,
                             num_workers=4)

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            train.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))

        train.save_checkpoint(
            model_state=net.state_dict(),
            optimizer_state=optimizer.state_dict(),
            epoch=epoch,
        )
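
Example #20 calls hvd.init() itself, which pairs with the Horovod backend of the legacy API. A hedged driver sketch; trainset_ref stands for a ray.put handle matching the ray.get(config["data"]) call above, and the config values are illustrative:

import ray
from ray.train import Trainer

trainset_ref = ray.put(trainset)  # `trainset` is the training dataset object.

trainer = Trainer(backend="horovod", num_workers=2, use_gpu=True)
trainer.start()
trainer.run(
    train_loop_per_worker,
    config={"lr": 0.1, "batch_size": 64, "data": trainset_ref},
)
trainer.shutdown()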

Example #21

def train_func():
    assert train.load_checkpoint() is None
    for i in range(3):
        train.save_checkpoint(epoch=i)
    return 1

Example #22

def train_slow():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        time.sleep(5)
        train.report(index=i)
        time.sleep(5)

Example #23

def train_func():
    if train.world_rank() == 0:
        train.save_checkpoint(epoch=0)
    else:
        train.report(iter=0)

Example #24

def train_func():
    model = torch.nn.Linear(1, 1).state_dict()
    train.save_checkpoint(model=model)

Example #25

def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{train.local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = train.get_dataset_shard("train_dataset")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = train.get_dataset_shard("test_dataset")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer,
            num_features)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Record and log stats.
        train.report(
            train_acc=train_acc,
            train_loss=train_running_loss,
            test_acc=test_acc,
            test_loss=test_running_loss,
        )

        # Checkpoint model.
        module = net.module if isinstance(net,
                                          DistributedDataParallel) else net
        train.save_checkpoint(model_state_dict=module.state_dict())

    if train.world_rank() == 0:
        return module.cpu()

Example #26

def train_func_checkpoint():
    train.save_checkpoint(loss=3)
    train.save_checkpoint(loss=7)

Example #27

def train_func():
    for i in range(3):
        train.save_checkpoint(model=i)

Example #28

def train_mismatch():
    train.save_checkpoint(epoch=0)

Example #29

def train_func():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        train.report(index=i)

Example #30

def train_func():
    # MODEL_KEY is assumed to be a constant imported in the enclosing module.
    model = build_model().get_weights()
    train.save_checkpoint(**{MODEL_KEY: model})