Example 1
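These snippets are excerpts and omit their imports. Below is a minimal sketch of the shared imports they appear to assume; the `sgd` import path is an assumption (in the Ray versions that shipped SGD v2, a module exposing `report`, `save_checkpoint`, `load_checkpoint`, and rank helpers):

from typing import Dict

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Assumption: `sgd` is Ray SGD v2's utility module, e.g.
# from ray.util.sgd import v2 as sgd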
def train_func(config: Dict):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Create data loaders.
    train_dataloader = DataLoader(training_data,
                                  batch_size=batch_size,
                                  sampler=DistributedSampler(training_data))
    test_dataloader = DataLoader(test_data,
                                 batch_size=batch_size,
                                 sampler=DistributedSampler(test_data))

    # Create model.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = NeuralNetwork()
    model = model.to(device)
    model = DistributedDataParallel(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    loss_results = []

    for _ in range(epochs):
        train(train_dataloader, model, loss_fn, optimizer, device)
        loss = validate(test_dataloader, model, loss_fn, device)
        sgd.report(loss=loss)
        loss_results.append(loss)

    return loss_results
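Example 1 calls `train` and `validate` helpers that the excerpt does not show. A minimal sketch of plausible implementations (hypothetical; standard PyTorch epoch loops):

def train(dataloader, model, loss_fn, optimizer, device):
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validate(dataloader, model, loss_fn, device):
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            total_loss += loss_fn(model(X), y).item()
            num_batches += 1
    return total_loss / max(num_batches, 1)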
Example 2
def train_func(config):
    data_size = config.get("data_size", 1000)
    val_size = config.get("val_size", 400)
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset = LinearDataset(2, 5, size=data_size)
    val_dataset = LinearDataset(2, 5, size=val_size)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(train_dataset))
    validation_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(val_dataset))

    model = nn.Linear(1, hidden_size)
    model = DistributedDataParallel(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train(train_loader, model, loss_fn, optimizer)
        result = validate(validation_loader, model, loss_fn)
        sgd.report(**result)
        results.append(result)

    return results
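Example 2 builds a `LinearDataset(2, 5, size=...)` that is not shown; from the arguments it plausibly generates points on the line y = 2x + 5. A hypothetical sketch:

class LinearDataset(torch.utils.data.Dataset):
    """Points (x, y) on the line y = a * x + b."""

    def __init__(self, a, b, size=1000):
        self.x = torch.arange(0, 10, 10 / size,
                              dtype=torch.float).reshape(-1, 1)
        self.y = a * self.x + b

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)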
Example 3
def train_func(config):
    itr = 0
    ckpt = sgd.load_checkpoint()
    if ckpt is not None:
        itr = ckpt["iter"] + 1

    for i in range(itr, config["max_iter"]):
        sgd.save_checkpoint(iter=i)
        sgd.report(test=i, training_iteration=i)
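Example 3 writes a checkpoint every iteration and resumes from `sgd.load_checkpoint()`. A hedged sketch of driving it with the SGD v2 `Trainer`, resuming a second run from the first run's latest checkpoint; the `checkpoint` argument and `latest_checkpoint` attribute are assumptions based on the SGD v2 API of that era:

from ray.util.sgd.v2 import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(train_func, config={"max_iter": 5})
# Resume from the most recent checkpoint of the previous run.
trainer.run(train_func, config={"max_iter": 10},
            checkpoint=trainer.latest_checkpoint)
trainer.shutdown()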
Example 4
def train():
    checkpoint = sgd.load_checkpoint()
    if checkpoint:
        epoch = checkpoint["epoch"]
    else:
        epoch = 0
    print("Epoch: ", epoch)
    for i in range(epoch, 2):
        sgd.report(loss=1, iter=i)
        sgd.save_checkpoint(epoch=i + 1)
Example 5
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = sgd.get_dataset_shard("train")
    validation_dataset_pipeline_shard = sgd.get_dataset_shard("validation")

    device = torch.device(
        f"cuda:{sgd.local_rank()}" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(device)

    model = nn.Linear(1, hidden_size)
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[sgd.local_rank()] if torch.cuda.is_available() else None)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    train_dataset_iterator = train_dataset_pipeline_shard.iter_datasets()
    validation_dataset_iterator = \
        validation_dataset_pipeline_shard.iter_datasets()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size)

        train(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate(validation_torch_dataset, model, loss_fn, device)
        sgd.report(**result)
        results.append(result)

    return results
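Example 5 consumes dataset pipeline shards via `sgd.get_dataset_shard`. A hedged sketch of building such pipelines with Ray Datasets and handing them to the trainer; the `dataset` keyword on `Trainer.run` is an assumption based on the SGD v2 docs of that era:

import ray
from ray.util.sgd.v2 import Trainer

# Repeating pipelines: one window of data per training epoch.
train_pipeline = ray.data.from_items(
    [{"x": i / 100, "y": 2 * i / 100 + 5} for i in range(1000)]).repeat()
validation_pipeline = ray.data.from_items(
    [{"x": i / 100, "y": 2 * i / 100 + 5} for i in range(400)]).repeat()

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"epochs": 3, "batch_size": 32},
    dataset={"train": train_pipeline, "validation": validation_pipeline})
trainer.shutdown()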
Example 6
def train_func():
    ckpt = sgd.load_checkpoint()
    restored = bool(ckpt)  # Does a previous checkpoint exist?
    itr = 0
    if ckpt:
        itr = ckpt["iter"] + 1

    for i in range(itr, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        sgd.save_checkpoint(iter=i)
        sgd.report(test=i, training_iteration=i)
Example 7
def train_func():
    for i in range(2):
        sgd.report(loss=1, iter=i)
Example 8
def train():
    for _ in range(2):
        sgd.report(loss=1)
    return 1
Example 9
def train_actor_failure():
    for _ in range(2):
        sgd.report(loss=1)
    # Simulate an abrupt worker (actor) exit.
    import sys
    sys.exit(0)
Example 10
def train_mismatch():
    sgd.save_checkpoint(epoch=0)
    sgd.report(index=0)
    # Deliberately skip the checkpoint for this second report.
    sgd.report(index=1)
Example 11
def fail_train_2():
    for _ in range(2):
        sgd.report(loss=1)
    raise NotImplementedError
Example 12
def train_func():
    for i in range(3):
        sgd.report(index=i)
    return 1
Example 13
def train():
    for i in range(2):
        sgd.save_checkpoint(epoch=i)
        sgd.report(index=i)
Example 14
def train_func():
    # num_iters is expected to be defined in the enclosing scope.
    for i in range(num_iters):
        sgd.report(index=i)
    return 1
Example 15
import time

def train_slow():
    sgd.report(index=0)
    time.sleep(5)
    sgd.report(index=1)
Example 16
def train_func():
    for _ in range(2):
        sgd.report(loss=1)
Example 17
def train_func(config):
    device = torch.device(
        f"cuda:{sgd.local_rank()}" if torch.cuda.is_available() else "cpu")

    epochs = config.pop("epochs", 3)
    model = ResNet18(config)
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[device.index] if torch.cuda.is_available() else None)

    # Create optimizer.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.get("lr", 0.1),
                                momentum=config.get("momentum", 0.9))

    # Load in training and validation data.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])  # meanstd transformation

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(root="~/data",
                                train=True,
                                download=True,
                                transform=transform_train)
        validation_dataset = CIFAR10(root="~/data",
                                     train=False,
                                     download=False,
                                     transform=transform_test)

    if config.get("test_mode"):
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset,
                              batch_size=config["batch_size"],
                              sampler=DistributedSampler(train_dataset))
    validation_loader = DataLoader(
        validation_dataset,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(validation_dataset))

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    results = []

    for _ in range(epochs):
        train(train_loader, model, criterion, optimizer, device)
        result = validate(validation_loader, model, criterion, device)
        sgd.report(**result)
        results.append(result)

    return results
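Example 17 additionally assumes `from filelock import FileLock`, `from torch.utils.data import Subset`, and torchvision's `CIFAR10` and `transforms`. A hedged sketch of launching it across GPU workers with the SGD v2 `Trainer`; the import path and `use_gpu` flag are assumptions based on the SGD v2 API of that era:

from ray.util.sgd.v2 import Trainer

trainer = Trainer(backend="torch", num_workers=2, use_gpu=True)
trainer.start()
results = trainer.run(train_func, config={"lr": 0.1, "batch_size": 128})
trainer.shutdown()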
Example 18
def train_func():
    for i in range(10):
        sgd.report(test=i)
    sgd.save_checkpoint(hello="world")
Example 19
def train():
    if sgd.world_rank() == 0:
        sgd.save_checkpoint(epoch=0)
    else:
        sgd.report(iter=0)
Example 20
def on_epoch_end(self, epoch, logs=None):
    sgd.report(**logs)
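Example 20 shows only an `on_epoch_end` hook; in Keras it would live on a `Callback` subclass. A minimal sketch (the class name is hypothetical; this version also guards against `logs` being `None`):

from tensorflow import keras

class SGDReportCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # Forward the epoch's Keras metrics (loss, accuracy, ...) to sgd.report.
        sgd.report(**(logs or {}))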
Example 21
def train_func():
    sgd.report(episode_reward_mean=4)
    sgd.report(episode_reward_mean=5)
    sgd.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
    return 1
Example 22
import time

def train_slow():
    for i in range(2):
        sgd.save_checkpoint(epoch=i)
        time.sleep(5)
        sgd.report(index=i)
        time.sleep(5)
Example 23
def train_mismatch():
    sgd.report(loss=1)
Example 24
def train():
    for i in range(2):
        sgd.report(index=i)