def train_func(config: Dict):
    """Per-worker training loop.

    Builds distributed data loaders, wraps the model in DDP, then
    alternates train/validate for the configured number of epochs,
    reporting the validation loss after each epoch.

    Args:
        config: Expects "batch_size", "lr", and "epochs" keys.

    Returns:
        List of per-epoch validation losses.
    """
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Data loaders use DistributedSampler so each worker sees its own shard.
    train_dataloader = DataLoader(
        training_data,
        batch_size=batch_size,
        sampler=DistributedSampler(training_data))
    test_dataloader = DataLoader(
        test_data,
        batch_size=batch_size,
        sampler=DistributedSampler(test_data))

    # Move the model to the accelerator (if any) before DDP wrapping.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = DistributedDataParallel(NeuralNetwork().to(device))

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    loss_results = []
    for _ in range(epochs):
        train(train_dataloader, model, loss_fn, optimizer, device)
        loss = validate(test_dataloader, model, loss_fn, device)
        sgd.report(loss=loss)
        loss_results.append(loss)
    return loss_results
def train_func(config):
    """Train a small linear model on synthetic data.

    Reports validation metrics once per epoch via sgd.report and returns
    the list of per-epoch result dicts. All hyperparameters come from
    ``config`` with defaults.
    """
    data_size = config.get("data_size", 1000)
    val_size = config.get("val_size", 400)
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    # Synthetic linear datasets (args presumably slope/intercept — defined
    # elsewhere in this file), sharded per worker via DistributedSampler.
    train_dataset = LinearDataset(2, 5, size=data_size)
    val_dataset = LinearDataset(2, 5, size=val_size)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(train_dataset))
    validation_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(val_dataset))

    model = DistributedDataParallel(nn.Linear(1, hidden_size))
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train(train_loader, model, loss_fn, optimizer)
        result = validate(validation_loader, model, loss_fn)
        sgd.report(**result)
        results.append(result)
    return results
def train_func(config):
    """Resumable counter loop.

    Picks up one iteration past the last saved checkpoint (if any) and
    runs until ``config["max_iter"]``, checkpointing then reporting on
    every iteration.
    """
    start = 0
    ckpt = sgd.load_checkpoint()
    if ckpt is not None:
        # Resume one past the last completed iteration.
        start = ckpt["iter"] + 1
    for i in range(start, config["max_iter"]):
        sgd.save_checkpoint(iter=i)
        sgd.report(test=i, training_iteration=i)
def train():
    """Two-iteration loop that resumes from a saved epoch checkpoint."""
    checkpoint = sgd.load_checkpoint()
    # A falsy checkpoint (e.g. None) means a fresh start at epoch 0.
    epoch = checkpoint["epoch"] if checkpoint else 0
    print("Epoch: ", epoch)
    for i in range(epoch, 2):
        sgd.report(loss=1, iter=i)
        sgd.save_checkpoint(epoch=i + 1)
def train_func(config):
    """Train on windowed Ray dataset-pipeline shards.

    Each epoch consumes the next dataset window from the per-worker
    "train" and "validation" shards, converts it to a torch iterable,
    and runs one train/validate pass, reporting each result.
    """
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_shard = sgd.get_dataset_shard("train")
    validation_shard = sgd.get_dataset_shard("validation")

    # Pin this worker to its local-rank GPU when CUDA is available.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{sgd.local_rank()}")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")

    model = nn.Linear(1, hidden_size).to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[sgd.local_rank()] if torch.cuda.is_available() else None)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    train_windows = train_shard.iter_datasets()
    validation_windows = validation_shard.iter_datasets()
    for _ in range(epochs):
        # One pipeline window per epoch from each shard.
        train_dataset = next(train_windows)
        validation_dataset = next(validation_windows)
        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size)

        train(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate(validation_torch_dataset, model, loss_fn, device)
        sgd.report(**result)
        results.append(result)
    return results
def train_func():
    """Fault-injection loop.

    Raises once at iteration 2 on a fresh run; after restore from the
    saved checkpoint the failure is skipped so the retry completes.
    """
    ckpt = sgd.load_checkpoint()
    restored = bool(ckpt)  # Did this run come back from a checkpoint?
    start = ckpt["iter"] + 1 if ckpt else 0
    for i in range(start, 4):
        # Only fail on the fresh run so the restarted run can finish.
        if i == 2 and not restored:
            raise Exception("try to fail me")
        sgd.save_checkpoint(iter=i)
        sgd.report(test=i, training_iteration=i)
def train_func():
    """Report two results with a constant loss and the iteration index."""
    for step in range(2):
        sgd.report(loss=1, iter=step)
def train():
    """Report a constant loss twice, then return 1."""
    for _ in range(2):
        sgd.report(loss=1)
    return 1
def train_actor_failure():
    """Report twice, then kill the worker process to simulate actor death."""
    import sys

    for _ in range(2):
        sgd.report(loss=1)
    # Exit the actor process outright (exit code 0).
    sys.exit(0)
def train_mismatch():
    """Deliberately report more times than checkpoints are saved."""
    sgd.save_checkpoint(epoch=0)
    sgd.report(index=0)
    # Second report intentionally has no matching checkpoint.
    sgd.report(index=1)
def fail_train_2():
    """Report twice, then raise to simulate a training failure."""
    for _ in range(2):
        sgd.report(loss=1)
    raise NotImplementedError
def train_func():
    """Report three indexed results, then return 1."""
    for step in range(3):
        sgd.report(index=step)
    return 1
def train():
    """Save a checkpoint then report, once per iteration, twice."""
    for epoch in range(2):
        sgd.save_checkpoint(epoch=epoch)
        sgd.report(index=epoch)
def train_func():
    """Report an indexed result ``num_iters`` times, then return 1.

    ``num_iters`` is presumably a module-level value defined elsewhere
    in this file — not visible here.
    """
    for step in range(num_iters):
        sgd.report(index=step)
    return 1
def train_slow():
    """Report two results with a deliberate 5-second pause between them."""
    sgd.report(index=0)
    time.sleep(5)
    sgd.report(index=1)
def train_func():
    """Report a constant loss twice."""
    for _ in range(2):
        sgd.report(loss=1)
def train_func(config):
    """Per-worker ResNet18/CIFAR10 training loop.

    Builds a DDP-wrapped ResNet18, standard CIFAR10 transforms, and
    distributed data loaders, then runs train/validate for the configured
    number of epochs, reporting each validation result.

    Returns:
        List of per-epoch validation result dicts.
    """
    # One device per worker, keyed by local rank when CUDA is present.
    device = torch.device(
        f"cuda:{sgd.local_rank()}" if torch.cuda.is_available() else "cpu")

    # NOTE: pop() mutates the caller's config dict.
    epochs = config.pop("epochs", 3)

    model = ResNet18(config).to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[device.index] if torch.cuda.is_available() else None)

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9))

    # CIFAR10 mean/std normalization; random crops + flips train-side only.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    # Serialize the dataset download across workers on the same node.
    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(
            root="~/data",
            train=True,
            download=True,
            transform=transform_train)
        validation_dataset = CIFAR10(
            root="~/data",
            train=False,
            download=False,
            transform=transform_test)

    if config.get("test_mode"):
        # Tiny fixed subsets keep smoke tests fast.
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(train_dataset))
    validation_loader = DataLoader(
        validation_dataset,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(validation_dataset))

    criterion = nn.CrossEntropyLoss()

    results = []
    for _ in range(epochs):
        train(train_loader, model, criterion, optimizer, device)
        result = validate(validation_loader, model, criterion, device)
        sgd.report(**result)
        results.append(result)
    return results
def train_func():
    """Report ten results, checkpointing after each report."""
    for step in range(10):
        sgd.report(test=step)
        sgd.save_checkpoint(hello="world")
def train():
    """Rank 0 saves a checkpoint while every other rank reports instead."""
    if sgd.world_rank() == 0:
        sgd.save_checkpoint(epoch=0)
    else:
        sgd.report(iter=0)
def on_epoch_end(self, epoch, logs=None):
    """Forward the epoch-end metrics dict to sgd.report as keyword args."""
    metrics = logs
    sgd.report(**metrics)
def train_func():
    """Report three results with increasing reward, then return 1.

    The final report also carries a list and a dict payload.
    """
    sgd.report(episode_reward_mean=4)
    sgd.report(episode_reward_mean=5)
    sgd.report(
        episode_reward_mean=6,
        score=[1, 2, 3],
        hello={"world": 1})
    return 1
def train_slow():
    """Checkpoint and report twice, sleeping 5 seconds around each report."""
    for epoch in range(2):
        sgd.save_checkpoint(epoch=epoch)
        time.sleep(5)
        sgd.report(index=epoch)
        time.sleep(5)
def train_mismatch():
    """Report a single loss with no matching checkpoint."""
    sgd.report(loss=1)
def train():
    """Report an indexed result twice."""
    for step in range(2):
        sgd.report(index=step)