def train_mnist(config):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = ConvNet().to(device)
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"]
    )
    while True:
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)
        # Report the metric to Tune.
        session.report({"mean_accuracy": acc})
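# A minimal sketch of how `train_mnist` might be launched with Ray Tune
# (Ray 2.x AIR APIs). The search space, sample count, and stopping criterion
# below are illustrative assumptions, not part of the original example.
# Because the function loops forever, a `stop` condition is needed.
from ray import tune
from ray.air import RunConfig

tuner = tune.Tuner(
    train_mnist,
    param_space={
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    },
    tune_config=tune.TuneConfig(metric="mean_accuracy", mode="max", num_samples=4),
    run_config=RunConfig(stop={"training_iteration": 10}),
)
results = tuner.fit()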
def train_func(config):
    step = 0
    width, height = config["width"], config["height"]

    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        step = loaded_checkpoint.to_dict()["step"] + 1

    for step in range(step, 100):
        intermediate_score = evaluation_fn(step, width, height)
        checkpoint = Checkpoint.from_dict({"step": step})
        session.report(
            {"iterations": step, "mean_loss": intermediate_score},
            checkpoint=checkpoint,
        )
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = session.get_dataset_shard("train")
    validation_dataset = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if session.get_world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn, device)
        else:
            result = {}
        results.append(result)
        session.report(result, checkpoint=Checkpoint.from_dict(dict(model=model)))

    return results
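# A minimal sketch of running the training function above with a TorchTrainer
# (Ray 2.x AIR APIs). The toy linear dataset and worker count are illustrative
# assumptions; the "train" and "validation" keys match the
# `session.get_dataset_shard(...)` calls inside the loop.
import ray
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

train_ds = ray.data.from_items([{"x": x / 100, "y": 2 * x / 100} for x in range(100)])
val_ds = ray.data.from_items([{"x": x / 100, "y": 2 * x / 100} for x in range(100)])

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"batch_size": 32, "epochs": 3},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds, "validation": val_ds},
)
result = trainer.fit()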
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = session.get_dataset_shard("train")
    validation_dataset_pipeline_shard = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs()
    validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn, device)

        session.report(result)
def train_convnet(config):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a
    # checkpoint. Load the model state and iteration step from the checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # We need to create a directory under the current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")
        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
def train(config, checkpoint_dir=None):
    step = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        # Checkpoint the state of the training every 3 steps.
        # Note that this is only required for certain schedulers.
        checkpoint = None
        if timestep % 3 == 0:
            checkpoint = Checkpoint.from_dict({"timestep": timestep})

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        session.report({"episode_reward_mean": v}, checkpoint=checkpoint)
def train_loop_per_worker(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)

    # To ensure consistent initialization across workers, broadcast the
    # initial model parameters and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()

        optimizer.step()
        time.sleep(0.1)
        session.report(dict(loss=loss.item()))

    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
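# A minimal sketch of launching the Horovod loop above with Ray Train's
# HorovodTrainer (Ray 2.x). The config values are illustrative assumptions;
# `Net`, `sq`, and `qu` must be defined as in the surrounding example.
from ray.train.horovod import HorovodTrainer
from ray.air.config import ScalingConfig

trainer = HorovodTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"mode": "square", "lr": 1e-3, "x_max": 1.0},
    scaling_config=ScalingConfig(num_workers=2),
)
result = trainer.fit()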
def train_fn(config):
    start_epoch = 0
    print(session.get_trial_resources())

    checkpoint = session.get_checkpoint()
    if checkpoint:
        # Assume that we have run the session.report() example
        # and successfully saved some model weights.
        checkpoint_dict = checkpoint.to_dict()
        start_epoch = checkpoint_dict.get("epoch", -1) + 1

    # Wrap the model in DDP.
    for epoch in range(start_epoch, config["num_epochs"]):
        checkpoint = Checkpoint.from_dict(dict(epoch=epoch))
        session.report(
            {
                "metric": config["metric"] * epoch,
                "epoch": epoch,
                "num_cpus": session.get_trial_resources().required_resources["CPU"],
            },
            checkpoint=checkpoint,
        )
def train_func(config):
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)
    use_cuda = config.get("use_cuda", False)
    save_model_as_dict = config.get("save_model_as_dict", False)

    model, optimizer, train_loader, train_sampler = setup(config)

    results = []
    for epoch in range(num_epochs):
        loss = train_epoch(
            model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda
        )
        if save_model_as_dict:
            checkpoint_dict = dict(model=model.state_dict())
        else:
            checkpoint_dict = dict(model=model)
        checkpoint = Checkpoint.from_dict(checkpoint_dict)
        results.append(loss)
        session.report(dict(loss=loss), checkpoint=checkpoint)

    # Only used for testing.
    return results
def train_loop_per_worker(config):
    raw_model = resnet18(pretrained=True)
    model = train.torch.prepare_model(raw_model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = session.get_dataset_shard("train")

    for epoch in range(config["num_epochs"]):
        running_loss = 0.0
        for i, data in enumerate(
            train_dataset_shard.iter_batches(
                batch_size=config["batch_size"], batch_format="numpy"
            )
        ):
            # Get the inputs: `data` is a dict of NumPy arrays keyed by column.
            inputs = torch.as_tensor(data["image"], dtype=torch.float32).to(
                device="cuda"
            )
            labels = torch.as_tensor(data["label"], dtype=torch.int64).to(
                device="cuda"
            )

            # Zero the parameter gradients.
            optimizer.zero_grad()

            # Forward + backward + optimize.
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print statistics.
            running_loss += loss.item()
            if i % 2000 == 1999:  # Print every 2000 mini-batches.
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                running_loss = 0.0

        session.report(
            dict(running_loss=running_loss),
            checkpoint=TorchCheckpoint.from_model(model),
        )
def train_loop_per_worker(config):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["num_epochs"]
    num_features = config["num_features"]

    # Get the Ray Dataset shard for this data parallel worker,
    # and convert it to an iterator of PyTorch tensor batches.
    train_data = session.get_dataset_shard("train")

    def to_tensor_iterator(dataset, batch_size):
        data_iterator = dataset.iter_batches(
            batch_format="numpy", batch_size=batch_size
        )
        for d in data_iterator:
            # "concat_out" is the output column of the Concatenator.
            yield (
                torch.Tensor(d["concat_out"]).float(),
                torch.Tensor(d["target"]).float(),
            )

    # Create the model.
    model = create_model(num_features)
    model = train.torch.prepare_model(model)

    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for cur_epoch in range(epochs):
        for inputs, labels in to_tensor_iterator(train_data, batch_size):
            optimizer.zero_grad()
            predictions = model(inputs)
            train_loss = loss_fn(predictions, labels.unsqueeze(1))
            train_loss.backward()
            optimizer.step()
        loss = train_loss.item()
        session.report({"loss": loss}, checkpoint=TorchCheckpoint.from_model(model))
def train_fn(config, checkpoint_dir=None):
    # Some Modin operations would go here, e.g.:
    # import modin.pandas as pd
    metric = 1  # Placeholder: compute a real metric from the Modin results.
    session.report({"metric": metric})
def dcgan_train(config):
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    beta1 = 0.5  # Adam beta1; the standard DCGAN value (defined as a constant in the full example).
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999)
    )
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999)
    )
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            netD.load_state_dict(checkpoint["netDmodel"])
            netG.load_state_dict(checkpoint["netGmodel"])
            optimizerD.load_state_dict(checkpoint["optimD"])
            optimizerG.load_state_dict(checkpoint["optimG"])
            step = checkpoint["step"]

        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(
            netD,
            netG,
            optimizerG,
            optimizerD,
            criterion,
            dataloader,
            step,
            device,
            config["mnist_model_ref"],
        )
        step += 1
        os.makedirs("my_model", exist_ok=True)
        torch.save(
            {
                "netDmodel": netD.state_dict(),
                "netGmodel": netG.state_dict(),
                "optimD": optimizerD.state_dict(),
                "optimG": optimizerG.state_dict(),
                "step": step,
            },
            "my_model/checkpoint.pt",
        )
        session.report(
            {"lossg": lossG, "lossd": lossD, "is_score": is_score},
            checkpoint=Checkpoint.from_directory("my_model"),
        )
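# The `netD_lr`/`netG_lr` handling above is designed for Population Based
# Training, which perturbs those keys and restarts trials from checkpoints.
# A minimal sketch of such a setup (Ray 2.x; the mutation ranges, sample
# count, and stopping criterion are illustrative assumptions, and
# `mnist_model_ref` must be supplied as in the surrounding example):
from ray import tune
from ray.air import RunConfig
from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=5,
    hyperparam_mutations={
        "netG_lr": tune.uniform(1e-5, 1e-2),
        "netD_lr": tune.uniform(1e-5, 1e-2),
    },
)
tuner = tune.Tuner(
    dcgan_train,
    param_space={"lr": 2e-4, "mnist_model_ref": mnist_model_ref},
    tune_config=tune.TuneConfig(
        metric="is_score", mode="max", scheduler=scheduler, num_samples=4
    ),
    run_config=RunConfig(stop={"training_iteration": 20}),
)
results = tuner.fit()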
def train_loop_per_worker(train_loop_config):
    dataset = train_loop_config["dataset_fn"]()
    batch_size = train_loop_config["batch_size"]
    num_epochs = train_loop_config["num_epochs"]

    data = dataset[0]
    train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
    train_idx = train_idx.split(train_idx.size(0) // session.get_world_size())[
        session.get_world_rank()
    ]

    train_loader = NeighborSampler(
        data.edge_index,
        node_idx=train_idx,
        sizes=[25, 10],
        batch_size=batch_size,
        shuffle=True,
    )

    # Disable the distributed sampler since the train_loader has already been
    # split above.
    train_loader = train.torch.prepare_data_loader(train_loader, add_dist_sampler=False)

    # Do validation on the rank 0 worker only.
    if session.get_world_rank() == 0:
        subgraph_loader = NeighborSampler(
            data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False
        )
        subgraph_loader = train.torch.prepare_data_loader(
            subgraph_loader, add_dist_sampler=False
        )

    model = SAGE(dataset.num_features, 256, dataset.num_classes)
    model = train.torch.prepare_model(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    x, y = data.x.to(train.torch.get_device()), data.y.to(train.torch.get_device())

    for epoch in range(num_epochs):
        model.train()

        # ``batch_size`` is the number of samples in the current batch.
        # ``n_id`` are the ids of all the nodes used in the computation. This is
        # needed to pull in the necessary features just for the current batch
        # being trained on.
        # ``adjs`` is a list of 3-element tuples ``(edge_index, e_id, size)``,
        # one per sampled layer, where ``edge_index`` represents the edges of
        # the sampled subgraph, ``e_id`` are the ids of the edges in the sample,
        # and ``size`` holds the shape of the subgraph.
        # See ``torch_geometric.loader.neighbor_sampler.NeighborSampler`` for more info.
        for batch_size, n_id, adjs in train_loader:
            optimizer.zero_grad()
            out = model(x[n_id], adjs)
            loss = F.nll_loss(out, y[n_id[:batch_size]])
            loss.backward()
            optimizer.step()

        if session.get_world_rank() == 0:
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")

        train_accuracy = validation_accuracy = test_accuracy = None

        # Do validation on the rank 0 worker only.
        if session.get_world_rank() == 0:
            model.eval()
            with torch.no_grad():
                out = model.module.test(x, subgraph_loader)
            res = out.argmax(dim=-1) == data.y
            train_accuracy = int(res[data.train_mask].sum()) / int(data.train_mask.sum())
            validation_accuracy = int(res[data.val_mask].sum()) / int(data.val_mask.sum())
            test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum())

        session.report(
            dict(
                train_accuracy=train_accuracy,
                validation_accuracy=validation_accuracy,
                test_accuracy=test_accuracy,
            )
        )
def function_trainable_dict(config):
    session.report(
        {"metric": 2}, checkpoint=Checkpoint.from_dict({"checkpoint_data": 3})
    )
def train_func(config):
    session.report({"loss": config["x"]})
def train_func():
    for i in range(3):
        session.report({"epoch": i}, checkpoint=Checkpoint.from_dict({"model": i}))
def train_function(config, checkpoint_dir=None):
    for i in range(30):
        loss = config["mean"] + config["sd"] * np.random.randn()
        session.report({"loss": loss})
def function_trainable_directory(config):
    tmpdir = tempfile.mkdtemp("checkpoint_test")
    with open(os.path.join(tmpdir, "data.json"), "w") as f:
        json.dump({"checkpoint_data": 5}, f)
    session.report({"metric": 4}, checkpoint=Checkpoint.from_directory(tmpdir))
def decorated_train_function(config, checkpoint_dir=None):
    for i in range(30):
        loss = config["mean"] + config["sd"] * np.random.randn()
        session.report({"loss": loss})
        wandb.log(dict(loss=loss))
def train_loop_per_worker():
    import pandas as pd

    # `num_epochs`, `prefetch_blocks`, and `batch_size` are captured from the
    # enclosing scope.
    rank = session.get_world_rank()
    data_shard = session.get_dataset_shard("train")
    start = time.perf_counter()
    epochs_read, batches_read, bytes_read = 0, 0, 0
    batch_delays = []

    def generate_epochs(data: Union[Dataset, DatasetPipeline], epochs: int):
        if isinstance(data, DatasetPipeline):
            for epoch in data_shard.iter_epochs(epochs):
                yield epoch
        else:
            # Dataset
            for _ in range(epochs):
                yield data

    print("Starting train loop on worker", rank)
    for epoch_data in generate_epochs(data_shard, num_epochs):
        epochs_read += 1
        batch_start = time.perf_counter()
        for batch in epoch_data.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            batches_read += 1
            if isinstance(batch, pd.DataFrame):
                bytes_read += int(batch.memory_usage(index=True, deep=True).sum())
            elif isinstance(batch, np.ndarray):
                bytes_read += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if given a list of non-primitive types.
                bytes_read += sys.getsizeof(batch)
            session.report(
                dict(
                    bytes_read=bytes_read,
                    batches_read=batches_read,
                    epochs_read=epochs_read,
                    batch_delay=batch_delay,
                )
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", epochs_read)
    print("Num batches read", batches_read)
    print("Num bytes read", round(bytes_read / (1024 * 1024), 2), "MiB")
    print("Mean throughput", round(bytes_read / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def train_func():
    model = build_model().get_weights()
    session.report({}, checkpoint=Checkpoint.from_dict({MODEL_KEY: model}))
def train_func(use_ray: bool, config: Dict):
    if use_ray:
        from ray.air import session
        import ray.train as train

    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]
    shuffle = config.get("shuffle", False)

    if use_ray:
        world_size = session.get_world_size()
        local_rank = distributed.get_rank()
    else:
        world_size = distributed.get_world_size()
        local_rank = distributed.get_rank()

    worker_batch_size = batch_size // world_size

    # Load datasets. Use download=False to catch errors in preparation, as the
    # data should have already been downloaded.
    training_data = datasets.FashionMNIST(
        root="/tmp/data_fashion_mnist",
        train=True,
        download=False,
        transform=ToTensor(),
    )

    test_data = datasets.FashionMNIST(
        root="/tmp/data_fashion_mnist",
        train=False,
        download=False,
        transform=ToTensor(),
    )

    if use_ray:
        # Ray adds a DistributedSampler in train.torch.prepare_data_loader below.
        training_sampler = None
        test_sampler = None
    else:
        # In vanilla PyTorch we create the distributed sampler here.
        training_sampler = DistributedSampler(training_data, shuffle=shuffle)
        test_sampler = DistributedSampler(test_data, shuffle=shuffle)

    if not use_ray and config.get("use_gpu", False):
        assert torch.cuda.is_available(), "No GPUs available"
        gpu_id = config.get("gpu_id", 0)
        vanilla_device = torch.device(f"cuda:{gpu_id}")
        torch.cuda.set_device(vanilla_device)

        print(
            "Setting GPU ID to",
            gpu_id,
            "with visible devices",
            os.environ.get("CUDA_VISIBLE_DEVICES"),
        )

        def collate_fn(x):
            return tuple(x_.to(vanilla_device) for x_ in default_collate(x))

    else:
        vanilla_device = torch.device("cpu")
        collate_fn = None

    # Create data loaders and potentially pass the distributed sampler.
    train_dataloader = DataLoader(
        training_data,
        shuffle=shuffle,
        batch_size=worker_batch_size,
        sampler=training_sampler,
        collate_fn=collate_fn,
    )

    test_dataloader = DataLoader(
        test_data,
        shuffle=shuffle,
        batch_size=worker_batch_size,
        sampler=test_sampler,
        collate_fn=collate_fn,
    )

    if use_ray:
        # In Ray, we now retrofit the DistributedSampler.
        train_dataloader = train.torch.prepare_data_loader(train_dataloader)
        test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create the model.
    model = NeuralNetwork()

    # Prepare the model.
    if use_ray:
        model = train.torch.prepare_model(model)
    else:
        model = model.to(vanilla_device)
        if config.get("use_gpu", False):
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[gpu_id], output_device=gpu_id
            )
        else:
            model = nn.parallel.DistributedDataParallel(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(
            train_dataloader,
            model,
            loss_fn,
            optimizer,
            world_size=world_size,
            local_rank=local_rank,
        )
        loss = validate_epoch(
            test_dataloader,
            model,
            loss_fn,
            world_size=world_size,
            local_rank=local_rank,
        )
        if use_ray:
            session.report(dict(loss=loss))
        else:
            print(f"Reporting loss: {loss:.4f}")
            if local_rank == 0:
                with open(VANILLA_RESULT_JSON, "w") as f:
                    json.dump({"loss": loss}, f)
def train_func():
    for i in range(9):
        session.report(dict(test=i))
    session.report(
        dict(test=i + 1), checkpoint=Checkpoint.from_dict(dict(hello="world"))
    )
def train(config):
    random_result = np.random.uniform(0, 100, size=1).item()
    session.report({"result": random_result})
def train_loop(config):
    session.report({"loss": 1})
    return FailOnUnpickle()
def train_func():
    # `key` is assumed to be captured from the enclosing scope.
    checkpoint = session.get_checkpoint()
    session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
    return checkpoint.to_dict()[key]
def train_func(): session.report({"loss": 1})
def train_cifar(config):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    # Load an existing checkpoint through the `session.get_checkpoint()` API.
    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )

    for epoch in range(10):  # Loop over the dataset multiple times.
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # Get the inputs; data is a list of [inputs, labels].
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients.
            optimizer.zero_grad()

            # Forward + backward + optimize.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print statistics.
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # Print every 2000 mini-batches.
                print(
                    "[%d, %5d] loss: %.3f"
                    % (epoch + 1, i + 1, running_loss / epoch_steps)
                )
                running_loss = 0.0

        # Validation loss.
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # in future iterations.
        # Note that to save a file-based checkpoint, you still need to put it
        # under a directory to construct an AIR checkpoint from.
        os.makedirs("my_model", exist_ok=True)  # OK to overwrite the previous one.
        path = os.path.join("my_model", "checkpoint.pt")
        torch.save((net.state_dict(), optimizer.state_dict()), path)
        checkpoint = Checkpoint.from_directory("my_model")
        session.report(
            {"loss": (val_loss / val_steps), "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")
def train_func():
    assert ray.available_resources()["CPU"] == 1
    session.report({"loss": 1})