def train_func(config: Dict):
    """Per-worker training loop: train NeuralNetwork for the configured
    number of epochs, reporting the validation loss after each one.

    Returns:
        List of per-epoch validation losses.
    """
    epochs = config["epochs"]

    # Build data loaders and let Ray Train shard/move them appropriately.
    loader_train = train.torch.prepare_data_loader(
        DataLoader(training_data, batch_size=config["batch_size"]))
    loader_test = train.torch.prepare_data_loader(
        DataLoader(test_data, batch_size=config["batch_size"]))

    # Wrap the model for distributed training.
    net = train.torch.prepare_model(NeuralNetwork())
    criterion = nn.CrossEntropyLoss()
    opt = torch.optim.SGD(net.parameters(), lr=config["lr"])

    loss_results = []
    for _ in range(epochs):
        train_epoch(loader_train, net, criterion, opt)
        loss = validate_epoch(loader_test, net, criterion)
        train.report(loss=loss)
        loss_results.append(loss)
    return loss_results
def train_func(config: Dict):
    """Manual-DDP training loop: place the model on the worker's device,
    wrap it in DistributedDataParallel, and report validation loss per
    epoch.

    Returns:
        List of per-epoch validation losses.
    """
    epochs = config["epochs"]
    use_cuda = torch.cuda.is_available()
    device = torch.device(
        f"cuda:{train.local_rank()}" if use_cuda else "cpu")

    # Data loaders with explicit per-worker sharding.
    loader_train = DataLoader(
        training_data,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(training_data))
    loader_test = DataLoader(
        test_data,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(test_data))

    # Model on device, wrapped for distributed gradient sync.
    net = NeuralNetwork().to(device)
    net = DistributedDataParallel(
        net, device_ids=[device.index] if use_cuda else None)

    criterion = nn.CrossEntropyLoss()
    opt = torch.optim.SGD(net.parameters(), lr=config["lr"])

    loss_results = []
    for _ in range(epochs):
        train_epoch(loader_train, net, criterion, opt, device)
        loss = validate_epoch(loader_test, net, criterion, device)
        train.report(loss=loss)
        loss_results.append(loss)
    return loss_results
def train_epochs_remote(config):
    """Epoch loop executed on each remote worker.

    Sets up data and model via ``training_setup``, prepares everything for
    distributed execution, and reports validation metrics after each epoch.

    Returns:
        Tuple of (model state dict, list of per-epoch result dicts).
    """
    train_dataset, val_dataset, model, loss_fn, optimizer = training_setup(
        config)
    batch_size = config.get('batch_size')
    epochs = config.get('epochs')

    loader_train = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(train_dataset))
    loader_val = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=DistributedSampler(val_dataset))

    # Prepare the data and the model for distributed training.
    loader_train = train.torch.prepare_data_loader(loader_train)
    loader_val = train.torch.prepare_data_loader(loader_val)
    model = train.torch.prepare_model(model)

    results = []
    for epoch in range(epochs):
        train_batches(loader_train, model, loss_fn, optimizer, config)
        result = validate_epoch(loader_val, model, loss_fn)
        result['epoch'] = epoch + 1
        train.report(**result)
        results.append(result)
    return model.state_dict(), results
def train_func(config):
    """Train ResNet18 on CIFAR-10 for the configured number of epochs,
    reporting validation metrics after each epoch.

    Returns:
        List of per-epoch validation result dicts.
    """
    # ``pop`` keeps "epochs" out of the config forwarded to the model.
    epochs = config.pop("epochs", 3)

    net = train.torch.prepare_model(ResNet18(config))

    opt = torch.optim.SGD(
        net.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Standard CIFAR-10 augmentation plus mean/std normalization.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # Serialize the dataset download across concurrent workers.
    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(root="~/data",
                                train=True,
                                download=True,
                                transform=transform_train)
        validation_dataset = CIFAR10(root="~/data",
                                     train=False,
                                     download=False,
                                     transform=transform_test)

    if config.get("test_mode"):
        # Shrink both splits to 64 samples for quick smoke tests.
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    loader_train = train.torch.prepare_data_loader(
        DataLoader(train_dataset, batch_size=config["batch_size"]))
    loader_val = train.torch.prepare_data_loader(
        DataLoader(validation_dataset, batch_size=config["batch_size"]))

    criterion = nn.CrossEntropyLoss()

    results = []
    for _ in range(epochs):
        train_epoch(loader_train, net, criterion, opt)
        result = validate_epoch(loader_val, net, criterion)
        train.report(**result)
        results.append(result)
    return results
def train_epochs_remote(config):
    """Per-worker training loop run on a remote Ray Train worker.

    Builds data loaders and a model via ``training_setup``, prepares them
    for distributed execution, then runs the epoch loop, reporting
    validation metrics after each epoch.

    Args:
        config: Training configuration. Reads ``batch_size`` (default 32)
            and ``epochs`` (default 3); the full dict is forwarded to
            ``training_setup``.

    Returns:
        Tuple of (model state dict, list of per-epoch result dicts).
    """
    train_dataset, val_dataset, model, loss_fn, optimizer = training_setup(
        config)
    batch_size = config.get("batch_size", 32)
    epochs = config.get("epochs", 3)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size)
    validation_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size)

    # Prepare the data and the model for distributed training. Note:
    # prepare_model handles DistributedDataParallel wrapping itself, so no
    # manual DDP wrapping is needed here (removed stale commented-out code).
    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)
    model = train.torch.prepare_model(model)

    results = []
    for epoch in range(epochs):
        train_batches(train_loader, model, loss_fn, optimizer)
        result = validate_epoch(validation_loader, model, loss_fn)
        result['epoch'] = epoch + 1
        train.report(**result)
        results.append(result)
    return model.state_dict(), results
def train_func(config):
    """Fit a small linear model on synthetic LinearDataset data, reporting
    validation metrics after every epoch.

    Returns:
        List of per-epoch validation result dicts.
    """
    epochs = config.get("epochs", 3)
    lr = config.get("lr", 1e-2)
    hidden_size = config.get("hidden_size", 1)
    batch_size = config.get("batch_size", 32)

    # LinearDataset(2, 5) — presumably slope 2, intercept 5 (confirm in
    # the dataset's definition).
    dataset_train = LinearDataset(2, 5, size=config.get("data_size", 1000))
    dataset_val = LinearDataset(2, 5, size=config.get("val_size", 400))

    loader_train = train.torch.prepare_data_loader(
        torch.utils.data.DataLoader(dataset_train, batch_size=batch_size))
    loader_val = train.torch.prepare_data_loader(
        torch.utils.data.DataLoader(dataset_val, batch_size=batch_size))

    model = train.torch.prepare_model(nn.Linear(1, hidden_size))
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train_epoch(loader_train, model, loss_fn, optimizer)
        result = validate_epoch(loader_val, model, loss_fn)
        train.report(**result)
        results.append(result)
    return results
def train_func(config):
    """Resume from a checkpoint if one exists, then iterate up to
    ``config["max_iter"]``, checkpointing and reporting each iteration."""
    ckpt = train.load_checkpoint()
    start_iter = 0 if ckpt is None else ckpt["iter"] + 1
    for step in range(start_iter, config["max_iter"]):
        train.save_checkpoint(iter=step)
        train.report(test=step, training_iteration=step)
def train_func():
    """Resume at the checkpointed epoch (or 0) and run through epoch 1,
    reporting a constant loss and checkpointing after each report."""
    checkpoint = train.load_checkpoint()
    start_epoch = checkpoint["epoch"] if checkpoint else 0
    print("Epoch: ", start_epoch)
    for i in range(start_epoch, 2):
        train.report(loss=1, iter=i)
        train.save_checkpoint(epoch=i + 1)
def train_func(config):
    """Train a linear model from Ray dataset-pipeline shards with manual
    device placement and DDP wrapping; reports validation metrics per
    epoch.

    Returns:
        List of per-epoch validation result dicts.
    """
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    pipeline_train = train.get_dataset_shard("train")
    pipeline_val = train.get_dataset_shard("validation")

    use_cuda = torch.cuda.is_available()
    device = torch.device(
        f"cuda:{train.local_rank()}" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.set_device(device)

    model = nn.Linear(1, hidden_size).to(device)
    model = DistributedDataParallel(
        model, device_ids=[train.local_rank()] if use_cuda else None)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    iter_train = pipeline_train.iter_datasets()
    iter_val = pipeline_val.iter_datasets()
    for _ in range(epochs):
        # Consume one pipeline window per training epoch.
        ds_train = next(iter_train)
        ds_val = next(iter_val)
        torch_train = ds_train.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        torch_val = ds_val.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=[torch.float],
            batch_size=batch_size,
        )
        train_epoch(torch_train, model, loss_fn, optimizer, device)
        result = validate_epoch(torch_val, model, loss_fn, device)
        train.report(**result)
        results.append(result)
    return results
def train_loop_per_worker():
    """Data-ingest benchmark loop: repeatedly iterate this worker's dataset
    shard for ``runtime_seconds``, reporting per-batch stats and finally
    printing throughput statistics.

    Reads ``runtime_seconds``, ``prefetch_blocks`` and ``batch_size`` from
    the enclosing scope (presumably set by the driver script — confirm).
    """
    import pandas as pd
    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []  # Per-batch wait times for the percentile summary.
    print("Starting train loop on worker", rank)
    # Keep re-reading the shard until the time budget is exhausted; each
    # full pass over the shard counts as one "epoch".
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            # Time spent waiting for this batch to arrive.
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            # Measure batch size in bytes; the method depends on the
            # batch's runtime type.
            if isinstance(batch, pd.DataFrame):
                num_bytes += int(
                    batch.memory_usage(index=True, deep=True).sum()
                )
            elif isinstance(batch, np.ndarray):
                num_bytes += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if list of non-primitive types.
                num_bytes += sys.getsizeof(batch)
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    # Final summary: total time, batch-delay percentiles, and throughput.
    delta = time.perf_counter() - start
    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print(
        "Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s"
    )
    if rank == 0:
        # Only one worker prints the shard's ingest statistics.
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def train_fn():
    """Checkpoint and report a DDP-wrapped model, exercising serialization
    of the wrapped module through save_checkpoint/report."""
    net = train.torch.prepare_model(torch.nn.Linear(1, 1))
    # Save, then report, the wrapped model as-is.
    train.save_checkpoint(model=net)
    train.report(model=net)
def train_func():
    """Fail deliberately at iteration 2 on a fresh run; after a restore
    from checkpoint, finish all four iterations without raising."""
    ckpt = train.load_checkpoint()
    restored = bool(ckpt)  # Did we come back from a previous checkpoint?
    start = ckpt["iter"] + 1 if ckpt else 0
    for i in range(start, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)
def train_func(config):
    """Train a linear model from Ray dataset-pipeline shards using
    prepare_model, consuming one pipeline epoch per training epoch and
    reporting validation metrics each time.

    Returns:
        List of per-epoch validation result dicts.
    """
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    pipeline_train = train.get_dataset_shard("train")
    pipeline_val = train.get_dataset_shard("validation")

    model = train.torch.prepare_model(nn.Linear(1, hidden_size))
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    epoch_iter_train = pipeline_train.iter_epochs()
    epoch_iter_val = pipeline_val.iter_epochs()
    for _ in range(epochs):
        ds_train = next(epoch_iter_train)
        ds_val = next(epoch_iter_val)
        torch_ds_train = ds_train.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        torch_ds_val = ds_val.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        device = train.torch.get_device()
        train_epoch(torch_ds_train, model, loss_fn, optimizer, device)
        result = validate_epoch(torch_ds_val, model, loss_fn, device)
        train.report(**result)
        results.append(result)
    return results
def training_loop(config):
    """Train ResNet18 (stem adapted to 1-channel MNIST input) for two
    epochs, checkpointing and reporting validation loss after each."""
    net = ResNet18(config)
    # Replace the stem conv so the network accepts 1-channel MNIST images.
    net.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=1, padding=3,
                          bias=False)
    net = train.torch.prepare_model(net)

    opt = torch.optim.SGD(
        net.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    dataset_train = load_mnist_data(True, True)
    dataset_val = load_mnist_data(False, False)
    if config["test_mode"]:
        # Shrink both splits to 64 samples for quick smoke tests.
        dataset_train = Subset(dataset_train, list(range(64)))
        dataset_val = Subset(dataset_val, list(range(64)))

    loader_train = train.torch.prepare_data_loader(
        DataLoader(dataset_train, batch_size=config["batch_size"],
                   num_workers=2))
    loader_val = train.torch.prepare_data_loader(
        DataLoader(dataset_val, batch_size=config["batch_size"],
                   num_workers=2))

    criterion = nn.CrossEntropyLoss()

    for _ in range(2):
        train_epoch(loader_train, net, criterion, opt)
        validation_loss = validate_epoch(loader_val, net, criterion)
        # Unwrap the DDP module before saving weights.
        train.save_checkpoint(model_state_dict=net.module.state_dict())
        train.report(**validation_loss)
def train_loop_per_worker():
    """Data-ingest benchmark loop (pandas-batch variant): repeatedly iterate
    this worker's dataset shard for ``runtime_seconds``, reporting per-batch
    stats and printing a throughput summary at the end.

    Reads ``runtime_seconds``, ``prefetch_blocks`` and ``batch_size`` from
    the enclosing scope (presumably set by the driver script — confirm).
    """
    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []  # Per-batch wait times for the percentile summary.
    print("Starting train loop on worker", rank)
    # Re-read the shard until the time budget runs out; each full pass is
    # counted as one "epoch".
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
                prefetch_blocks=prefetch_blocks, batch_size=batch_size):
            # Time spent waiting for this batch to arrive.
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            # Assumes batches are pandas DataFrames (memory_usage) —
            # TODO confirm against the dataset's batch format.
            num_bytes += int(
                batch.memory_usage(index=True, deep=True).sum())
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    # Final summary: total time, delay percentiles, and mean throughput.
    delta = time.perf_counter() - start
    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print("Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2),
          "MiB/s")
    if rank == 0:
        # Only one worker prints the shard's ingest statistics.
        print("Ingest stats from rank=0:\n\n{}".format(
            data_shard.stats()))
def train_func():
    """Drive the torch profiler for ``num_epochs`` steps, recording a dummy
    function and reporting the collected traces each epoch."""
    from ray.train.torch import TorchWorkerProfiler
    from torch.profiler import profile, record_function, schedule

    worker_profiler = TorchWorkerProfiler()
    with profile(
        activities=[],
        schedule=schedule(wait=0, warmup=0, active=1),
        on_trace_ready=worker_profiler.trace_handler,
    ) as prof:
        for epoch in range(num_epochs):
            with record_function("test_function"):
                pass
            prof.step()
            traces = worker_profiler.get_and_clear_profile_traces()
            train.report(epoch=epoch, **traces)
def train_loop_per_worker(config):
    """Horovod training loop: fit a tiny Net on random points for five
    steps, reporting the loss at every step."""
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = hvd.DistributedOptimizer(
        torch.optim.SGD(net.parameters(), lr=config["lr"]))

    num_steps = 5
    print(hvd.size())

    # Distinct data seed per rank; identical weight init across ranks.
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)

    # To ensure consistent initialization across workers,
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    criterion = torch.nn.MSELoss()
    for step in range(1, num_steps + 1):
        # One random point in [-x_max, x_max).
        sample = np.random.rand(1) * 2 * x_max - x_max
        features = torch.Tensor(sample).to(device)
        labels = sq(features) if mode == "square" else qu(features)

        optimizer.zero_grad()
        loss = criterion(net(features), labels)
        loss.backward()
        optimizer.step()

        time.sleep(0.1)
        train.report(loss=loss.item())

    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
def train_func():
    """Profile five epochs of linear-regression training, checkpointing and
    reporting profiler traces for each epoch."""
    twp = TorchWorkerProfiler()
    with profile(
        activities=[],
        schedule=schedule(wait=0, warmup=0, active=1),
        on_trace_ready=twp.trace_handler,
    ) as prof:
        # Model setup.
        net = train.torch.prepare_model(torch.nn.Linear(1, 1))
        criterion = torch.nn.MSELoss()
        opt = torch.optim.SGD(net.parameters(), lr=1e-2)

        # Synthetic y = 2x regression data.
        features = torch.randn(1000, 1)
        targets = features * 2
        loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(features, targets), batch_size=32)
        loader = train.torch.prepare_data_loader(loader)

        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in loader:
                    loss = criterion(net(X), y)
                    opt.zero_grad()
                    loss.backward()
                    opt.step()
            with record_function("train_checkpoint"):
                # Strip the DDP "module." prefix so the checkpoint loads
                # into an unwrapped model.
                state_dict = net.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)
            prof.step()
            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
def train_func():
    """Emit three reports with increasing reward; the last one carries
    extra list/dict payloads."""
    for reward in (4, 5):
        train.report(episode_reward_mean=reward)
    train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
    return 1
def fail_train_2():
    """Report twice, then fail with NotImplementedError (failure-path
    test helper)."""
    reports = 0
    while reports < 2:
        train.report(loss=1)
        reports += 1
    raise NotImplementedError
def on_epoch_end(self, epoch, logs=None):
    """Callback hook: forward this epoch's metrics dict to Ray Train."""
    metrics = logs
    train.report(**metrics)
def train_func():
    """Load the checkpoint, echo it through report and save_checkpoint,
    and return the value stored under the module-level ``key``."""
    state = train.load_checkpoint()
    train.report(**state)
    train.save_checkpoint(**state)
    return state[key]
def train_mismatch():
    """Save one checkpoint, then report twice — the second report has no
    matching checkpoint on purpose."""
    train.save_checkpoint(epoch=0)
    for idx in range(2):
        train.report(index=idx)
def train_func():
    """Report this worker's world rank once."""
    worker_rank = train.world_rank()
    train.report(rank=worker_rank)
def train_func(config):
    """Emit three reports with increasing reward values, then return 1."""
    for reward in (4, 5, 6):
        train.report(episode_reward_mean=reward)
    return 1
def train_func():
    """Report ``num_iters`` sequential indices, then return 1."""
    idx = 0
    while idx < num_iters:
        train.report(index=idx)
        idx += 1
    return 1
def train_func():
    """Report a constant loss twice, then return 1."""
    remaining = 2
    while remaining:
        train.report(loss=1)
        remaining -= 1
    return 1
def train_func():
    """Report three epochs (0, 1, 2) in order."""
    for epoch in (0, 1, 2):
        train.report(epoch=epoch)
def train_actor_failure():
    """Report twice, then kill the worker process with exit code 0 to
    simulate an actor dying mid-training."""
    import sys
    for _ in range(2):
        train.report(loss=1)
    sys.exit(0)
def train_func():
    """Report a constant loss for two iterations, tagging each report
    with its iteration index."""
    for step in (0, 1):
        train.report(loss=1, iter=step)