def get_dataset():
    # Train dataset should be sharded.
    train_dataset = session.get_dataset_shard("train")
    assert train_dataset.count() == num_train_data / scale_config.num_workers

    # All other datasets should not be sharded.
    val_dataset = session.get_dataset_shard("val")
    assert val_dataset.count() == num_val_data
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    if expect_ds:
        assert isinstance(data_shard, Dataset), data_shard
    else:
        assert isinstance(data_shard, DatasetPipeline), data_shard
    for k, v in expect_sizes.items():
        shard = session.get_dataset_shard(k)
        if v == -1:
            assert shard is None, shard
        else:
            if isinstance(shard, DatasetPipeline):
                assert next(shard.iter_epochs()).count() == v, shard
            else:
                assert shard.count() == v, shard
def train_func(config: dict):
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_model()
        multi_worker_model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3)),
            loss=tf.keras.losses.mean_squared_error,
            metrics=[tf.keras.metrics.mean_squared_error],
        )

    dataset = session.get_dataset_shard("train")

    for _ in range(config.get("epoch", 3)):
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=32,
            )
        )
        multi_worker_model.fit(tf_dataset, callbacks=[Callback()])
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    assert isinstance(data_shard, DatasetPipeline), data_shard
    results = []
    for epoch in data_shard.iter_epochs(2):
        results.append(epoch.take())
    check_results_fn(data_shard, results)
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = session.get_dataset_shard("train")
    validation_dataset = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if session.get_world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn, device)
        else:
            result = {}
        results.append(result)
        session.report(result, checkpoint=Checkpoint.from_dict(dict(model=model)))

    return results
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)
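# A minimal sketch of how a bulk-ingest loop like the one above could be wired
# into a trainer. The dataset (ray.data.range), worker count, and the choice of
# TorchTrainer are illustrative assumptions, not part of the original example.
import ray
from ray.air.config import ScalingConfig
from ray.train.torch import TorchTrainer

train_ds = ray.data.range(100)  # Hypothetical placeholder dataset.
trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds},
)
trainer.fit()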
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_pipeline_shard = session.get_dataset_shard("train")
    validation_dataset_pipeline_shard = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs()
    validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs()

    for _ in range(epochs):
        train_dataset = next(train_dataset_iterator)
        validation_dataset = next(validation_dataset_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        result = validate_epoch(validation_torch_dataset, model, loss_fn, device)

        session.report(result)
def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = session.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
    for epoch in data_shard.iter_epochs(10):
        for batch in epoch.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
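# For reference, streaming ingest as in the loop above is enabled through the
# trainer's dataset configuration. This is a minimal sketch assuming the Ray AIR
# `DatasetConfig(use_stream_api=True)` option; the dataset, worker count, and
# trainer class are illustrative assumptions.
import ray
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.torch import TorchTrainer

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    dataset_config={"train": DatasetConfig(use_stream_api=True)},
    datasets={"train": ray.data.range(100)},  # Hypothetical placeholder dataset.
)
trainer.fit()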
def train_loop_per_worker(config):
    raw_model = resnet18(pretrained=True)
    model = train.torch.prepare_model(raw_model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = session.get_dataset_shard("train")

    for epoch in range(config["num_epochs"]):
        running_loss = 0.0
        for i, data in enumerate(
            train_dataset_shard.iter_batches(
                batch_size=config["batch_size"], batch_format="numpy"
            )
        ):
            # Get the inputs; with batch_format="numpy", `data` is a dict of
            # NumPy arrays keyed by column name.
            inputs = torch.as_tensor(data["image"], dtype=torch.float32).to(device="cuda")
            labels = torch.as_tensor(data["label"], dtype=torch.int64).to(device="cuda")

            # Zero the parameter gradients.
            optimizer.zero_grad()

            # Forward + backward + optimize.
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print statistics.
            running_loss += loss.item()
            if i % 2000 == 1999:  # Print every 2000 mini-batches.
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                running_loss = 0.0

        session.report(
            dict(running_loss=running_loss),
            checkpoint=TorchCheckpoint.from_model(model),
        )
def train_func(config: dict):
    per_worker_batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    dataset_shard = session.get_dataset_shard("train")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_autoencoder_model()
        learning_rate = config.get("lr", 0.001)
        multi_worker_model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            metrics=["binary_crossentropy"],
        )

    results = []
    for epoch in range(epochs):
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["image"],
                label_column="label",
                output_signature=(
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, 784), dtype=tf.float32),
                ),
                batch_size=per_worker_batch_size,
            )
        )
        history = multi_worker_model.fit(
            tf_dataset, callbacks=[TrainCheckpointReportCallback()]
        )
        results.append(history.history)
    return results
def train_func(config):
    batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_model(config)

    dataset_pipeline = session.get_dataset_shard("train")
    dataset_iterator = dataset_pipeline.iter_epochs()

    for _ in range(epochs):
        dataset = next(dataset_iterator)
        tf_dataset = prepare_dataset_shard(
            dataset.to_tf(
                label_column="y",
                output_signature=(
                    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                    tf.TensorSpec(shape=(None), dtype=tf.float32),
                ),
                batch_size=batch_size,
            )
        )
        multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()])
def train_loop_per_worker():
    data_shard = session.get_dataset_shard("train")
    assert isinstance(data_shard, Dataset), data_shard
    results = data_shard.take()
    check_results_fn(data_shard, results)
def train_loop_per_worker():
    import pandas as pd

    rank = session.get_world_rank()
    data_shard = session.get_dataset_shard("train")
    start = time.perf_counter()
    epochs_read, batches_read, bytes_read = 0, 0, 0
    batch_delays = []

    def generate_epochs(data: Union[Dataset, DatasetPipeline], epochs: int):
        if isinstance(data, DatasetPipeline):
            for epoch in data.iter_epochs(epochs):
                yield epoch
        else:
            # Dataset: yield the same dataset once per epoch.
            for _ in range(epochs):
                yield data

    print("Starting train loop on worker", rank)
    for epoch_data in generate_epochs(data_shard, num_epochs):
        epochs_read += 1
        batch_start = time.perf_counter()
        for batch in epoch_data.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            batches_read += 1
            if isinstance(batch, pd.DataFrame):
                bytes_read += int(batch.memory_usage(index=True, deep=True).sum())
            elif isinstance(batch, np.ndarray):
                bytes_read += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if given a list of non-primitive types.
                bytes_read += sys.getsizeof(batch)
            session.report(
                dict(
                    bytes_read=bytes_read,
                    batches_read=batches_read,
                    epochs_read=epochs_read,
                    batch_delay=batch_delay,
                )
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", epochs_read)
    print("Num batches read", batches_read)
    print("Num bytes read", round(bytes_read / (1024 * 1024), 2), "MiB")
    print("Mean throughput", round(bytes_read / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP.
    os.environ["RANK"] = str(session.get_world_rank())
    os.environ["WORLD_SIZE"] = str(session.get_world_size())
    os.environ["LOCAL_RANK"] = str(session.get_local_rank())

    train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config
    )

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local environment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`."
        )

    if (
        trainer.args.evaluation_strategy == "steps"
        or trainer.args.save_strategy == "steps"
        or trainer.args.logging_strategy == "steps"
    ):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported."
        )

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added.
    # Aside from duplicating the functionality of our own callbacks,
    # the Wandb callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to
    )
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name
            )
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )

    trainer.train(resume_from_checkpoint=checkpoint_path)

    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Set up the device.
    device = torch.device(
        f"cuda:{session.get_local_rank()}"
        if use_gpu and torch.cuda.is_available()
        else "cpu"
    )
    print(f"Device: {device}")

    # Set up the data.
    train_dataset_pipeline = session.get_dataset_shard("train")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = session.get_dataset_shard("test")
    test_torch_dataset = test_dataset.to_torch(
        label_column="label", batch_size=batch_size, drop_last=True
    )

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(
            label_column="label", batch_size=batch_size
        )

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer
        )
        train_acc = train_num_correct / train_num_total
        print(
            f"epoch [{epoch + 1}]: training accuracy: "
            f"{train_num_correct} / {train_num_total} = {train_acc:.4f}"
        )

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion
        )
        test_acc = test_num_correct / test_num_total
        print(
            f"epoch [{epoch + 1}]: testing accuracy: "
            f"{test_num_correct} / {test_num_total} = {test_acc:.4f}"
        )

        # Checkpoint the model.
        module = net.module if isinstance(net, DistributedDataParallel) else net
        checkpoint = Checkpoint.from_dict(dict(model=module.state_dict()))

        # Record and log stats.
        print(f"session report on {session.get_world_rank()}")
        session.report(
            dict(
                train_acc=train_acc,
                train_loss=train_running_loss,
                test_acc=test_acc,
                test_loss=test_running_loss,
            ),
            checkpoint=checkpoint,
        )