def get_dataset():
    # Train dataset should be sharded.
    train_dataset = train.get_dataset_shard("train")
    assert train_dataset.count() == num_train_data / scale_config["num_workers"]

    # All other datasets should not be sharded.
    val_dataset = train.get_dataset_shard("val")
    assert val_dataset.count() == num_val_data
def train_func(config): batch_size = config.get("batch_size", 32) hidden_size = config.get("hidden_size", 1) lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) train_dataset_pipeline_shard = train.get_dataset_shard("train") validation_dataset_pipeline_shard = train.get_dataset_shard("validation") device = torch.device( f"cuda:{train.local_rank()}" if torch.cuda.is_available() else "cpu") if torch.cuda.is_available(): torch.cuda.set_device(device) model = nn.Linear(1, hidden_size) model = model.to(device) model = DistributedDataParallel( model, device_ids=[train.local_rank()] if torch.cuda.is_available() else None) loss_fn = nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=lr) results = [] train_dataset_iterator = train_dataset_pipeline_shard.iter_datasets() validation_dataset_iterator = \ validation_dataset_pipeline_shard.iter_datasets() for _ in range(epochs): train_dataset = next(train_dataset_iterator) validation_dataset = next(validation_dataset_iterator) train_torch_dataset = train_dataset.to_torch( label_column="y", feature_columns=["x"], label_column_dtype=torch.float, feature_column_dtypes=[torch.float], batch_size=batch_size, ) validation_torch_dataset = validation_dataset.to_torch( label_column="y", feature_columns=["x"], label_column_dtype=torch.float, feature_column_dtypes=[torch.float], batch_size=batch_size) train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) train.report(**result) results.append(result) return results
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        rt.get_dataset_shard("train"),
        features,
        training_set_metadata,
    )

    try:
        val_shard = rt.get_dataset_shard("val")
    except KeyError:
        val_shard = None

    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    try:
        test_shard = rt.get_dataset_shard("test")
    except KeyError:
        test_shard = None

    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    trainer = RemoteTrainer(model=model, **executable_kwargs)
    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

    # TODO(shreya): Figure out GPU memory leak
    # TODO(shreya): Check if placing model off GPU explicitly makes a difference
    # Clear CUDA memory, place model on CPU, return model to user
    # torch.cuda.empty_cache()
    # model.cpu()

    return results, trainer.validation_field, trainer.validation_metric
def train_func(config): batch_size = config.get("batch_size", 32) hidden_size = config.get("hidden_size", 1) lr = config.get("lr", 1e-2) epochs = config.get("epochs", 3) train_dataset_pipeline_shard = train.get_dataset_shard("train") validation_dataset_pipeline_shard = train.get_dataset_shard("validation") model = nn.Linear(1, hidden_size) model = train.torch.prepare_model(model) loss_fn = nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=lr) results = [] train_dataset_iterator = train_dataset_pipeline_shard.iter_epochs() validation_dataset_iterator = validation_dataset_pipeline_shard.iter_epochs( ) for _ in range(epochs): train_dataset = next(train_dataset_iterator) validation_dataset = next(validation_dataset_iterator) train_torch_dataset = train_dataset.to_torch( label_column="y", feature_columns=["x"], label_column_dtype=torch.float, feature_column_dtypes=torch.float, batch_size=batch_size, ) validation_torch_dataset = validation_dataset.to_torch( label_column="y", feature_columns=["x"], label_column_dtype=torch.float, feature_column_dtypes=torch.float, batch_size=batch_size, ) device = train.torch.get_device() train_epoch(train_torch_dataset, model, loss_fn, optimizer, device) result = validate_epoch(validation_torch_dataset, model, loss_fn, device) train.report(**result) results.append(result) return results
def train_loop_per_worker(): data_shard = train.get_dataset_shard("train") if expect_ds: assert isinstance(data_shard, Dataset), data_shard else: assert isinstance(data_shard, DatasetPipeline), data_shard for k, v in expect_sizes.items(): shard = train.get_dataset_shard(k) if v == -1: assert shard is None, shard else: if isinstance(shard, DatasetPipeline): assert next(shard.iter_epochs()).count() == v, shard else: assert shard.count() == v, shard
def train_loop_per_worker(config): batch_size = config["batch_size"] lr = config["lr"] epochs = config["num_epochs"] num_features = config["num_features"] # Get the Ray Dataset shard for this data parallel worker, # and convert it to a Tensorflow Dataset. train_data = train.get_dataset_shard("train") strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = create_keras_model(num_features) multi_worker_model.compile( optimizer=tf.keras.optimizers.SGD(learning_rate=lr), loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=[ tf.keras.metrics.BinaryCrossentropy( name="loss", ) ], ) results = [] for _ in range(epochs): tf_dataset = to_tf_dataset(dataset=train_data, batch_size=batch_size) history = multi_worker_model.fit( tf_dataset, callbacks=[KerasCallback()], verbose=0, ) return results
def eval_fn(
    predictor_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        eval_shard = RayDatasetShard(
            rt.get_dataset_shard("eval"),
            features,
            training_set_metadata,
        )

        model = ray.get(model_ref)
        device = get_torch_device()
        model = model.to(device)

        predictor = RemotePredictor(
            model=model, horovod=hvd, report_tqdm_to_ray=True, **predictor_kwargs
        )
        return predictor.batch_evaluation(eval_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
def train_func(config): batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_and_compile_model(config) dataset_pipeline = train.get_dataset_shard() dataset_iterator = dataset_pipeline.iter_epochs() results = [] for _ in range(epochs): dataset = next(dataset_iterator) tf_dataset = prepare_dataset_shard( dataset.to_tf( label_column="y", output_signature=( tf.TensorSpec(shape=(None, 1), dtype=tf.float32), tf.TensorSpec(shape=(None), dtype=tf.float32), ), batch_size=batch_size, )) history = multi_worker_model.fit(tf_dataset, callbacks=[TrainReportCallback()]) results.append(history.history) return results
def train_func(config: dict): batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_model() multi_worker_model.compile( optimizer=tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3)), loss=tf.keras.losses.mean_squared_error, metrics=[tf.keras.metrics.mean_squared_error], ) dataset = train.get_dataset_shard("train") results = [] for _ in range(epochs): tf_dataset = prepare_dataset_shard( dataset.to_tf( label_column="y", output_signature=( tf.TensorSpec(shape=(None, 1), dtype=tf.float32), tf.TensorSpec(shape=(None), dtype=tf.float32), ), batch_size=batch_size, ) ) history = multi_worker_model.fit( tf_dataset, callbacks=[TrainCheckpointReportCallback()] ) results.append(history.history) return results
def train_loop_per_worker(): data_shard = train.get_dataset_shard("train") assert isinstance(data_shard, DatasetPipeline), data_shard results = [] for epoch in data_shard.iter_epochs(2): results.append(epoch.take()) check_results_fn(data_shard, results)
def get_dataset():
    data_train_all_epochs = []
    data_val_all_epochs = []
    for _ in range(2):
        data_this_epoch_train = []
        train_dataset = train.get_dataset_shard("train")
        for batch in train_dataset.iter_batches():
            data_this_epoch_train.extend(batch)
        data_train_all_epochs.append(data_this_epoch_train)

        data_this_epoch_val = []
        val_dataset = train.get_dataset_shard("val")
        for batch in val_dataset.iter_batches():
            data_this_epoch_val.extend(batch)
        data_val_all_epochs.append(data_this_epoch_val)

    return data_train_all_epochs, data_val_all_epochs
def get_dataset():
    data_all_epochs = []
    for _ in range(2):
        data_this_epoch = []
        dataset = train.get_dataset_shard()
        for batch in dataset.iter_batches():
            data_this_epoch.extend(batch)
        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
def train_loop_per_worker(): data_shard: Dataset = train.get_dataset_shard("train") # Iterate over 10 epochs of data. for epoch in range(10): for batch in data_shard.iter_batches( batch_size=10_000, local_shuffle_buffer_size=100_000, ): print("Do some training on batch", batch)
def train_loop_per_worker(): data_shard: Dataset = train.get_dataset_shard("train") # Iterate over 10 epochs of data. for epoch in range(10): for batch in data_shard.iter_batches(): print("Do some training on batch", batch) # View the stats for performance debugging. print(data_shard.stats())
def get_dataset():
    pipeline_iterator = train.get_dataset_shard().iter_datasets()
    data_all_epochs = []
    for _ in range(num_epochs):
        dataset_this_epoch = next(pipeline_iterator)
        data_this_epoch = []
        for batch in dataset_this_epoch.iter_batches():
            data_this_epoch.extend(batch)
        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = train.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
    for epoch in data_shard.iter_epochs(10):
        for batch in epoch.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
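# The comment above refers to `use_stream_api`. A rough sketch of how that flag was
# wired up in the AIR-era API, assuming ray.air.config.DatasetConfig and a per-dataset
# `dataset_config` mapping on the trainer (the exact field names, import paths, and
# trainer wiring are assumptions tied to that Ray generation and changed in later
# releases); `train_ds` is a Ray Dataset defined elsewhere:
from ray.air.config import DatasetConfig, ScalingConfig
from ray.train.torch import TorchTrainer

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    # With use_stream_api=True, get_dataset_shard("train") returns a DatasetPipeline
    # instead of a Dataset, matching the streaming loop above.
    dataset_config={"train": DatasetConfig(use_stream_api=True)},
    datasets={"train": train_ds},
)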
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard: Dataset = train.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())
def train_loop_per_worker():
    import pandas as pd

    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            if isinstance(batch, pd.DataFrame):
                num_bytes += int(batch.memory_usage(index=True, deep=True).sum())
            elif isinstance(batch, np.ndarray):
                num_bytes += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if given a list of non-primitive types.
                num_bytes += sys.getsizeof(batch)
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print("Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def get_dataset():
    pipeline_iterator = train.get_dataset_shard().iter_datasets()
    data_all_epochs = []
    for _ in range(2):
        dataset_this_epoch = next(pipeline_iterator)
        data_this_epoch = []
        for batch in dataset_this_epoch.iter_batches():
            data_this_epoch.extend(batch)
        if len(data_all_epochs) > 0:
            # Make sure data is shuffled per epoch.
            assert data_this_epoch != data_all_epochs[-1]
        data_all_epochs.append(data_this_epoch)
    return data_all_epochs
def train_loop_per_worker():
    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            num_bytes += int(batch.memory_usage(index=True, deep=True).sum())
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()
    delta = time.perf_counter() - start

    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print("Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def train_func(config: dict): per_worker_batch_size = config.get("batch_size", 64) epochs = config.get("epochs", 3) dataset_shard = train.get_dataset_shard("train") strategy = tf.distribute.MultiWorkerMirroredStrategy() with strategy.scope(): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_autoencoder_model() learning_rate = config.get("lr", 0.001) multi_worker_model.compile( loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=[ "binary_crossentropy", ], ) results = [] for epoch in range(epochs): tf_dataset = prepare_dataset_shard( dataset_shard.to_tf( feature_columns=["image"], label_column="label", output_signature=( tf.TensorSpec(shape=(None, 784), dtype=tf.float32), tf.TensorSpec(shape=(None, 784), dtype=tf.float32), ), batch_size=per_worker_batch_size, )) history = multi_worker_model.fit( tf_dataset, callbacks=[TrainCheckpointReportCallback()]) results.append(history.history) return results
def train_loop_per_worker(config): batch_size = config["batch_size"] lr = config["lr"] epochs = config["num_epochs"] num_features = config["num_features"] # Get the Ray Dataset shard for this data parallel worker, # and convert it to a PyTorch Dataset. train_data = train.get_dataset_shard("train") def to_tensor_iterator(dataset, batch_size): data_iterator = dataset.iter_batches(batch_format="numpy", batch_size=batch_size) for d in data_iterator: # "concat_out" is the output column of the Concatenator. yield ( torch.Tensor(d["concat_out"]).float(), torch.Tensor(d["target"]).float(), ) # Create model. model = create_model(num_features) model = train.torch.prepare_model(model) loss_fn = nn.BCELoss() optimizer = torch.optim.SGD(model.parameters(), lr=lr) for cur_epoch in range(epochs): for inputs, labels in to_tensor_iterator(train_data, batch_size): optimizer.zero_grad() predictions = model(inputs) train_loss = loss_fn(predictions, labels.unsqueeze(1)) train_loss.backward() optimizer.step() loss = train_loss.item() session.report({"loss": loss}, checkpoint=TorchCheckpoint.from_model(model))
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        train_shard = RayDatasetShard(
            rt.get_dataset_shard("train"),
            features,
            training_set_metadata,
        )

        try:
            val_shard = rt.get_dataset_shard("val")
        except KeyError:
            val_shard = None

        if val_shard is not None:
            val_shard = RayDatasetShard(
                val_shard,
                features,
                training_set_metadata,
            )

        try:
            test_shard = rt.get_dataset_shard("test")
        except KeyError:
            test_shard = None

        if test_shard is not None:
            test_shard = RayDatasetShard(
                test_shard,
                features,
                training_set_metadata,
            )

        model = ray.get(model_ref)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

        if results is not None:
            # Only return the model state dict back to the head node.
            trained_model, *args = results
            results = (trained_model.cpu().state_dict(), *args)

        torch.cuda.empty_cache()

        train_results = results, trainer.validation_field, trainer.validation_metric
    finally:
        hvd.shutdown()
    return train_results
def train_func(config): use_gpu = config["use_gpu"] num_epochs = config["num_epochs"] batch_size = config["batch_size"] num_layers = config["num_layers"] num_hidden = config["num_hidden"] dropout_every = config["dropout_every"] dropout_prob = config["dropout_prob"] num_features = config["num_features"] print("Defining model, loss, and optimizer...") # Setup device. device = torch.device(f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu") print(f"Device: {device}") # Setup data. train_dataset_pipeline = train.get_dataset_shard("train_dataset") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = train.get_dataset_shard("test_dataset") test_torch_dataset = test_dataset.to_torch(label_column="label", batch_size=batch_size) net = Net( n_layers=num_layers, n_features=num_features, num_hidden=num_hidden, dropout_every=dropout_every, drop_prob=dropout_prob, ).to(device) print(net.parameters) net = train.torch.prepare_model(net) criterion = nn.BCEWithLogitsLoss() optimizer = optim.Adam(net.parameters(), weight_decay=0.0001) print("Starting training...") for epoch in range(num_epochs): train_dataset = next(train_dataset_epoch_iterator) train_torch_dataset = train_dataset.to_torch(label_column="label", batch_size=batch_size) train_running_loss, train_num_correct, train_num_total = train_epoch( train_torch_dataset, net, device, criterion, optimizer, num_features) train_acc = train_num_correct / train_num_total print(f"epoch [{epoch + 1}]: training accuracy: " f"{train_num_correct} / {train_num_total} = {train_acc:.4f}") test_running_loss, test_num_correct, test_num_total = test_epoch( test_torch_dataset, net, device, criterion) test_acc = test_num_correct / test_num_total print(f"epoch [{epoch + 1}]: testing accuracy: " f"{test_num_correct} / {test_num_total} = {test_acc:.4f}") # Record and log stats. train.report( train_acc=train_acc, train_loss=train_running_loss, test_acc=test_acc, test_loss=test_running_loss, ) # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net train.save_checkpoint(model_state_dict=module.state_dict()) if train.world_rank() == 0: return module.cpu()
def train_func(config): is_distributed = config.get("is_distributed", False) use_gpu = config["use_gpu"] num_epochs = config["num_epochs"] batch_size = config["batch_size"] num_layers = config["num_layers"] num_hidden = config["num_hidden"] dropout_every = config["dropout_every"] dropout_prob = config["dropout_prob"] num_features = config["num_features"] print("Defining model, loss, and optimizer...") # Setup device. if is_distributed: device = torch.device(f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu") else: device = torch.device( "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu") print(f"Device: {device}") # Setup data. if is_distributed: train_dataset_pipeline = train.get_dataset_shard("train_dataset") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = train.get_dataset_shard("test_dataset") else: train_dataset_epoch_iterator = config["train_dataset"].iter_epochs() test_dataset = config["test_dataset"] test_torch_dataset = test_dataset.to_torch(label_column="label", batch_size=batch_size) # Setup Tensorboard and MLflow. if is_distributed: # Setup is done through Callback. pass else: writer = SummaryWriter() mlflow.start_run() mlflow_config = config.copy() mlflow_config.pop("test_dataset") mlflow_config.pop("train_dataset") mlflow.log_params(mlflow_config) net = Net( n_layers=num_layers, n_features=num_features, num_hidden=num_hidden, dropout_every=dropout_every, drop_prob=dropout_prob, ).to(device) print(net.parameters) if is_distributed: net = DistributedDataParallel(net) criterion = nn.BCEWithLogitsLoss() optimizer = optim.Adam(net.parameters(), weight_decay=0.0001) print("Starting training...") for epoch in range(num_epochs): train_dataset = next(train_dataset_epoch_iterator) train_torch_dataset = train_dataset.to_torch(label_column="label", batch_size=batch_size) train_running_loss, train_num_correct, train_num_total = train_epoch( train_torch_dataset, net, device, criterion, optimizer) train_acc = train_num_correct / train_num_total print( f"epoch [{epoch + 1}]: training accuracy: {train_num_correct} / {train_num_total} = {train_acc:.4f}" ) test_running_loss, test_num_correct, test_num_total = test_epoch( test_torch_dataset, net, device, criterion) test_acc = test_num_correct / test_num_total print( f"epoch [{epoch + 1}]: testing accuracy: {test_num_correct} / {test_num_total} = {test_acc:.4f}" ) # Record and log stats. if is_distributed: train.report(train_acc=train_acc, train_loss=train_running_loss, test_acc=test_acc, test_loss=test_running_loss) else: writer.add_scalar("Accuracy/train", train_acc, epoch) writer.add_scalar("Loss/train", train_running_loss, epoch) writer.add_scalar("Accuracy/test", test_acc, epoch) writer.add_scalar("Loss/test", test_running_loss, epoch) writer.flush() mlflow.log_metrics({ "train_acc": train_acc, "train_loss": train_running_loss, "test_acc": test_acc, "test_loss": test_running_loss }) # Checkpoint model. if is_distributed: import copy model_copy = copy.deepcopy(net.module) train.save_checkpoint( model_state_dict=model_copy.cpu().state_dict()) else: torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch") # Shutdown Tensorboard and MLflow. if is_distributed: pass else: writer.close() # mlflow.end_run() if is_distributed: if train.world_rank() == 0: return net.module.cpu() else: return None else: return net
def train_func():
    return train.get_dataset_shard()
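# This minimal function just hands the worker's shard back to the driver. Under the
# legacy Ray Train API it would typically be driven like the sketch below (the Trainer
# signature and the `dataset` argument to run() are assumptions based on that API
# generation; newer releases use TorchTrainer with a `datasets` dict instead):
import ray
from ray.train import Trainer

dataset = ray.data.range(8)

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
# Each worker returns its shard; with no key given,
# get_dataset_shard() resolves to the single dataset passed here.
shards = trainer.run(train_func, dataset=dataset)
trainer.shutdown()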
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config
    )

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local environment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`."
        )

    if (
        trainer.args.evaluation_strategy == "steps"
        or trainer.args.save_strategy == "steps"
        or trainer.args.logging_strategy == "steps"
    ):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported."
        )

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added.
    # Aside from duplicating functionality with our callbacks,
    # the Wandb callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to
    )
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name
            )
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )

    trainer.train(resume_from_checkpoint=checkpoint_path)

    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
def train_loop_per_worker(): data_shard = train.get_dataset_shard("train") assert isinstance(data_shard, Dataset), data_shard results = data_shard.take() check_results_fn(data_shard, results)