def test_run(self):
    """Test that run just invokes the main method in the same process."""
    hr = HorovodRunner(np=-1)
    data = []

    def append(value):
        data.append(value)

    hr.run(append, value=1)
    self.assertEqual(data[0], 1)
def test_return_value(self):
    """Test that the return value is returned to the user."""
    hr = HorovodRunner(np=-1)
    return_value = hr.run(lambda: 42)
    self.assertEqual(return_value, 42)
def test_init_keyword_only(self):
    """Test that the user must use keyword args in __init__."""
    with self.assertRaises(TypeError):
        HorovodRunner(2)
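# The three test methods above assume the usual unittest scaffolding, roughly
# like the following sketch (the class and module names here are illustrative,
# not from the original source):
import unittest

from sparkdl import HorovodRunner


class HorovodRunnerLocalTest(unittest.TestCase):
    # test_run, test_return_value, and test_init_keyword_only go here.
    pass


if __name__ == "__main__":
    unittest.main()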
        # Save a checkpoint from the first worker only.
        if hvd.rank() == 0:
            save_checkpoint(model, optimizer_hvd, epoch)

    val_loss, val_acc = evaluate(model, val_dataloader_iter, validation_steps,
                                 device, metric_agg_fn=metric_average)
    return val_loss, val_acc

# COMMAND ----------

hr = HorovodRunner(np=10)  # This assumes the cluster consists of 10 workers.
hr.run(train_and_evaluate_hvd)

# COMMAND ----------

# DBTITLE 1,Review Checkpoint Files
display(dbutils.fs.ls('dbfs:/ml/horovod_pytorch/take2/PetaFlights'))

# COMMAND ----------

# DBTITLE 1,Single Worker Test Set
NUM_EPOCHS = 1
BATCH_SIZE = 100
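# COMMAND ----------

# MAGIC %md
# MAGIC The `evaluate` call above takes `metric_agg_fn=metric_average`. One common way to define such an aggregation helper with Horovod (a sketch, not necessarily this notebook's original definition) is to allreduce the scalar metric across all workers:

# COMMAND ----------

import torch
import horovod.torch as hvd

def metric_average(val, name):
    # Average a scalar metric across all Horovod workers.
    # hvd.allreduce averages by default.
    tensor = torch.tensor(val)
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()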
              batch_size=batch_size,
              validation_data=(x_train[test_start:test_end, :], y_train[test_start:test_end, :]),
              epochs=epochs,
              callbacks=callbacks)

    tf.keras.models.save_model(model, model_path, overwrite=True, include_optimizer=True)

    score = model.evaluate(x_test, y_test, batch_size=batch_size)
    tf.logging.info("Score: {}".format(score))

# COMMAND ----------

from sparkdl import HorovodRunner

hr = HorovodRunner(np=2)

now = datetime.datetime.now()
model_name = "sentiment_model_" + now.strftime("%Y%m%d%H%M") + ".h5"
model_checkpoints = model_name + "_weights.hdf5"
logdir = "/dbfs/FileStore/ml/logs/multi/" + model_name
checkpoints_path = "/dbfs/mnt/models/" + model_checkpoints
model_save_path = "/dbfs/mnt/models/" + model_name

hr.run(train, epochs=80, logdir=logdir, model_path=model_save_path,
       checkpoints_path=checkpoints_path)

# COMMAND ----------
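# MAGIC %md
# MAGIC The `train` function above passes a `callbacks` list that is not shown here. A typical Horovod-on-Keras setup (a sketch under assumed names; only `checkpoints_path` is taken from this notebook, and it assumes `hvd.init()` has already run inside `train`) broadcasts initial state from rank 0 and writes checkpoints only on the first worker:

# COMMAND ----------

import tensorflow as tf
import horovod.tensorflow.keras as hvd

callbacks = [
    # Sync initial variables from rank 0 so all workers start identically.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]
if hvd.rank() == 0:
    # Only the first worker writes checkpoints, to avoid workers clobbering each other.
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoints_path, save_weights_only=True))

# COMMAND ----------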
    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for epoch in range(1, num_epochs + 1):
        train_one_epoch(model, device, train_loader, optimizer, epoch)
        # Only save checkpoints on the first worker.
        if hvd.rank() == 0:
            save_checkpoint(model, optimizer, epoch)

# COMMAND ----------

# MAGIC %md
# MAGIC With the function `train_hvd` defined above with Horovod hooks, you can easily construct a `HorovodRunner` and run distributed training.

# COMMAND ----------

hr = HorovodRunner(np=2)  # This assumes the cluster consists of two workers.
hr.run(train_hvd, learning_rate=0.001)

# COMMAND ----------

# MAGIC %md
# MAGIC Under the hood, HorovodRunner takes a Python method that contains deep learning training code with Horovod hooks. The method is pickled on the driver and sent to Spark workers. The Horovod MPI job is embedded as a Spark job using barrier execution mode: the first executor collects the IP addresses of all task executors using `BarrierTaskContext` and triggers a Horovod job using `mpirun`. Each Python MPI process loads the pickled user program, deserializes it, and runs it.
# MAGIC
# MAGIC For more information on the HorovodRunner API, see the [documentation](https://databricks.github.io/spark-deep-learning/docs/_site/api/python/index.html#sparkdl.HorovodRunner). Note that you can use `np=-1` to spawn a subprocess on the driver node for a quicker development cycle:
# MAGIC ```
# MAGIC hr = HorovodRunner(np=-1)
# MAGIC hr.run(run_training)
# MAGIC ```
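# COMMAND ----------

# MAGIC %md
# MAGIC For reference, the Horovod hooks the prose refers to typically look like the following skeleton (a sketch with assumed names such as `build_model`, not this notebook's full definition):

# COMMAND ----------

def train_hvd(learning_rate):
    import torch
    import horovod.torch as hvd

    hvd.init()  # Initialize Horovod on each worker process.
    device = torch.device("cuda", hvd.local_rank()) if torch.cuda.is_available() \
        else torch.device("cpu")

    model = build_model().to(device)  # assumed helper
    # Scale the learning rate by the number of workers.
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate * hvd.size())
    # Wrap the optimizer so gradients are averaged across workers.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
    # Broadcast initial state from rank 0 so all workers start identically.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)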
    model.fit(x_train, y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))

# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have a training function with Horovod, we can use `HorovodRunner` to run it in a distributed fashion. To run this example on a cluster with two workers, each with a single GPU, initialize `HorovodRunner` with `np=2`:

# COMMAND ----------

from sparkdl import HorovodRunner

hr = HorovodRunner(np=2)
hr.run(train_hvd, learning_rate=0.1)

# COMMAND ----------

# MAGIC %md
# MAGIC Under the hood, HorovodRunner takes a Python method that contains deep learning training code with Horovod hooks. The method is pickled on the driver and sent to Spark workers. The Horovod MPI job is embedded as a Spark job using barrier execution mode: the first executor collects the IP addresses of all task executors using `BarrierTaskContext` and triggers a Horovod job using `mpirun`. Each Python MPI process loads the pickled user program, deserializes it, and runs it.
# MAGIC
# MAGIC For more information on the HorovodRunner API, see the [documentation](https://databricks.github.io/spark-deep-learning/docs/_site/api/python/index.html#sparkdl.HorovodRunner). Note that you can use `np=-1` to spawn a subprocess on the driver node for a quicker development cycle:
# MAGIC ```
# MAGIC hr = HorovodRunner(np=-1)
# MAGIC hr.run(run_training)
# MAGIC ```
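# COMMAND ----------

# MAGIC %md
# MAGIC The `train_hvd` used here wires Horovod into Keras; its core hooks usually look like this sketch (assumed names such as `build_model`; the real notebook may differ):

# COMMAND ----------

import tensorflow as tf
import horovod.tensorflow.keras as hvd

def train_hvd(learning_rate):
    hvd.init()
    # Scale the learning rate by the number of workers, then wrap the
    # optimizer so gradient averaging happens across workers.
    optimizer = tf.keras.optimizers.SGD(lr=learning_rate * hvd.size())
    optimizer = hvd.DistributedOptimizer(optimizer)
    model = build_model()  # assumed helper
    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])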
@classmethod
def main(cls, units, **kwargs):
    hr = HorovodRunner(np=units)
    hr.run(SparkHorovod.train, **kwargs)
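# Example invocation (a sketch; assumes SparkHorovod defines train(**kwargs)
# and that the cluster has two workers):
SparkHorovod.main(units=2, learning_rate=0.001)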