def test_run(self):
    """Test that run just invokes the main method in the same process."""
    hr = HorovodRunner(np=-1)
    data = []

    def append(value):
        data.append(value)

    hr.run(append, value=1)
    self.assertEqual(data[0], 1)

def test_return_value(self):
    """Test that the return value is returned to the user."""
    hr = HorovodRunner(np=-1)
    return_value = hr.run(lambda: 42)
    self.assertEqual(return_value, 42)

def test_init_keyword_only(self):
    """Test that the user must use keyword args in __init__."""
    with self.assertRaises(TypeError):
        HorovodRunner(2)
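
# A minimal sketch (an assumption, not HorovodRunner's actual source) of why the
# positional call above raises TypeError: a bare `*` in the signature makes all
# following parameters keyword-only.
class KeywordOnlyInitExample:
    def __init__(self, *, np):  # hypothetical, reduced signature
        self.num_processes = np

# KeywordOnlyInitExample(2)     # raises TypeError
# KeywordOnlyInitExample(np=2)  # OK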
            # Save a checkpoint from the first worker only.
            if hvd.rank() == 0:
                save_checkpoint(model, optimizer_hvd, epoch)

            val_loss, val_acc = evaluate(model,
                                         val_dataloader_iter,
                                         validation_steps,
                                         device,
                                         metric_agg_fn=metric_average)

    return val_loss, val_acc
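
# `metric_average` is referenced above via `metric_agg_fn` but not shown in this
# snippet; a minimal sketch, assuming the standard horovod.torch allreduce API
# (which averages across processes by default):
import horovod.torch as hvd
import torch


def metric_average(value, name):
    tensor = torch.tensor(value)
    avg_tensor = hvd.allreduce(tensor, name=name)  # mean across all workers
    return avg_tensor.item()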


# COMMAND ----------

hr = HorovodRunner(np=10)  # This assumes the cluster consists of 10 workers.
hr.run(train_and_evaluate_hvd)

# COMMAND ----------

# DBTITLE 1,Review Checkpoint Files
display(dbutils.fs.ls('dbfs:/ml/horovod_pytorch/take2/PetaFlights'))

# COMMAND ----------

# DBTITLE 1,Single Worker Test Set
NUM_EPOCHS = 1
BATCH_SIZE = 100

    # NOTE: the head of this fit call is reconstructed to match the
    # continuation lines below; x_train/y_train are assumed from the
    # validation slice they take.
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              validation_data=(x_train[test_start:test_end, :],
                               y_train[test_start:test_end, :]),
              epochs=epochs,
              callbacks=callbacks)
    tf.keras.models.save_model(model,
                               model_path,
                               overwrite=True,
                               include_optimizer=True)
    score = model.evaluate(x_test, y_test, batch_size=batch_size)
    tf.logging.info("Score: {}".format(score))
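
# `callbacks` is referenced above but not defined in this snippet; a minimal
# sketch, assuming plain Keras callbacks for this single-worker test (the
# checkpoint path and early-stopping settings are hypothetical):
import tensorflow as tf

callbacks = [
    tf.keras.callbacks.ModelCheckpoint('/tmp/single_worker_weights.hdf5'),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
]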


# COMMAND ----------

import datetime

from sparkdl import HorovodRunner

hr = HorovodRunner(np=2)
now = datetime.datetime.now()
model_name = "sentiment_model_" + now.strftime("%Y%m%d%H%M") + ".h5"
model_checkpoints = model_name + "_weights.hdf5"
logdir = "/dbfs/FileStore/ml/logs/multi/" + model_name
checkpoints_path = "/dbfs/mnt/models/" + model_checkpoints
model_save_path = "/dbfs/mnt/models/" + model_name
hr.run(train,
       epochs=80,
       logdir=logdir,
       model_path=model_save_path,
       checkpoints_path=checkpoints_path)
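
# The `train` function launched above is defined elsewhere in the notebook; a
# minimal sketch, assuming the standard Horovod Keras pattern, of how its
# rank-0-only checkpointing and logging are typically wired:
import horovod.tensorflow.keras as hvd
import tensorflow as tf


def make_callbacks(logdir, checkpoints_path):
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    if hvd.rank() == 0:
        # Only the first worker writes checkpoints and TensorBoard logs, so
        # the workers do not clobber each other on the shared /dbfs paths.
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoints_path))
        callbacks.append(tf.keras.callbacks.TensorBoard(logdir))
    return callbacks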

# COMMAND ----------
    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for epoch in range(1, num_epochs + 1):
        train_one_epoch(model, device, train_loader, optimizer, epoch)
        # Only save checkpoints on the first worker.
        if hvd.rank() == 0:
            save_checkpoint(model, optimizer, epoch)
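
# `save_checkpoint` is referenced above but not shown; a minimal sketch using
# the standard torch.save API (the checkpoint path is hypothetical):
import torch


def save_checkpoint(model, optimizer, epoch):
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    torch.save(checkpoint, '/dbfs/ml/checkpoint-{}.pth'.format(epoch))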


# COMMAND ----------

# MAGIC %md
# MAGIC With the function `run_training_horovod` defined previously with Horovod hooks, you can construct a `HorovodRunner` and run distributed training.

# COMMAND ----------

hr = HorovodRunner(np=2)  # This assumes the cluster consists of two workers.
hr.run(train_hvd, learning_rate=0.001)

# COMMAND ----------

# MAGIC %md
# MAGIC Under the hood, HorovodRunner takes a Python method that contains deep learning training code with Horovod hooks. This method gets pickled on the driver and sent to Spark workers. A Horovod MPI job is embedded as a Spark job using the barrier execution mode. The first executor collects the IP addresses of all task executors using BarrierTaskContext and triggers a Horovod job using `mpirun`. Each Python MPI process then unpickles the user program and runs it.
# MAGIC
# MAGIC For further information on the HorovodRunner API, refer to the [documentation](https://databricks.github.io/spark-deep-learning/docs/_site/api/python/index.html#sparkdl.HorovodRunner). Note that you can use `np=-1` to run training in a subprocess on the driver node for a quicker development cycle.
# MAGIC ```
# MAGIC hr = HorovodRunner(np=-1)
# MAGIC hr.run(run_training)
# MAGIC ```
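
# COMMAND ----------

# A minimal sketch (an illustration, not this notebook's actual `train_hvd`) of
# the Horovod hooks such a training function typically contains; `build_model`
# is a hypothetical helper:
def train_hvd_sketch(learning_rate=0.001):
    import horovod.tensorflow.keras as hvd
    import tensorflow as tf

    hvd.init()  # Initialize Horovod inside each MPI process.
    model = build_model()  # hypothetical model-construction helper
    # Scale the learning rate by the number of workers and wrap the optimizer
    # so gradients are averaged across all processes.
    optimizer = hvd.DistributedOptimizer(
        tf.keras.optimizers.SGD(learning_rate * hvd.size()))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# COMMAND ----------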
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))


# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have a training function with Horovod hooks, we can use `HorovodRunner` to run distributed training. To run this example on a cluster with two workers, each with a single GPU, initialize `HorovodRunner` with `np=2`:

# COMMAND ----------

from sparkdl import HorovodRunner

hr = HorovodRunner(np=2)
hr.run(train_hvd, learning_rate=0.1)

# COMMAND ----------

@classmethod
def main(cls, units, **kwargs):
    hr = HorovodRunner(np=units)
    hr.run(SparkHorovod.train, **kwargs)
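
# Hypothetical usage: run SparkHorovod.train on 4 Horovod processes, forwarding
# any training keyword arguments (names assumed for illustration):
# SparkHorovod.main(units=4, epochs=10, learning_rate=0.001)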