    def init_horovod_if_needed(self):
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            rank_response = self._master_client.get_comm_rank()
            if rank_response.rank_id < 0:
                logger.warning(
                    "The master has not added the worker host into "
                    "rendezvous yet. Retrying to get rank"
                )
                time.sleep(5)
            else:
                break

        # If the rendezvous id from the master differs from
        # self._rendezvous_id, the worker should rebuild the communication
        # because the master has updated the communication group.
        if rank_response.rendezvous_id != self._rendezvous_id:
            os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
                rank_response.rendezvous_port
            )
            os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
            os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
            hvd.shutdown()
            hvd.init()
            self._world_size = hvd.size()
            self._rendezvous_id = rank_response.rendezvous_id
            self._need_broadcast = True

    def test_train_minibatch(self):
        self._trainer.init_horovod_if_needed()
        features = tf.constant([[0.5], [0.6], [0.7]])
        labels = tf.constant([[1.0], [0.0], [1.0]])
        _, version, loss = self._trainer.train_minibatch(features, labels)
        self.assertEqual(version, 1)
        self.assertIsNotNone(loss)
        hvd.shutdown()

    def test_training_process_with_fault_tolerance(self):
        self._trainer.init_horovod_if_needed()
        features = tf.constant([[0.5], [0.6], [0.7]])
        labels = tf.constant([[1.0], [0.0], [1.0]])
        version, _ = self._trainer.training_process_with_fault_tolerance(
            features, labels)
        # First, the model is called locally to create the model and
        # optimizer variables; then the model is trained using Horovod,
        # so the iteration step equals 2.
        self.assertEqual(version, 2)
        hvd.shutdown()
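
The method above only marks self._need_broadcast; how that flag is consumed is
not shown here. Below is a minimal sketch of a hypothetical helper (not
ElasticDL's actual code) that a trainer like this might call at the start of
the next step; self._model and self._optimizer are assumed attributes.

import horovod.tensorflow as hvd

def _broadcast_model_if_needed(self):
    # After init_horovod_if_needed() rebuilds the ring, broadcast model and
    # optimizer state from rank 0 so every worker resumes consistently.
    if self._need_broadcast:
        hvd.broadcast_variables(self._model.variables, root_rank=0)
        hvd.broadcast_variables(self._optimizer.variables(), root_rank=0)
        self._need_broadcast = False
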
Example #4
def stop_train(rank):
    # hvd shutdown
    logging.info('hvd shutdown at rank %d', rank)
    hvd.shutdown()
Example #5
    def test_static(self):
        mpi_rank, mpi_size = mpi_env_rank_and_size()
        gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
        gloo_size = int(os.getenv('HOROVOD_SIZE', -1))
        is_mpi = gloo_rank == -1

        rank = max(mpi_rank, gloo_rank)
        size = max(mpi_size, gloo_size)

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        if is_mpi:
            try:
                import mpi4py
                mpi4py.rc.initialize = False
            except ImportError:
                pass

        if rank == 0:
            my_process_sets = [
                hvd.ProcessSet([0]),
                hvd.ProcessSet(range(1, size)),
                hvd.ProcessSet(range(size - 1, -1, -1)),  # duplicate
                hvd.ProcessSet([0])  # duplicate
            ]
        else:
            my_process_sets = [
                hvd.ProcessSet([0]),
                hvd.ProcessSet(reversed(range(
                    1, size))),  # permuting a process set does not matter
                hvd.ProcessSet(range(size - 1, -1, -1)),  # duplicate
                hvd.ProcessSet([0])  # duplicate
            ]
        with self.assertRaises(ValueError):
            hvd.init(process_sets=my_process_sets)

        if rank == 0:
            my_process_sets = [
                hvd.ProcessSet([0]),
                hvd.ProcessSet(range(1, size)),
            ]
        else:
            my_process_sets = [
                hvd.ProcessSet([0]),
                hvd.ProcessSet(reversed(range(
                    1, size))),  # permuting a process set does not matter
            ]
        hvd.init(process_sets=my_process_sets)

        self.assertEqual(hvd.global_process_set.process_set_id, 0)
        self.assertListEqual(hvd.global_process_set.ranks, list(range(size)))

        # Here we test some implementation details (numeric process set id values) using an internal function.
        ps = hvd.mpi_ops._basics._get_process_set_ids_and_ranks()
        self.assertDictEqual(ps, {
            0: list(range(size)),
            1: [0],
            2: list(range(1, size))
        })

        # If another process initiates shutdown while this process is still processing _get_process_set_ids_and_ranks(),
        # a race condition may be triggered. Avoid with a barrier.
        try:
            if is_mpi:
                # barrier before shutdown
                from mpi4py import MPI
                MPI.COMM_WORLD.barrier()
            else:
                time.sleep(0.1)
        except ImportError:
            time.sleep(0.1)

        hvd.shutdown()
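
For context, here is a minimal sketch (an illustration, not part of the test
above) of how a registered process set is used once hvd.init(process_sets=...)
succeeds: collectives accept a process_set argument that restricts them to the
member ranks. The subset [0, 1] and the tensor are arbitrary choices and assume
a world size greater than two, so the set is not a duplicate of the global set.

import tensorflow as tf
import horovod.tensorflow as hvd

subset = hvd.ProcessSet([0, 1])  # hypothetical subset; assumes size > 2
hvd.init(process_sets=[subset])
if hvd.rank() in (0, 1):
    # Sum a scalar only across ranks 0 and 1.
    total = hvd.allreduce(tf.constant([1.0]), op=hvd.Sum, process_set=subset)
hvd.shutdown()
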
Example #6
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    #(train_data, train_labels), (eval_data, eval_labels) = \
    #    keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())
    f = np.load("/opt/ml/input/data/training/mnist.npz")
    train_data, train_labels = f['x_train'], f['y_train']
    eval_data, eval_labels = f['x_test'], f['y_test']
    print('train_labels: ', train_labels[0], 'rank: ', hvd.rank())

    # The shape of the downloaded data is (-1, 28, 28), so we reshape it to
    # (-1, 784) to feed it into our network. We also normalize the features
    # to the range [0, 1].
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    starttime = time.time()
    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = '/opt/ml/model/' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=1000)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=1000 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
    print('total time: ', time.time() - starttime, 'rank: ', hvd.rank())
    print(hvd.rank(), " shutdown")
    hvd.shutdown()
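
The Estimator above relies on cnn_model_fn, which is not shown in this excerpt.
As an assumed companion (not the original model_fn), the usual Horovod recipe
inside the model_fn scales the learning rate by hvd.size() and wraps the local
optimizer in hvd.DistributedOptimizer; the optimizer type and base learning
rate below are placeholders.

import tensorflow as tf
import horovod.tensorflow as hvd

def make_distributed_optimizer(base_lr=0.001):
    # Scale the learning rate with the number of workers and average gradients
    # across ranks via Horovod's optimizer wrapper.
    opt = tf.train.MomentumOptimizer(learning_rate=base_lr * hvd.size(),
                                     momentum=0.9)
    return hvd.DistributedOptimizer(opt)
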
Example #7

    def test_multi_comm(self):
        gloo_size = int(os.getenv('HOROVOD_SIZE', -1))
        if gloo_size != -1:
            self.skipTest("This test is specific to MPI and does not apply with Gloo controller.")

        try:
            from mpi4py import MPI
        except ImportError:
            self.skipTest("This test requires mpi4py.")

        # This will be our baseline world communicator
        comm = MPI.COMM_WORLD

        size = comm.size
        if size < 2:
            self.skipTest("This test requires multiple workers.")

        # Split COMM_WORLD into subcommunicators
        subcomm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2,
                                       key=MPI.COMM_WORLD.rank)
        comm_clone = comm.Dup()
        subcomm_clone = subcomm.Dup()
        subcomm_effective_clone = hvd.ProcessSet(range(0, comm.size, 2))  # identified as a clone on even ranks

        # 3+ duplicates
        my_process_sets = [hvd.ProcessSet(subcomm),
                           hvd.ProcessSet(comm_clone),
                           hvd.ProcessSet(subcomm_clone),
                           subcomm_effective_clone,
                           hvd.ProcessSet([0]),
                           ]
        with self.assertRaises(ValueError):
            hvd.init(comm=comm, process_sets=my_process_sets)

        ## Internally Horovod has been initialized successfully, but we need to call hvd.init() with a valid list of
        ## process sets to proceed.

        # 2+ duplicates
        my_process_sets = [hvd.ProcessSet(subcomm),
                           hvd.ProcessSet(comm_clone),
                           subcomm_effective_clone,
                           hvd.ProcessSet([0]),
                           ]
        with self.assertRaises(ValueError):
            hvd.init(comm=comm, process_sets=my_process_sets)

        # 1+ duplicates
        my_process_sets = [hvd.ProcessSet(subcomm),
                           hvd.ProcessSet(comm_clone),
                           hvd.ProcessSet([0]),
                           ]
        with self.assertRaises(ValueError):
            hvd.init(comm=comm, process_sets=my_process_sets)

        # 1+ duplicates
        my_process_sets = [hvd.ProcessSet(subcomm),
                           subcomm_effective_clone,
                           hvd.ProcessSet([0]),
                           ]
        if hvd.size() == 2 or hvd.rank() % 2 == 0:
            with self.assertRaises(ValueError):
                hvd.init(comm=comm, process_sets=my_process_sets)
        else:
            hvd.init(comm=comm, process_sets=my_process_sets)

        # no duplicates
        if size > 2:
            my_process_sets = [hvd.ProcessSet(subcomm),
                               hvd.ProcessSet([0]),
                               ]
            hvd.init(comm=comm, process_sets=my_process_sets)
        else:
            my_process_sets = [hvd.ProcessSet(subcomm), ]
            hvd.init(comm=comm, process_sets=my_process_sets)


        self.assertEqual(hvd.global_process_set.process_set_id, 0)
        self.assertListEqual(hvd.global_process_set.ranks, list(range(size)))
        self.assertEqual(hvd.global_process_set.mpi_comm, comm)

        # Here we test some implementation details (numeric process set id values) using an internal function.
        ps = hvd.mpi_ops._basics._get_process_set_ids_and_ranks()
        if size > 2:
            self.assertDictEqual(ps, {0: list(range(size)),
                                      1: list(range(0, size, 2)),
                                      2: list(range(1, size, 2)),
                                      3: [0],
                                      })
        else:
            self.assertDictEqual(ps, {0: list(range(size)),
                                      1: list(range(0, size, 2)),
                                      2: list(range(1, size, 2)),
                                      })

        if hvd.rank() % 2 == 0:
            self.assertEqual(my_process_sets[0].process_set_id, 1)
        else:
            self.assertEqual(my_process_sets[0].process_set_id, 2)

        # If another process initiates shutdown while this process is still processing _get_process_set_ids_and_ranks(),
        # a race condition may be triggered. Avoid with a barrier.
        MPI.COMM_WORLD.barrier()

        hvd.shutdown()
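
As a point of comparison (not part of the test above), Horovod can also be
initialized directly on an MPI subcommunicator instead of registering process
sets, which gives each split its own independent Horovod world. A minimal
sketch, assuming an MPI launch with mpi4py available:

from mpi4py import MPI
import horovod.tensorflow as hvd

# Split the world into even and odd ranks, then run Horovod per split.
subcomm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2,
                               key=MPI.COMM_WORLD.rank)
hvd.init(comm=subcomm)
print("global rank", MPI.COMM_WORLD.rank,
      "-> horovod rank", hvd.rank(), "of", hvd.size())
hvd.shutdown()
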
Example #8
    def test_horovod_linear_regression(self):

        import horovod.tensorflow as hvd
        # Horovod: initialize Horovod.
        hvd.init()

        logdir = args.logdir + '/horovod_test/'
        if hvd.rank() == 0:
            if not os.path.exists(logdir):
                os.makedirs(logdir)

        assert args.training_size % hvd.size() == 0
        assert (args.training_size // hvd.size()) % args.batch_size == 0

        training_data_filename = logdir + 'training_data.npy'
        if hvd.rank() == 0:
            with open(training_data_filename, 'wb') as f:
                full_training_data = np.random.random(
                    size=(args.training_size, ))
                full_training_data.tofile(f)
            print("Full training data:")
            print(full_training_data)
        hvd.allgather(tf.constant([0]))

        with open(training_data_filename, 'rb') as f:
            training_data = np.fromfile(f)
            training_data_size = training_data.shape[0]
            local_training_data_size = \
                (training_data.shape[0] + hvd.size() - 1) // hvd.size()
            local_training_data_begin = hvd.rank() * local_training_data_size
            if (hvd.rank() == hvd.size() - 1
                    and training_data.shape[0] % local_training_data_size != 0):
                local_training_data_size = (
                    training_data.shape[0] % local_training_data_size)
            local_training_data = training_data[
                local_training_data_begin:
                local_training_data_begin + local_training_data_size]
            print("Local training data:")
            print(local_training_data)

        # Define Tensorflow graph
        graph = tf.Graph()

        with graph.as_default():
            x_ph = tf.placeholder(tf.float32,
                                  shape=[None, training_data_size],
                                  name='x')
            y_ph = tf.placeholder(tf.float32, shape=[None], name='y')
            w = tf.Variable(np.zeros((training_data_size, )),
                            dtype=tf.float32,
                            name='w')
            loss_func = tf.constant(0.5) * tf.reduce_sum(
                tf.square(y_ph - tf.tensordot(x_ph, w, axes=1)))

            opt = tf.train.GradientDescentOptimizer(learning_rate=hvd.size() *
                                                    1.0)
            # Horovod: wrap local optimizer in distributed Horovod optimizer.
            opt = hvd.DistributedOptimizer(opt)

            #train_step = opt.minimize(loss_func)
            #grads_and_vars = opt.compute_gradients(loss_func)
            train_step = opt.minimize(
                loss_func)  # apply_gradients(grads_and_vars)

            config = tf.ConfigProto()
            config.intra_op_parallelism_threads = 22
            config.inter_op_parallelism_threads = 8
            config.gpu_options.allow_growth = True
            config.allow_soft_placement = True

            # Horovod
            # Add variable initializer.
            init = tf.global_variables_initializer()

            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            bcast = hvd.broadcast_global_variables(0)

        print("Local training data:")
        print(local_training_data)
        print('Opening tf.Session...')
        with tf.Session(graph=graph, config=config) as sess:
            # We must initialize all variables before we use them.
            init.run()
            bcast.run()
            print('Initialized all Horovod ranks.')
            print('Begin training - batch size = {}.'.format(args.batch_size),
                  flush=True)

            if hvd.rank() == 0:
                training_writer = tf.summary.FileWriter(logdir,
                                                        graph=sess.graph)

            num_batches = (local_training_data_size + args.batch_size
                           - 1) // args.batch_size
            for i in range(num_batches):
                batch_begin = i * args.batch_size
                # The final batch may be partial, though the asserts above
                # guarantee full batches in this test.
                batch_size = min(args.batch_size,
                                 local_training_data_size - batch_begin)
                x = np.zeros(shape=(batch_size, training_data_size))
                x[np.arange(x.shape[0]), local_training_data_begin +
                  batch_begin + np.arange(x.shape[0])] = 1.0
                y = local_training_data[batch_begin:batch_begin + batch_size]

                feed_train = {x_ph: x, y_ph: y}

                # Only compute shuffle indices, do not compute shuffled data set to avoid memory error
                time_before = time.time()
                #grad_op = grads_and_vars[0][0]
                loss, _ = sess.run([loss_func, train_step],
                                   feed_dict=feed_train)
                time_after = time.time()
                print(
                    'Step {0:5d}  -  loss: {1:6.2f}  -  latency: {2:6.2f} ms.'.
                    format(i, loss, 1000 * (time_after - time_before)),
                    flush=True)

            print('Finished training - local residual w - y_training is:')
            local_residual = sess.run([
                w[local_training_data_begin:local_training_data_begin +
                  local_training_data_size] - y_ph
            ],
                                      feed_dict={y_ph: local_training_data})
            print(local_residual)
            print('Locally trained variable components')
            print(
                sess.run([
                    w[local_training_data_begin:local_training_data_begin +
                      local_training_data_size]
                ]))
            print('Local training data')
            print(local_training_data)
            self.assertTrue(
                np.allclose(local_residual,
                            np.zeros(len(local_residual), ),
                            rtol=1e-7))

        if hvd.rank() == 0:
            os.remove(training_data_filename)
        hvd.allgather(tf.constant([0]))
        hvd.shutdown()
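
A note on the hvd.allgather(tf.constant([0])) calls above: they are intended as
a barrier so no rank reads the training file before rank 0 finishes writing it.
The sketch below restates that idiom; it only synchronizes when the collective
actually executes (immediately under eager TF2, while in graph mode the
returned op would have to be run in a session to have the same effect).

import tensorflow as tf
import horovod.tensorflow as hvd

def simple_barrier():
    # Gather a dummy scalar from every rank; the call completes only once all
    # ranks have reached it.
    hvd.allgather(tf.constant([0]))
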
Example #9
def run_gnn(args,model_ops,test_items,train_items=None,optimizer=None):
    # Split ops.
    inputs_p_ph, inputs_l_ph, targets_ph, inputs_p_op, inputs_l_op, targets_op, output_ops, loss_op, step_op = model_ops
    # Create new TF session.
    banner_print("Create TF config / session.")
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True
    if args.hvd:
        import horovod.tensorflow as hvd
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        # Some TensorFlow versions (< 1.15) have issues with allocating all
        # device memory; in that case, uncomment the following line.
        #config.gpu_options.per_process_gpu_memory_fraction = 0.5
        checkpoint_dir = './checkpoints' if RANK == 0 else './checkpoints_test'
    else:
        config.gpu_options.visible_device_list = str(0)
        checkpoint_dir = './checkpoints'
    try:
        os.mkdir(checkpoint_dir)
    except OSError:
        print("Creation of directory %s failed!" % checkpoint_dir)
    else:
        print("Successfully created directory %s." % checkpoint_dir)
    sess = tf.Session(config=config)
    # Initialize Model.
    if(RANK==0):
        print("All workers are initializing global variables...")
    #All ranks should initialize their variables before 
    #loading checkpoints or getting broadcast variables 
    #from rank 0
    sess.run(tf.global_variables_initializer())
    if(RANK==0):
        print("Done global variables init.")

    saver = tf.train.Saver()
    model_path = checkpoint_dir + '/model%s.ckpt'%RANK
    restore_path = model_path
    
    if RANK==0:
        # Test Save / Restore model with Rank 0.
        save_path = saver.save(sess, model_path)
        print("Coordinator Test checkpoint saved to: %s"%save_path)
    else:
        save_path = saver.save(sess, model_path)
        print("Worker test checkpoint saved to: %s"%save_path)

    
    if args.restore is not None:
        restore_path = args.restore
        print("Restoring model from: %s" % restore_path)
        saver.restore(sess, restore_path)
        print("Model restored successfully.")
        print("To resume training use --restore %s"%str(os.getcwd()+"/"+restore_path))
    else:
        restore_path=model_path
        saver.restore(sess, restore_path)
        print("Worker fresh checkpoint restore test success.")
        print("To resume training use --restore %s"%str(os.getcwd()+"/"+restore_path))
        print("Training new model.")

    # Print total model parameters.
    if 0:
        total_parameters = 0
        for variable in tf.trainable_variables():
            variable_parameters = 1
            for dim in variable.get_shape():
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print("Total trainable params: ", total_parameters)

    if args.hvd:
        print("Broadcasting...")
        import horovod.tensorflow as hvd
        bcast_op = hvd.broadcast_global_variables(0)
        sess.run(bcast_op)
        time.sleep(10)
        print("Done broadcast")


    # Training / inference loop.
    banner_print("Start training / testing loop.")
    acc_best = 0.0
    epoch_best = 0
    log_epochs, solveds_tr, solveds_ge, losses_tr, losses_ge, lr_hist = ([],[],[],[],[],[])
    for epoch in range(args.epochs):
        if RANK == 0:
            print("Epoch %d:"%(epoch))
            log_epochs.append(epoch)
        # Run training step.
        if not INFERENCE_ONLY:
            if RANK == 0:
                print("  Training.")
            elapsed, solved, loss, count = run_batches(sess, lambda: item_batch_iter(train_items,args.batch_size),
                                                       inputs_p_ph, inputs_l_ph, targets_ph,
                                                       inputs_p_op, inputs_l_op, targets_op,
                                                       output_ops, step_op, loss_op)
            acc = solved / count
            loss = loss / count
            lr = sess.run(optimizer._learning_rate)
            if args.hvd:
                acc, solved, loss = average_distributed_metrics(sess, acc, solved, loss)
                count = hvd.size()*count
                solved = hvd.size()*solved
            if RANK == 0:
                print("    Time:  %.1fs"%(elapsed))
                print("    LrnR:  %.6f"%lr)
                print("    Loss:  %f"%(loss))
                print("    Acc.:  %f  (%.1f/%.1f)"%(acc,solved,count))
                solveds_tr.append(acc)
                losses_tr.append(loss)
                lr_hist.append(lr)
        # Run a test step.
        if RANK == 0:
            print("  Testing.")
        elapsed, solved, loss, count = run_batches(sess, lambda: item_batch_iter(test_items,args.batch_size_test, shuffle=False),
                                                   inputs_p_ph, inputs_l_ph, targets_ph,
                                                   inputs_p_op, inputs_l_op, targets_op,
                                                   output_ops, None, loss_op)
        acc = solved / count
        loss = loss / count
        if args.hvd:
            acc, solved, loss = average_distributed_metrics(sess, acc, solved, loss)
            count = hvd.size()*count
            solved = hvd.size()*solved
        if RANK == 0:
            print("    Time:  %.1fs"%(elapsed))
            print("    Loss:  %f"%(loss))
            print("    Acc.:  %f  (%.1f/%.1f)"%(acc,solved,count))
            solveds_ge.append(acc)
            losses_ge.append(loss)
            if(args.plot_history):
                plot_history(log_epochs, solveds_tr, solveds_ge, 'PharML-Accuracy', 'accuracy')
                plot_history(log_epochs, losses_tr, losses_ge, 'PharML-Loss','loss')
                plot_history(log_epochs, lr_hist, lr_hist, 'PharML-LR','learning rate')

        # Checkpoint if needed.
        if acc > acc_best and not INFERENCE_ONLY:
            acc_best = acc
            epoch_best = epoch
            if RANK == 0:
                print("  New Best Test Acc: ", acc_best)
                print("   -> Occurred at epoch ", epoch_best)
                sys.stdout.flush()
                save_path = saver.save(sess, model_path)
                print("   -> Saved checkpoint to %s"%(save_path))

        if INFERENCE_ONLY:
            # Exit loop after first inference if in inference only-mode.
            print("Inference only mode, done with single pass so exiting...")
            hvd.shutdown()
            break

        # If test accuracy has not improved for more than 15 epochs, call it
        # converged and exit. That is what was used in the paper, but we have
        # found that lowering this to 5 epochs is sufficient in some cases.
        if (epoch - epoch_best) >= 15 and not INFERENCE_ONLY:
            print("Model Converged! Exiting Nicely...")
            #sys.exit(0)
            hvd.shutdown()
            break
            
    # Success!
    banner_print("Success!")
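
average_distributed_metrics is called in the loop above but is not defined in
this excerpt. Below is a minimal sketch of one way such a helper could work in
this TF1/session setting (a hypothetical implementation, not the project's
actual one).

import tensorflow as tf
import horovod.tensorflow as hvd

def average_scalars(sess, *values):
    # Average Python scalars across all ranks with an allreduce. Note that
    # building new ops on every call grows the graph; a real implementation
    # would reuse placeholders instead.
    tensors = [tf.constant(float(v), dtype=tf.float32) for v in values]
    avg_ops = [hvd.allreduce(t, op=hvd.Average) for t in tensors]
    return sess.run(avg_ops)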