def _do_batch_all_reduce(self, reduce_op, dense_values):
    """Run batch all-reduces."""
    logging.log_first_n(
        logging.INFO, "batch_all_reduce: %d all-reduces with algorithm = %s, "
        "num_packs = %d, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d" %
        (len(dense_values), self._all_reduce_alg, self._num_packs,
         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)

    destinations = dense_values[0].devices
    grouped = _group_value_by_device(dense_values)

    device_grad_packs, tensor_packer = _pack_tensors(
        grouped, self._num_packs, self._agg_small_grads_max_bytes,
        self._agg_small_grads_max_group)

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees, so it is important to
    # strike the right balance with num_packs.
    if self._all_reduce_alg == "nccl":
      # TODO(yuefengz): merge this into the all-reduce library.
      reduced = cross_device_utils.aggregate_gradients_using_nccl(
          device_grad_packs)
    else:
      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
      # order.
      reduced = (
          cross_device_utils.aggregate_gradients_using_hierarchical_copy(
              destinations, device_grad_packs))

    reduced = _unpack_tensors(reduced, tensor_packer)
    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
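
For reference, the same algorithm choice and gradient-packing knobs are exposed through the public tf.distribute API. A minimal sketch (assuming TF >= 1.14, where these classes are available; the device list is illustrative):

import tensorflow as tf

# NCCL-based batch all-reduce; num_packs mirrors self._num_packs above.
nccl_ops = tf.distribute.NcclAllReduce(num_packs=1)

# Hierarchical-copy alternative, mirroring the non-NCCL branch above.
hier_ops = tf.distribute.HierarchicalCopyAllReduce(num_packs=1)

# Either object can be handed to a mirrored strategy as its cross-device ops.
strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"], cross_device_ops=nccl_ops)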
  def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    # `cluster` is expected to be a tf.train.ClusterSpec from the enclosing scope.
    cluster_spec = cluster.as_dict()
    # TODO: derive the GPU count from the cluster instead of hard-coding it,
    # e.g. num_gpus = len(cluster_spec["worker"]).
    num_gpus = 2
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
    # One optimizer per GPU so each tower applies the all-reduced gradients
    # on its own device.
    optimizers = [
        tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"]) for _ in range(num_gpus)
    ]

    if params["dtype"] == "fp16":
      optimizers = [tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) for optimizer in optimizers]

    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    grad_list = []
    losses = []
    logits = []
    for gpu_idx in range(num_gpus):
      device_setter = local_device_setter(cluster, worker_device="gpu:%d" % gpu_idx)
      with tf.device(device_setter):
        # Build one replica ("tower") of the network on each GPU.
        logit, loss = create_tower_network(model, params, features, labels)
        logits.append(logit)
        losses.append(loss)
        # Keep only the gradients that are defined for this tower.
        grad_list.append([
            x for x in optimizers[gpu_idx].compute_gradients(loss)
            if x[0] is not None
        ])

    # Average the per-tower logits and losses.
    output_train = tf.reduce_mean(logits, axis=0)
    loss_train = tf.reduce_mean(losses, name='loss')
    # Sum the per-tower gradients with NCCL, using TF's (private) helper.
    from tensorflow.python.distribute import cross_device_utils
    grads = cross_device_utils.aggregate_gradients_using_nccl(grad_list)
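    # NOTE: aggregate_gradients_using_nccl returns the same nested structure
    # it is given -- one list of (gradient, variable) pairs per tower -- with
    # each gradient summed across towers via NCCL all_sum. Because gradients
    # are summed rather than averaged, the effective learning rate scales
    # with num_gpus; scale the loss or the gradients by 1.0 / num_gpus if an
    # average is intended.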
    # Apply the all-reduced gradients on each GPU.
    train_ops = []
    for idx, grad_and_vars in enumerate(grads):
      with tf.name_scope('apply_gradients'), tf.device(
          tf.DeviceSpec(device_type="GPU", device_index=idx)):
        train_ops.append(optimizers[idx].apply_gradients(
            grad_and_vars, name='apply_grad_{}'.format(idx)))
    # Advance the global step once per training step, not once per GPU.
    global_step = tf.train.get_global_step()
    update_global_step = tf.assign(
        global_step, global_step + 1, name='update_global_step')
    optimize_op = tf.group(update_global_step, *train_ops, name='train_op')
    train_metrics = {"learning_rate": learning_rate}

    tf.identity(loss_train, "cross_entropy")

    if mode == tf.estimator.ModeKeys.TRAIN:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss_train, train_op=optimize_op)
    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=loss_train,
          predictions={"predictions": output_train},
          eval_metric_ops=metrics.get_eval_metrics(output_train, labels, params))
    if mode == tf.estimator.ModeKeys.PREDICT:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=output_train,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(output_train)
          })
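
A minimal sketch of wiring this model_fn into an Estimator (the model_dir, step counts, and train_input_fn below are illustrative placeholders, not from the original):

import tensorflow as tf

# Assumes `params` (the transformer hyperparameter dict used above) and a
# `train_input_fn` returning (features, labels) are defined elsewhere.
run_config = tf.estimator.RunConfig(
    model_dir="/tmp/transformer_model",  # placeholder checkpoint directory
    save_checkpoints_steps=1000)
estimator = tf.estimator.Estimator(
    model_fn=model_fn, params=params, config=run_config)
estimator.train(input_fn=train_input_fn, max_steps=100000)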