Example #1
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
  """Build a subgraph that does one full all-reduce, using NCCL.

  Args:
    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.  Must be one of
      {tf.add}
    un_op: optional unary elementwise Op to apply to fully-reduced values.

  Returns:
    list of T @{tf.Tensor} of reduced values.

  Raises:
    ValueError: red_op not supported.
  """
  if red_op == math_ops.add:
    output_tensors = nccl.all_sum(input_tensors)
  else:
    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
  if un_op:
    un_op_wrapped = []
    for t in output_tensors:
      with ops.colocate_with(t):
        un_op_wrapped.append(un_op(t))
    output_tensors = un_op_wrapped
  return output_tensors
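A minimal usage sketch for the helper above, assuming import tensorflow as tf, two visible GPUs, and made-up tensor values (none of these names appear in the original source):

# Hypothetical call site for build_nccl_all_reduce; devices and values are assumptions.
per_gpu = []
for i in range(2):
    with tf.device('/gpu:%d' % i):
        per_gpu.append(tf.constant([[1.0, 2.0], [3.0, 4.0]]))

# Sum across GPUs with NCCL, then scale each output in place to get an average.
averaged = build_nccl_all_reduce(per_gpu, math_ops.add, un_op=lambda t: t * 0.5)
# Every returned tensor should be evaluated together so each device's NCCL op runs.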
Example #2
def allreduce_grads(all_grads, average):
    """
    All-reduce and average the gradients across devices. Results are broadcast to all devices.

    Args:
        all_grads (K x N x 2): A list of K lists; each list holds N (grad, var) tuples.
            The variables have to be the same across the K lists.
        average (bool): average gradients or not.

    Returns:
        (K x N x 2): same as input, but each grad is replaced by the average over K lists.
    """
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # NVar * NGPU * 2
    with tf.name_scope('AvgGrad'):
        for grad_and_vars in zip(*all_grads):
            v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                with tf.device(g.device):
                    # tensorflow/benchmarks didn't average gradients
                    if average:
                        g = tf.multiply(g, 1.0 / nr_tower)
                    grads_for_a_var.append((g, v))
            new_all_grads.append(grads_for_a_var)

    # transpose
    ret = [k for k in zip(*new_all_grads)]
    return ret
Example #3
def allreduce_grads(all_grads, average):
    """
    All-reduce and average the gradients among K devices. Results are broadcast to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the average over K devices.
    """
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower)
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
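A small self-contained sketch of driving this variant, assuming import tensorflow as tf and two GPUs; the constant "gradients" are purely illustrative:

# Hypothetical inputs: K=2 towers, N=1 gradient tensor each, in the same variable order.
with tf.device('/gpu:0'):
    g0 = tf.constant([[1.0, 2.0]])
with tf.device('/gpu:1'):
    g1 = tf.constant([[3.0, 4.0]])

reduced = allreduce_grads([[g0], [g1]], average=True)  # returns K x N
# reduced[0][0] and reduced[1][0] both evaluate to [[2.0, 3.0]], one copy per GPU.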
Example #4
def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0',
                      usenccl=True):
    if len(devices) == 1:
        return tower_gradvars

    num_devices = len(devices)
    avg_gradvars = []
    for layer in zip(*tower_gradvars):
        grads_on_devices, vars_on_devices = zip(*layer)
        if have_nccl and usenccl:
            # Note: These nccl ops _must_ be run on all devices, else deadlock
            # print('ALL_AVG_GRADIENTS GRADS_ON_DEVICES:',
            #       grads_on_devices)  # DEBUG
            avg_grads_on_devices = nccl.all_sum(grads_on_devices)
            for d, device in enumerate(devices):
                with tf.device(device):
                    avg_grads_on_devices[d] *= 1. / num_devices
        else:
            with tf.device(param_server_device):
                avg_grad = tf.reduce_mean(tf.stack(grads_on_devices), 0)
            avg_grads_on_devices = [avg_grad] * num_devices
        avg_gradvars_on_devices = list(zip(avg_grads_on_devices, vars_on_devices))
        avg_gradvars.append(avg_gradvars_on_devices)

    return list(zip(*avg_gradvars))
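Because the NCCL ops above must run on all devices, a typical consumer builds one training op per device and groups them; a hedged sketch, with the per-tower (grad, var) input, devices, and optimizer all assumed for illustration:

# Minimal driver sketch: tower_gradvars is K towers x N layers of (grad, var) pairs.
devices = ['/gpu:0', '/gpu:1']
tower_gradvars = []
for d, device in enumerate(devices):
    with tf.device(device):
        v = tf.Variable([1.0], name='w_%d' % d)  # hypothetical per-tower variable
        g = tf.constant([0.5 * (d + 1)])         # hypothetical gradient for it
        tower_gradvars.append([(g, v)])

avg_gradvars = all_avg_gradients(tower_gradvars, devices, usenccl=True)
train_ops = []
for d, device in enumerate(devices):
    with tf.device(device):
        opt = tf.train.GradientDescentOptimizer(0.1)
        train_ops.append(opt.apply_gradients(avg_gradvars[d]))
# One grouped op runs the NCCL all-reduce on every device in the same session.run,
# which is what avoids the deadlock noted in the comment above.
train_op = tf.group(*train_ops)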
Example #5
def sum_grad_and_var_all_reduce(grad_and_vars, devices):
  # Note that each grad_and_vars looks like the following:
  #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))

  scaled_grads = [g for _, (g, _) in zip(devices, grad_and_vars)]
  summed_grads = nccl.all_sum(scaled_grads)

  result = []
  for d, (_, v), g in zip(devices, grad_and_vars, summed_grads):
    with tf.device(d):
      result.append((g, v))
  return result
Example #6
def aggregate_gradients_using_nccl(tower_grads):
  """Aggregate gradients using nccl allreduce."""
  agg_all_g_and_v = []
  for single_g_and_v in zip(*tower_grads):
    single_grads = [g for g, _ in single_g_and_v]
    agg_grads = nccl.all_sum(single_grads)
    agg_all_g_and_v.append(
        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])

  agg_all_g_and_v = list(zip(*agg_all_g_and_v))

  return agg_all_g_and_v
Example #7
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            summed_grads = nccl.all_sum(scaled_grads)
        elif alg == 'simple':
            summed_grads = build_reduce_sum(scaled_grads)
        elif alg == 'trivial':
            summed_grads = build_trivial_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device
                # for the second level.
                [aux_devices[0]],
                tf.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

        result = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            result.append([g, v])
        return result
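A minimal sketch of calling the dispatcher above on its plain 'nccl' path, assuming import tensorflow as tf and two GPUs; the variables and gradient values are illustrative only:

# One variable replicated per GPU, each with a hypothetical gradient.
with tf.device('/gpu:0'):
    v0 = tf.Variable([1.0])
    g0 = tf.constant([0.5])
with tf.device('/gpu:1'):
    v1 = tf.Variable([1.0])
    g1 = tf.constant([1.5])

summed = sum_grad_and_var_all_reduce([(g0, v0), (g1, v1)],
                                     num_workers=1, alg='nccl', gpu_indices=[0, 1])
# summed is [[g, v0], [g, v1]] with each g the NCCL sum ([2.0]) placed on its GPU.
# The other alg strings ('xring', 'nccl/pscpu', ...) dispatch to the all_reduce builders instead.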
Example #8
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            summed_grads = nccl.all_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device set
                # for the second level.
                [aux_devices[0]],
                tf.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

        result = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            result.append([g, v])
        return result
Example #9
    def _allreduce_grads(tower_grads):
        from tensorflow.contrib import nccl
        nr_tower = len(tower_grads)
        if nr_tower == 1:
            return [[x] for x in tower_grads[0]]
        new_tower_grads = []
        with tf.name_scope('AvgGrad'):
            for grad_and_vars in zip(*tower_grads):
                v = grad_and_vars[0][1]
                grads = [g for g, _ in grad_and_vars]
                summed = nccl.all_sum(grads)

                grads_for_a_var = []
                for (_, v), g in zip(grad_and_vars, summed):
                    with tf.device(g.device):
                        g = tf.multiply(g, 1.0 / nr_tower)
                        grads_for_a_var.append((g, v))
                new_tower_grads.append(grads_for_a_var)
        # NVar * NGPU * 2
        return new_tower_grads
Example #10
    def _allreduce_grads(tower_grads):
        from tensorflow.contrib import nccl
        nr_tower = len(tower_grads)
        if nr_tower == 1:
            return tower_grads[0]
        new_tower_grads = []
        with tf.name_scope('AvgGrad'):
            for grad_and_vars in zip(*tower_grads):
                v = grad_and_vars[0][1]
                grads = [g for g, _ in grad_and_vars]
                if not MultiGPUTrainerBase.check_none_grads(v.op.name, grads):
                    continue
                summed = nccl.all_sum(grads)

                grads_for_a_var = []
                for (_, v), g in zip(grad_and_vars, summed):
                    grads_for_a_var.append((g, v))
                new_tower_grads.append(grads_for_a_var)
        # NVar * NGPU * 2
        return new_tower_grads
Example #11
    def testCombined(self):
        if not test.is_gpu_available():
            return  # Test requires access to a GPU

        for dtype in [np.float32, np.int32, np.int64, np.float64]:
            # Create session inside outer loop to test use of
            # same communicator across multiple sessions.
            with self.test_session(use_gpu=True) as sess:
                for devices in [[
                        '/device:GPU:0', '/device:GPU:0', '/device:GPU:0'
                ], ['/device:GPU:0', '/device:GPU:0']]:
                    shape = (3, 4)

                    # all-reduce
                    np_ans = np.zeros(shape=shape, dtype=dtype)
                    tensors = []
                    for d in devices:
                        with ops.device(d):
                            t = ((np.random.random_sample(shape) - .5) *
                                 1024).astype(dtype)
                            np_ans += t
                            tensors.append(array_ops.identity(t))
                    all_reduce_tensors = nccl.all_sum(tensors)

                    sender = np.random.randint(0, len(devices) - 1)
                    other_devices = devices[:sender] + devices[sender + 1:]
                    send_op, received_tensors = nccl.broadcast(
                        all_reduce_tensors[sender], other_devices)

                    # sender doesn't need to be fetched as part of outputs of session.run.
                    del all_reduce_tensors[sender]

                    # Verify shape inference.
                    for r in received_tensors:
                        self.assertEqual(shape, r.get_shape())

                    # Run and verify results.
                    nccl_results = sess.run(received_tensors + [send_op] +
                                            all_reduce_tensors)
                    for r in nccl_results[:len(received_tensors)]:
                        self.assertAllClose(r, np_ans)
Example #12
def allreduce_gradients_bak(tower_grads):
    from tensorflow.contrib import nccl
    nr_tower = len(tower_grads)
    new_all_grads = []  # NVar * NGPU * 2
    with tf.name_scope('gradient_allreduce'):
        for grad_and_vars in zip(*tower_grads):
            #v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                with tf.device(g.device):
                    g = tf.multiply(g, 1.0 / nr_tower)
                    grads_for_a_var.append((g, v))
            new_all_grads.append(grads_for_a_var)

    # transpose
    ret = [list(k) for k in zip(*new_all_grads)]

    return ret
Example #13
def allreduce_grads(all_grads, average=True):
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads_and_vars in zip(*all_grads):
        grads = [g for g, _ in grads_and_vars]
        _vars = [v for _, v in grads_and_vars]
        summed = nccl.all_sum(grads)
        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower, name='allreduce_avg')
            grads_for_devices.append(g)
        new_all_grads.append(list(zip(grads_for_devices, _vars)))

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
Example #14
  def testCombined(self):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU

    for dtype in [np.float32, np.int32, np.int64, np.float64]:
      # Create session inside outer loop to test use of
      # same communicator across multiple sessions.
      with self.test_session(use_gpu=True) as sess:
        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
          shape = (3, 4)

          # all-reduce
          np_ans = np.zeros(shape=shape, dtype=dtype)
          tensors = []
          for d in devices:
            with ops.device(d):
              t = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
              np_ans += t
              tensors.append(array_ops.identity(t))
          all_reduce_tensors = nccl.all_sum(tensors)

          sender = np.random.randint(0, len(devices) - 1)
          other_devices = devices[:sender] + devices[sender + 1:]
          send_op, received_tensors = nccl.broadcast(all_reduce_tensors[sender],
                                                     other_devices)

          # sender doesn't need to be fetched as part of outputs of session.run.
          del all_reduce_tensors[sender]

          # Verify shape inference.
          for r in received_tensors:
            self.assertEqual(shape, r.get_shape())

          # Run and verify results.
          nccl_results = sess.run(
              received_tensors + [send_op] + all_reduce_tensors)
          for r in nccl_results[:len(received_tensors)]:
            self.assertAllClose(r, np_ans)
Example #15
  def testErrors(self):
    with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
      nccl.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
    with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
      nccl.all_sum([])
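For contrast, a sketch of a call that passes both checks above, explicit device placement and a non-empty input list; the device names are assumptions, the modules are the ones already used in the test:

# Hypothetical non-error usage: every input tensor carries an explicit device.
with ops.device('/device:GPU:0'):
  t0 = array_ops.identity(np.random.random_sample((3, 4)))
with ops.device('/device:GPU:1'):
  t1 = array_ops.identity(np.random.random_sample((3, 4)))
summed = nccl.all_sum([t0, t1])  # one summed tensor placed on each input device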
Example #16
  def __call__(self, **inputs):
    # Inputs
    images_splits = tf.split(axis=0, num_or_size_splits=self.num_gpus, value=inputs['images'])
    labels_splits = tf.split(axis=0, num_or_size_splits=self.num_gpus, value=inputs['labels'])

    # Inference 
    tower_grads = []
    tower_losses = []
    for device_id in xrange(self.num_gpus):
      with tf.variable_scope('replicated_%s' % device_id):
        with tf.name_scope('TOWER_%d' % device_id) as name_scope:
          with tf.device('/gpu:%d' % device_id):
            # Forward
            pre_logits = self.model(images_splits[device_id], is_training=True)
            logits = fully_connected(pre_logits, num_outputs=inputs['num_classes'], 
                                     activation_fn=None, biases_initializer=None,
                                     weights_regularizer=l2_regularizer(0.0005))
            # Losses
            losses, losses_name = loss_function(logits, labels_splits[device_id], scope=name_scope)
            total_loss = tf.add_n(losses, name='total_loss')

            # Variables 
            params = [v for v in tf.trainable_variables() if v.name.startswith('replicated_%s/' % device_id)]

            # Gradients
            grads = tf.gradients(total_loss, params, aggregation_method=tf.AggregationMethod.DEFAULT)
            grads = [grad/self.num_gpus for grad in grads]

            gradvars = list(zip(grads, params))

            for grad, var in gradvars:
              if grad is not None:
                tf.summary.histogram(var.name, var)
                tf.summary.histogram(var.op.name + '/gradients', grad)

            # Tower grads, losses and updates
            tower_grads.append(gradvars)
            tower_losses.append(losses)
            if device_id == 0:
              update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)

            print('Tower %d has been inferenced.' % device_id)

    # Allreduce losses
    allreduce_losses = [tf.add_n(losses)/self.num_gpus for losses in zip(*tower_losses)]

    # Allreduce gradients
    allreduce_grads = []
    for grad_and_vars in zip(*tower_grads):
      grads = [g for g, _ in grad_and_vars]
      summed_grads = nccl.all_sum(grads)
      new_grads_and_vars = [(g, v) for (_, v), g in zip(grad_and_vars, summed_grads)]
      allreduce_grads.append(new_grads_and_vars)
    grad_state = [list(x) for x in zip(*allreduce_grads)]

    # Optimizier
    tower_train_ops = []
    for device_id in xrange(self.num_gpus):
      with tf.device('/gpu:%d' % device_id):
        # Gradients of TOWER_(device_id)
        grads = grad_state[device_id]
        # Optimizer configure
        opt = tf.train.MomentumOptimizer(self.lr, 0.9)
        # Tower train_ops
        tower_train_ops.append(opt.apply_gradients(grads))

        print('Optimizer %d has been configured.' % device_id)

    global_step = tf.train.get_global_step()
    global_step_op = global_step.assign_add(1)
    train_ops = tf.group(*(tower_train_ops+update_ops+[global_step_op]))

    return train_ops, self.lr, allreduce_losses, losses_name
Example #17
    def testErrors(self):
        with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
            nccl.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
        with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
            nccl.all_sum([])
Example #18
import tensorflow as tf
from tensorflow.contrib.nccl import all_sum
# from tensorflow.contrib.rccl import all_sum

with tf.device('/gpu:0'):
    a = tf.get_variable(
        "a", initializer=tf.constant(1.0, shape=(2, 2)))

with tf.device('/gpu:1'):
    b = tf.get_variable(
        "b", initializer=tf.constant(2.0, shape=(2, 2)))

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True))

init = tf.global_variables_initializer()
sess.run(init)

with tf.device('/gpu:0'):
    summed = sess.run(all_sum([a, b]))

print(summed[0])
print(summed[1])

# expected output
# [[3. 3.]
#  [3. 3.]]
# [[3. 3.]
#  [3. 3.]]
Example #19
def model(X, S, Y, hps, train=False, ema=None):

    xs = tf.split(X, hps.ngpu, 1)
    ys = tf.split(Y, hps.ngpu, 1)
    ss = tf.split(S, hps.ngpu, 2 - hps.axis)

    losses = []
    states = []
    grads = []
    for gpu in range(hps.ngpu):
        with tf.device("/gpu:%d" % gpu), tf.variable_scope("model%d" % gpu,
                                                           reuse=not train):
            lstm_model = LSTM_Model(hps, train)
            loss, state = lstm_model.forward(xs[gpu],
                                             ss[gpu],
                                             ys[gpu],
                                             ema=ema)
            losses.append(loss)
            states.append(state)
            if train:
                grads.append(lstm_model.backward())

    if train:
        ngrads = len(grads[0])
        if hps.ngpu > 1:
            # all reduce grads
            for i in range(ngrads):

                sum_grads = nccl.all_sum(
                    [grads[gpu][i][0] for gpu in range(hps.ngpu)])
                for gpu in range(hps.ngpu):
                    grads[gpu][i] = (sum_grads[gpu], grads[gpu][i][1])

        train = list()
        for gpu, gpu_grads in enumerate(grads):
            with tf.device("/gpu:%d" % gpu), tf.variable_scope("opt%d" % gpu):

                # compute average from sum
                if hps.ngpu > 1:
                    for i in range(ngrads):
                        # Note: the scalar division must appear in a device context, otherwise
                        # it will do a whole lot of unnecessary GPU-to-GPU copying.
                        # Also rebuild the tuple.
                        gpu_grads[i] = (gpu_grads[i][0] / float(hps.ngpu),
                                        gpu_grads[i][1])

                if hps.optimizer == 'adam_old':
                    trainer = tf.train.AdamOptimizer(learning_rate=hps.lr,
                                                     beta2=hps.beta2)
                    train.append(trainer.apply_gradients(gpu_grads))
                else:
                    param_grads = [gpu_grads[i][0] for i in range(ngrads)]
                    param_names = [gpu_grads[i][1] for i in range(ngrads)]
                    if hps.optimizer == 'adam':
                        train.append(
                            layers.adam_updates(param_names,
                                                param_grads,
                                                lr=hps.lr,
                                                mom2=hps.beta2,
                                                gamma=hps.gamma))
                    if hps.optimizer == 'adamax':
                        train.append(
                            layers.adamax_updates(param_names,
                                                  param_grads,
                                                  lr=hps.lr,
                                                  mom2=hps.beta2))

        train = tf.group(*train)
    else:
        train = None

    states = tf.concat(states, 2 - hps.axis)

    return train, tf.add_n(losses) / hps.ngpu, states
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_gpus', default=2, type=int)
    parser.add_argument('--max_step', default=1000, type=int)
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
        [str(i) for i in range(args.num_gpus)])

    # avoid unimplemented gpu kernel error
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:

        dataset = build_dataset(args.num_gpus)
        iterator = dataset.make_initializable_iterator()
        tower_batches = iterator.get_next()

        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        for index, tower_batch in enumerate(tower_batches):
            # by-device variable scope
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):

                tower_loss = build_tower(tower_batch)
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_loss, tower_tvars)

                tower_loss_list.append(tower_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)

                if index == 0:
                    # only one variable global saver
                    def clean(name):
                        name = re.sub(r'^tower_\d+/', '', name)
                        name = re.sub(r':\d+$', '', name)
                        return name

                    save_dict = {clean(var.name): var for var in tower_gvars}
                    saver = tf.train.Saver(save_dict)

        with tf.name_scope("tower_gvar_sync"):
            # each device's variables are initialized with a different random seed,
            # so explicit synchronization is needed before training!
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for vars in zip(*tower_gvars_list):
                    for var in vars[1:]:
                        sync_ops.append(tf.assign(var, vars[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                # nccl.all_sum will automatically
                # convert sparse gradients into dense one
                avg_tower_grads_list.append(nccl.all_sum(grads_to_avg))
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.name_scope('metrics'):
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)

        train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                 tf.device('/gpu:%d' % index):
                tower_grads = [
                    grad / len(tower_batches) for grad in tower_grads
                ]
                if index == 0:
                    # only increment global step with the first worker
                    step = tf.train.get_or_create_global_step()

                tower_optimizer = tf.train.AdamOptimizer()
                tower_train_op = tower_optimizer.apply_gradients(
                    zip(tower_grads, tower_vars),
                    global_step=step if index == 0 else None)
                train_ops.append(tower_train_op)
        train_op = tf.group(train_ops)

        # start running
        sess.run(tf.global_variables_initializer())
        sess.run(iterator.initializer)
        # important to sync variables before training!
        sess.run(tower_gvar_sync)
        while True:
            try:
                fetch_loss, fetch_step, _ = sess.run([loss, step, train_op])
                if fetch_step % 20 == 0:
                    print("step: %d, loss: %.4f" % (fetch_step, fetch_loss))
                if fetch_step > args.max_step:
                    break
            except tf.errors.OutOfRangeError:
                break
        saver.save(sess, "./model")
Example #21
def main(_):
    training = tf.Variable(True)

    accuracies = []
    training_steps = []
    optimisers = []
    device_grads = []
    losses = []

    for device_num in range(GPUS):
        with tf.variable_scope('v{}'.format(device_num)):
            with tf.device('/cpu:0'):
                train_path = os.path.join(FLAGS.data_dir, 'train')
                test_path = os.path.join(FLAGS.data_dir, 'test')
                x, y_ = get_iterators(train_path, test_path)

            with tf.device('/gpu:{}'.format(device_num)):
                y = get_model(x, training=training)

                cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                    labels=y_, logits=y)
                losses.append(cross_entropy)

                correct_prediction = tf.equal(
                    tf.cast(tf.argmax(y, 1), dtype=tf.int32), y_)
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))
                accuracies.append(accuracy)

                params = [
                    v for v in tf.get_collection('trainable_variables')
                    if v.name.startswith('v%s/' % device_num)
                ]

                opt = tf.train.GradientDescentOptimizer(0.1)
                optimisers.append(opt)

                grads = opt.compute_gradients(cross_entropy, params)

                device_grads.append(grads)

    new_device_grads = []
    for grad_and_vars in zip(*device_grads):
        scaled_grads = [g for g, _ in grad_and_vars]
        summed_grads = nccl.all_sum(scaled_grads)

        aggregated_device_grads = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            aggregated_device_grads.append([g, v])

        new_device_grads.append(aggregated_device_grads)

    aggregated_device_grads = [list(x) for x in zip(*new_device_grads)]

    training_ops = []
    for d, device in enumerate(['/gpu:{}'.format(x) for x in range(GPUS)]):
        with tf.device(device):
            opt = optimisers[d]
            avg_grads = aggregated_device_grads[d]
            training_ops.append(optimisers[d].apply_gradients(avg_grads))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if FLAGS.xla:
        # Turns on XLA JIT compilation.
        jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level
    run_metadata = tf.RunMetadata()
    sess = tf.Session(config=config)

    sess.run(tf.global_variables_initializer())

    local_var_init_op = tf.local_variables_initializer()
    variable_mgr_init_ops = [local_var_init_op]
    with tf.control_dependencies([local_var_init_op]):
        variable_mgr_init_ops.extend(get_post_init_ops())
    local_var_init_op_group = tf.group(*variable_mgr_init_ops)
    sess.run(local_var_init_op_group)

    # Get handles to enable iterator feeding.
    sess.run([
        tf.get_collection('trn_iterator_inits'),
        tf.get_collection('val_iterator_inits')
    ])
    training_handles = sess.run(tf.get_collection('trn_iterator_handles'))
    test_handles = sess.run(tf.get_collection('test_iterator_handles'))
    feedable_handles = tf.get_collection('feedable_iterator_handles')
    training_feed_dict = dict(zip(feedable_handles, training_handles))
    test_feed_dict = dict(zip(feedable_handles, test_handles))

    # Train
    train_step = tf.group(training_ops)
    loss = tf.reduce_mean(losses)

    loss_window = 200
    loss_agg = np.zeros(loss_window)
    for i in range(FLAGS.train_loops):
        # Create a timeline for the last loop and export to json to view with
        # chrome://tracing/.
        if i == FLAGS.train_loops - 1:
            sess.run(
                [loss, train_step],
                feed_dict=training_feed_dict,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            with open('timeline.ctf.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format())
        else:
            l, _ = sess.run([loss, train_step], feed_dict=training_feed_dict)
            loss_agg[i % loss_window] = l

            print('Step: {}/{} Loss: {}'.format(i, FLAGS.train_loops,
                                                np.mean(loss_agg)),
                  end="\r")
    # Print loss as it's overwritten in log
    print('Loss: {}'.format(np.mean(loss_agg)))
    # Change dataset to test version
    # Assign training = false
    sess.run(
        [tf.get_collection('test_iterator_inits'),
         training.assign(False)])
    #for
    print('Accuracy:', sess.run(accuracy, feed_dict=test_feed_dict))
    sess.close()
Example #22
import tensorflow as tf
from itertools import repeat
from tensorflow.contrib.nccl import all_sum

with tf.device('/gpu:0'):
    g0 = tf.placeholder(tf.float32, (2, 2), f"g0")

with tf.device('/gpu:1'):
    g1 = tf.placeholder(tf.float32, (2, 2), f"g1")

all_reduce_sum = all_sum([g0, g1])

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True,
                                        allow_soft_placement=False))

init = tf.global_variables_initializer()
sess.run(init)

r = [[1, 1], [1, 1]], [[2, 2], [2, 2]]
for x, y in repeat(r):
    sess.run(all_reduce_sum, feed_dict={g0: x, g1: y})
Example #23
import tensorflow as tf
from tensorflow import logging
import tokenization
import util
import os
from tensorflow.python.framework import ops

from tensorflow.contrib import nccl

dim = 10000

with tf.device('/gpu:0'):
    a = tf.get_variable("a", initializer=tf.constant(1.0, shape=(dim, dim)))

with tf.device('/gpu:1'):
    b = tf.get_variable("b", initializer=tf.constant(2.0, shape=(dim, dim)))

with tf.device('/gpu:0'):
    summed_node = nccl.all_sum([a, b])
    for i in summed_node:
        print('before', i, i.device)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True))

init = tf.global_variables_initializer()
sess.run(init)

with tf.device('/gpu:0'):
    summed = sess.run(summed_node)
    #print('summed: ', summed)
def train(config, restore=False):
    sess_config = tf.ConfigProto()
    sess_config.allow_soft_placement = True
    sess_config.gpu_options.allow_growth = True

    with tf.Graph().as_default(), \
         tf.Session(config=sess_config) as sess:
        logger.info("Attempt to load embedding.")
        embedding_init = np.load(config.embed_path).astype(np.float32)
        logger.info("Done.")
        logger.info("Prepare datasets...")

        with open(config.vocab_path, 'r') as fin:
            vocabulary = [line.strip() for line in fin.readlines()]
        vocab_table = tf.contrib.lookup.index_table_from_tensor(
            vocabulary, default_value=137)  # default is unk
        doc_table = tf.contrib.lookup.index_table_from_tensor(
            ['1', '0', '-1', '-2'], default_value=-1)
        train_set = get_csv_dataset([config.train_path],
                                    vocab_table,
                                    doc_table,
                                    config.batch_size,
                                    num_sub_batch=config.num_gpus,
                                    shuffle=True,
                                    bucket_width=100)
        train_eval_set = get_csv_dataset([config.train_eval_path],
                                         vocab_table,
                                         doc_table,
                                         config.eval_batch_size,
                                         config.num_gpus,
                                         shuffle=False)
        valid_eval_set = get_csv_dataset([config.valid_path],
                                         vocab_table,
                                         doc_table,
                                         config.eval_batch_size,
                                         config.num_gpus,
                                         shuffle=False)

        iterator = tf.data.Iterator.from_structure(train_set.output_types,
                                                   train_set.output_shapes)
        train_iter_init = iterator.make_initializer(train_set)
        train_eval_iter_init = iterator.make_initializer(train_eval_set)
        valid_eval_iter_init = iterator.make_initializer(valid_eval_set)
        logger.info("Done.")

        # build model
        logger.info("Build train graph...")
        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        tower_labels_list = []
        tower_oh_preds_list = []

        tower_batches = iterator.get_next()
        for index, tower_batch in enumerate(tower_batches):
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):

                tower_ids, tower_raw_seqs, tower_seqs, tower_lengths, tower_labels = tower_batch
                tower_train_loss, tower_eval_oh_preds, tower_elmo_saver = \
                    build_tower(config, tower_seqs, tower_lengths, tower_labels,
                                initializers={"embedding_init": embedding_init})
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_train_loss, tower_tvars)
                tower_loss_list.append(tower_train_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)

                tower_labels_list.append(tower_labels)
                tower_oh_preds_list.append(tower_eval_oh_preds)
                if index == 0:
                    saver = tf.train.Saver(tower_gvars)
                    elmo_saver = tower_elmo_saver

        with tf.name_scope("tower_gvar_sync"):
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for vars in zip(*tower_gvars_list):
                    for var in vars[1:]:
                        sync_ops.append(tf.assign(var, vars[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                if None in grads_to_avg:
                    avg_tower_grads_list.append(grads_to_avg)
                    continue
                avg_tower_grads_list.append(nccl.all_sum(grads_to_avg))
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.device('/gpu:0'), tf.name_scope('metrics'):
            # metrics
            labels = tf.concat(tower_labels_list, axis=0)
            # [batch_size, num_aspects, num_labels]
            oh_preds = tf.concat(tower_oh_preds_list, axis=0)
            # [batch_size, num_aspects, num_labels]
            oh_labels = tf.one_hot(labels,
                                   depth=4,
                                   on_value=True,
                                   off_value=False,
                                   dtype=tf.bool)
            tps = tf.get_local_variable("tps", shape=[20, 4], dtype=tf.float64)
            fps = tf.get_local_variable("fps", shape=[20, 4], dtype=tf.float64)
            fns = tf.get_local_variable("fns", shape=[20, 4], dtype=tf.float64)

            def cross_and_sum(pred_bool, label_bool):
                cross = tf.logical_and(tf.equal(oh_preds, pred_bool),
                                       tf.equal(oh_labels, label_bool))
                return tf.reduce_sum(tf.cast(cross, tf.float64), axis=0)

            f1_updates = tf.group(
                tf.assign_add(tps,
                              cross_and_sum(pred_bool=True, label_bool=True)),
                tf.assign_add(fps,
                              cross_and_sum(pred_bool=True, label_bool=False)),
                tf.assign_add(fns,
                              cross_and_sum(pred_bool=False, label_bool=True)),
            )
            precisions = tps / (tps + fps + 1e-50)
            recalls = tps / (tps + fns + 1e-50)
            f1s = 2 * precisions * recalls / (precisions + recalls + 1e-50)
            macro_f1 = tf.reduce_mean(f1s)
            metrics_update = tf.group(f1_updates)

            # train loss
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)
        tower_train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                 tf.device('/gpu:%d' % index):
                tower_grads = [
                    grad / len(tower_batches) if grad is not None else None
                    for grad in tower_grads
                ]
                if index == 0:
                    global_step = tf.train.get_or_create_global_step()
                    lr = cyclic_learning_rate(global_step=global_step,
                                              min_lr=0.00005,
                                              max_lr=0.002,
                                              step_size=8205)

                tower_optimizer = tf.contrib.opt.NadamOptimizer(lr)
                tower_grads, _ = tf.clip_by_global_norm(
                    tower_grads, config.grad_clip_max_norm)
                tower_train_op = tower_optimizer.apply_gradients(
                    zip(tower_grads, tower_vars),
                    global_step=global_step if index == 0 else None)
                tower_train_ops.append(tower_train_op)
        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            train_op = tf.group(tower_train_ops)
        logger.info("Done.")

        # start training
        logger.info("Init model...")
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            tf.tables_initializer()
        ])
        logger.info("Done.")

        if elmo_saver is not None:
            logger.info("Restoring elmo...")
            elmo_saver.restore(sess, config.elmo_path)
            logger.info("Done.")

        if restore:
            logger.info("Restore model from {}".format(config.model_path))
            saver.restore(sess, config.model_path)
            logger.info("Done.")
        logger.info("Synchronize towers...")
        sess.run(tower_gvar_sync)
        logger.info("Done.")

        fetch_dict = {
            'loss': loss,
            'train_op': train_op,
            'step': global_step,
            'lr': lr
        }
        loss_tracker = CSVTracker(fields=['epoch', 'step', 'loss', 'lr'],
                                  fmts=['%d', '%d', "%.4f", '%g'],
                                  start_time=config.start_time,
                                  log_dir=config.output_dir,
                                  filename='loss')
        acc_tracker = StatefulTracker(
            cmp_field="valid_f1",
            fields=["epoch", "train_f1", "valid_f1", "diff_f1"],
            log_dir=config.output_dir,
            start_time=config.start_time,
            filename='acc')

        def _train(iter_init, epoch):
            sess.run([iter_init])
            fetch = {"epoch": epoch}
            step = sess.run(global_step)
            while True:
                try:
                    if step % 50 == 0:
                        fetch.update(sess.run(fetch_dict))
                        loss_tracker.track(fetch)
                    else:
                        sess.run(train_op)
                    step += 1
                except tf.errors.OutOfRangeError:
                    break

        def _evaluate(iter_init):
            timer = Timer()
            sess.run([iter_init, tf.local_variables_initializer()])
            while True:
                try:
                    sess.run(metrics_update)
                except tf.errors.OutOfRangeError:
                    break
            logger.info("Time elapsed: %s" % timer.tock())
            fetch_macro_f1 = \
                sess.run(macro_f1)
            return fetch_macro_f1

        logger.info("Start training.")
        for epoch in range(config.max_epoch):
            _train(train_iter_init, epoch)
            logger.info("Evaluate train set...")
            train_f1 = _evaluate(train_eval_iter_init)
            logger.info("Evaluate valid set...")
            valid_f1 = _evaluate(valid_eval_iter_init)
            acc_tracker.track(
                dict(epoch=epoch,
                     train_f1=train_f1,
                     valid_f1=valid_f1,
                     diff_f1=train_f1 - valid_f1))
            if acc_tracker.improved:
                logger.info("Save checkpoint to {}".format(
                    repr(config.model_path)))
                saver.save(sess, config.model_path)
                logger.info("Done.")
            if acc_tracker.staled_tracks > config.early_stop_epoch:
                logger.warning("Stop improve for %d epoch, early stop." %
                               acc_tracker.staled_tracks)
                break
        logger.info("Finish training.")