def testErrors(self):
    """Check the two argument errors that `nccl_ops.all_sum` must raise.

    The first case passes a single identity tensor and expects a
    `ValueError` whose message matches 'Device assignment .* required'.
    The second case passes an empty list and expects a `ValueError`
    matching 'Must pass >0 tensors'.
    """
    device_error = 'Device assignment .* required'
    empty_error = 'Must pass >0 tensors'

    with self.assertRaisesRegex(ValueError, device_error):
        # The tensor is built inside the context so that a ValueError raised
        # while creating it is checked against the pattern too.
        lone_tensor = array_ops.identity(np.random.random_sample((3, 4)))
        nccl_ops.all_sum([lone_tensor])

    with self.assertRaisesRegex(ValueError, empty_error):
        nccl_ops.all_sum([])
def allreduce_tensors(all_tensors, average=True):
    """
    REFERENCE : https://github.com/ppwwyyxx/tensorpack/blob/83e4e187af5765792408e7b7163efd4744d63628/tensorpack/graph_builder/utils.py

    All-reduce the per-device tensors of each variable among K devices,
    optionally turning the sum into an average.

    Args:
        all_tensors (K x N): List of list of tensors, one inner list per
            device. N is the number of (independent) variables.
        average (bool): if True, every reduced tensor is multiplied by
            1 / K (K being the number of devices).

    Returns:
        K x N: same layout as the input, but each tensor is replaced by the
        all-reduce over K devices. With a single device the input object is
        returned unchanged; otherwise the result is a list of K tuples.
    """
    num_devices = len(all_tensors)
    if num_devices == 1:
        # Nothing to reduce across.
        return all_tensors

    per_variable = []  # N x K
    for variable_copies in zip(*all_tensors):
        reduced_copies = []  # K
        for reduced in nccl.all_sum(variable_copies):
            # Build the scaling op on the device that holds the reduced tensor.
            with tf.device(reduced.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    reduced = tf.multiply(
                        reduced, 1.0 / num_devices, name='allreduce_avg')
            reduced_copies.append(reduced)
        per_variable.append(reduced_copies)

    # transpose N x K back to K x N
    return list(zip(*per_variable))
def allreduce_grads(all_grads, average):
    """
    All-reduce the gradients among K devices, optionally averaging them.
    Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients, one inner list per
            device. N is the number of variables.
        average (bool): if True, every summed gradient is multiplied by
            1 / K.

    Returns:
        K x N: same layout as the input, but each grad is replaced by the
        sum (or average) over K devices. With a single device the input
        object is returned unchanged; otherwise the result is a list of
        K tuples.
    """
    # The module providing `all_sum` is chosen by TF version.
    if get_tf_version_tuple() <= (1, 12):
        from tensorflow.contrib import nccl
    else:
        from tensorflow.python.ops import nccl_ops as nccl

    num_devices = len(all_grads)
    if num_devices == 1:
        return all_grads

    per_variable = []  # N x K
    for device_grads in zip(*all_grads):
        reduced_grads = []  # K
        for reduced in nccl.all_sum(device_grads):
            # Build the scaling op on the device that holds the summed grad.
            with tf.device(reduced.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    reduced = tf.multiply(reduced, 1.0 / num_devices)
            reduced_grads.append(reduced)
        per_variable.append(reduced_grads)

    # transpose N x K back to K x N
    return list(zip(*per_variable))
def apply_update(self):
    """Build the grouped op that applies the stored gradients on every device.

    With more than one device, the gradient of each variable index is summed
    across devices with `nccl_ops.all_sum`, the sums are stored in
    `self._dev_grad_sum`, and each device's optimizer applies its summed
    list. With at most one device, the gradients in `self._dev_grad` are
    applied directly. The ops returned by `self.reset_opt_state()` are added
    to the group.

    Returns:
        The `tf.group` of all collected ops, named 'TrainingOp'.
    """
    devices = list(self._dev_opt.keys())
    for device in devices:
        self._dev_grad_sum[device] = []

    collected_ops = []
    if len(devices) <= 1:
        for device, grads_and_vars in self._dev_grad.items():
            with tf.name_scope("Apply_grad"), tf.device(device):
                collected_ops.append(
                    self._dev_opt[device].apply_gradients(grads_and_vars))
    else:
        with tf.name_scope("all_reduce"), tf.device(None):
            # The variable count is taken from the first device's list.
            num_vars = len(self._dev_grad[devices[0]])
            for var_idx in range(num_vars):
                local_grads = [
                    self._dev_grad[device][var_idx][0] for device in devices
                ]
                summed_grads = nccl_ops.all_sum(local_grads)
                for device, summed in zip(devices, summed_grads):
                    variable = self._dev_grad[device][var_idx][1]
                    self._dev_grad_sum[device].append((summed, variable))

        for position, (device, grads_and_vars) in enumerate(
                self._dev_grad_sum.items()):
            with tf.name_scope("Apply_grad%d" % position), tf.device(device):
                collected_ops.append(
                    self._dev_opt[device].apply_gradients(grads_and_vars))

    collected_ops.extend(self.reset_opt_state())
    return tf.group(*collected_ops, name='TrainingOp')
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
    """Build a subgraph that does one full all-reduce, using NCCL.

    Args:
      input_tensors: list of T `tf.Tensor` of same-shape and type values to
        be reduced.
      red_op: binary elementwise reduction operator. Must be one of {tf.add}
      un_op: optional unary elementwise Op to apply to fully-reduce values.

    Returns:
      list of T `tf.Tensor` of reduced values.

    Raises:
      ValueError: red_op not supported.
    """
    is_sum = red_op == math_ops.add
    if not is_sum:
        raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)

    reduced = nccl_ops.all_sum(input_tensors)
    if not un_op:
        return reduced

    # Apply the unary op next to each reduced tensor.
    finished = []
    for tensor in reduced:
        with ops.colocate_with(tensor):
            finished.append(un_op(tensor))
    return finished
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
    """Build a subgraph that does one full all-reduce, using NCCL.

    Args:
      input_tensors: list of T `tf.Tensor` of same-shape and type values to
        be reduced.
      red_op: binary elementwise reduction operator. Must be one of {tf.add}
      un_op: optional unary elementwise Op to apply to fully-reduce values.

    Returns:
      list of T `tf.Tensor` of reduced values.

    Raises:
      ValueError: red_op not supported.
    """
    is_sum = red_op == math_ops.add
    if not is_sum:
        raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)

    reduced = nccl_ops.all_sum(input_tensors)
    if not un_op:
        return reduced

    # Apply the unary op next to each reduced tensor.
    finished = []
    for tensor in reduced:
        with ops.colocate_with(tensor):
            finished.append(un_op(tensor))
    return finished
def allreduce_grads(all_grads, average=True):
    """
    REFERENCE : https://github.com/ppwwyyxx/tensorpack/blob/83e4e187af5765792408e7b7163efd4744d63628/tensorpack/graph_builder/utils.py

    All-reduce the gradients among K devices, optionally averaging them.
    Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients, one inner list per
            device. N is the number of variables.
        average (bool): if True, every summed gradient is multiplied by
            1 / K.

    Returns:
        K x N: same layout as the input, but each grad is replaced by the
        sum (or average) over K devices. With a single device the input
        object is returned unchanged; otherwise the result is a list of
        K tuples.
    """
    from tensorflow.python.ops import nccl_ops

    num_devices = len(all_grads)
    if num_devices == 1:
        return all_grads

    per_variable = []  # N x K
    for device_grads in zip(*all_grads):
        reduced_grads = []  # K
        for reduced in nccl_ops.all_sum(device_grads):
            # Build the scaling op on the device that holds the summed grad.
            with tf.device(reduced.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    reduced = tf.multiply(
                        reduced, 1.0 / num_devices, name='allreduce_avg')
            reduced_grads.append(reduced)
        per_variable.append(reduced_grads)

    # transpose N x K back to K x N
    return list(zip(*per_variable))
def _broadcast_nccl(self):
    """Sum gradients across devices using NCCL ops (fast path).

    Variables are matched across devices by their position in each device's
    `grad_clean` mapping. A group is skipped when none of its variables
    reports more than zero elements; otherwise every device's entry is
    overwritten with the corresponding `all_sum` output.
    """
    from tensorflow.python.ops import nccl_ops  # pylint: disable=no-name-in-module

    devices = list(self._devices.values())
    key_views = [device.grad_clean.keys() for device in devices]
    for var_group in zip(*key_views):
        has_elements = any(
            var.shape.num_elements() > 0 for var in var_group)
        if not has_elements:
            continue
        local_grads = [
            device.grad_clean[var] for device, var in zip(devices, var_group)
        ]
        summed_grads = nccl_ops.all_sum(local_grads)
        for device, var, summed in zip(devices, var_group, summed_grads):
            device.grad_clean[var] = summed
def aggregate_gradients_using_nccl(replica_grads):
    """Aggregate gradients using nccl allreduce.

    Args:
      replica_grads: one list per replica of (gradient, variable) pairs;
        pairs at the same position across replicas are reduced together.

    Returns:
      A list with one tuple per replica, each holding (summed gradient,
      variable) pairs in the input order. Empty when there are no pairs.
    """
    per_variable = []
    for replica_pairs in zip(*replica_grads):
        summed = nccl_ops.all_sum([grad for grad, _ in replica_pairs])
        variables = [var for _, var in replica_pairs]
        # Re-attach each replica's own variable to its summed gradient.
        per_variable.append(list(zip(summed, variables)))

    # Transpose from per-variable back to per-replica layout.
    return list(zip(*per_variable))
def aggregate_gradients_using_nccl(replica_grads):
    """Aggregate gradients using nccl allreduce.

    Args:
      replica_grads: one list per replica of (gradient, variable) pairs;
        pairs at the same position across replicas are reduced together.

    Returns:
      A list with one tuple per replica, each holding (summed gradient,
      variable) pairs in the input order. Empty when there are no pairs.
    """
    per_variable = []
    for replica_pairs in zip(*replica_grads):
        summed = nccl_ops.all_sum([grad for grad, _ in replica_pairs])
        variables = [var for _, var in replica_pairs]
        # Re-attach each replica's own variable to its summed gradient.
        per_variable.append(list(zip(summed, variables)))

    # Transpose from per-variable back to per-replica layout.
    return list(zip(*per_variable))
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, one per device,
        all belonging to the same variable.
      num_workers: forwarded to the 'xring' builder only.
      alg: one of 'nccl', 'simple', 'trivial', 'xring', 'nccl/xring',
        'nccl/rechd', 'nccl/pscpu', 'pscpu/pscpu', 'pscpu', 'psgpu'.
      gpu_indices: forwarded to the 'xring' builder only.
      aux_devices: devices handed to the shuffle-based builders.
      num_shards: forwarded to the 'xring' and 'nccl/xring' builders.

    Returns:
      A list of [reduced gradient, variable] two-element lists, in the order
      of `grad_and_vars`.

    Raises:
      ValueError: `alg` is not one of the supported names.
    """
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = [grad for grad, _ in grad_and_vars]
        if alg == 'nccl':
            from tensorflow.python.ops import nccl_ops
            reduced = nccl_ops.all_sum(grads)
        elif alg == 'simple':
            reduced = build_reduce_sum(grads)
        elif alg == 'trivial':
            reduced = build_trivial_sum(grads)
        elif alg == 'xring':
            reduced = all_reduce.build_ring_all_reduce(
                grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            reduced = all_reduce.build_nccl_then_ring(
                grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            reduced = all_reduce.build_nccl_then_recursive_hd(grads, tf.add)
        elif alg == 'nccl/pscpu':
            reduced = all_reduce.build_nccl_then_shuffle(
                grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            # TODO(tucker): devise a way of better specifying the device
            # for the second level.
            second_level_devices = [aux_devices[0]]
            reduced = all_reduce.build_shuffle_then_shuffle(
                grads, aux_devices, second_level_devices, tf.add_n)
        elif alg in ('pscpu', 'psgpu'):
            reduced = all_reduce.build_shuffle_all_reduce(
                grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    # Pair every reduced gradient with the variable from the same position.
    return [[grad, var] for (_, var), grad in zip(grad_and_vars, reduced)]
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, one per device,
        all belonging to the same variable.
      num_workers: forwarded to the 'xring' builder only.
      alg: one of 'nccl', 'simple', 'trivial', 'xring', 'nccl/xring',
        'nccl/rechd', 'nccl/pscpu', 'pscpu/pscpu', 'pscpu', 'psgpu'.
      gpu_indices: forwarded to the 'xring' builder only.
      aux_devices: devices handed to the shuffle-based builders.
      num_shards: forwarded to the 'xring' and 'nccl/xring' builders.

    Returns:
      A list of [reduced gradient, variable] two-element lists, in the order
      of `grad_and_vars`.

    Raises:
      ValueError: `alg` is not one of the supported names.
    """
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = [grad for grad, _ in grad_and_vars]
        if alg == 'nccl':
            from tensorflow.python.ops import nccl_ops
            reduced = nccl_ops.all_sum(grads)
        elif alg == 'simple':
            reduced = build_reduce_sum(grads)
        elif alg == 'trivial':
            reduced = build_trivial_sum(grads)
        elif alg == 'xring':
            reduced = all_reduce.build_ring_all_reduce(
                grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            reduced = all_reduce.build_nccl_then_ring(
                grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            reduced = all_reduce.build_nccl_then_recursive_hd(grads, tf.add)
        elif alg == 'nccl/pscpu':
            reduced = all_reduce.build_nccl_then_shuffle(
                grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            # TODO(tucker): devise a way of better specifying the device
            # for the second level.
            second_level_devices = [aux_devices[0]]
            reduced = all_reduce.build_shuffle_then_shuffle(
                grads, aux_devices, second_level_devices, tf.add_n)
        elif alg in ('pscpu', 'psgpu'):
            reduced = all_reduce.build_shuffle_all_reduce(
                grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    # Pair every reduced gradient with the variable from the same position.
    return [[grad, var] for (_, var), grad in zip(grad_and_vars, reduced)]
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, one per device,
        all belonging to the same variable.
      num_workers: forwarded to the 'xring' builder only.
      alg: one of 'nccl', 'xring', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu',
        'pscpu/pscpu', 'pscpu', 'psgpu'.
      gpu_indices: forwarded to the 'xring' builder only.
      aux_devices: devices handed to the shuffle-based builders.
      num_shards: forwarded to the ring builders; for 'pscpu/pscpu' it also
        selects how many leading `aux_devices` form the second gather level.

    Returns:
      A list of [reduced gradient, variable] two-element lists, in the order
      of `grad_and_vars`.

    Raises:
      ValueError: `alg` is not one of the supported names.
    """
    with ops.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = [grad for grad, _ in grad_and_vars]
        if alg == 'nccl':
            reduced = nccl_ops.all_sum(grads)
        elif alg == 'xring':
            reduced = all_reduce.build_ring_all_reduce(
                grads, num_workers, num_shards, gpu_indices, math_ops.add)
        elif alg == 'nccl/xring':
            reduced = all_reduce.build_nccl_then_ring(
                grads, num_shards, math_ops.add)
        elif alg == 'nccl/rechd':
            reduced = all_reduce.build_nccl_then_recursive_hd(
                grads, math_ops.add)
        elif alg == 'nccl/pscpu':
            reduced = all_reduce.build_nccl_then_shuffle(
                grads, aux_devices, math_ops.add, math_ops.add_n)
        elif alg == 'pscpu/pscpu':
            second_level_devices = aux_devices[:num_shards]
            reduced = all_reduce.build_shuffle_then_shuffle(
                grads, aux_devices, second_level_devices, math_ops.add_n)
        elif alg in ('pscpu', 'psgpu'):
            reduced = all_reduce.build_shuffle_all_reduce(
                grads, aux_devices, math_ops.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    # Pair every reduced gradient with the variable from the same position.
    return [[grad, var] for (_, var), grad in zip(grad_and_vars, reduced)]
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, one per device,
        all belonging to the same variable.
      num_workers: forwarded to the 'xring' builder only.
      alg: one of 'nccl', 'xring', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu',
        'pscpu/pscpu', 'pscpu', 'psgpu'.
      gpu_indices: forwarded to the 'xring' builder only.
      aux_devices: devices handed to the shuffle-based builders.
      num_shards: forwarded to the ring builders; for 'pscpu/pscpu' it also
        selects how many leading `aux_devices` form the second gather level.

    Returns:
      A list of [reduced gradient, variable] two-element lists, in the order
      of `grad_and_vars`.

    Raises:
      ValueError: `alg` is not one of the supported names.
    """
    with ops.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = [grad for grad, _ in grad_and_vars]
        if alg == 'nccl':
            reduced = nccl_ops.all_sum(grads)
        elif alg == 'xring':
            reduced = all_reduce.build_ring_all_reduce(
                grads, num_workers, num_shards, gpu_indices, math_ops.add)
        elif alg == 'nccl/xring':
            reduced = all_reduce.build_nccl_then_ring(
                grads, num_shards, math_ops.add)
        elif alg == 'nccl/rechd':
            reduced = all_reduce.build_nccl_then_recursive_hd(
                grads, math_ops.add)
        elif alg == 'nccl/pscpu':
            reduced = all_reduce.build_nccl_then_shuffle(
                grads, aux_devices, math_ops.add, math_ops.add_n)
        elif alg == 'pscpu/pscpu':
            second_level_devices = aux_devices[:num_shards]
            reduced = all_reduce.build_shuffle_then_shuffle(
                grads, aux_devices, second_level_devices, math_ops.add_n)
        elif alg in ('pscpu', 'psgpu'):
            reduced = all_reduce.build_shuffle_all_reduce(
                grads, aux_devices, math_ops.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    # Pair every reduced gradient with the variable from the same position.
    return [[grad, var] for (_, var), grad in zip(grad_and_vars, reduced)]
def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0'):
    """Average the gradients of every variable across all towers.

    Args:
        tower_gradvars: one list per tower of (gradient, variable) pairs.
        devices: device names, one per tower.
        param_server_device: device on which the averages are computed when
            the NCCL path is not taken.

    Returns:
        Per-tower (averaged gradient, variable) pairs. With a single device
        the input object is returned unchanged. The NCCL path returns a list
        of lists; the fallback path returns a list of tuples.
    """
    if len(devices) == 1:
        return tower_gradvars

    if not (have_nccl and FLAGS.nccl):
        # Fallback: stack each layer's gradients on one device and take the
        # mean, then hand the same tensor to every tower.
        tower_count = len(tower_gradvars)
        averaged_layers = []
        for layer in zip(*tower_gradvars):
            layer_grads, layer_vars = zip(*layer)
            with tf.device(param_server_device):
                mean_grad = tf.reduce_mean(tf.stack(layer_grads), 0)
            averaged_layers.append(zip([mean_grad] * tower_count, layer_vars))
        return list(zip(*averaged_layers))

    # NCCL path: fuse each tower's gradients into one flat tensor so that a
    # single all_sum covers all of them.
    fused_per_device = []
    for device, gradvars in zip(devices, tower_gradvars):
        with tf.device(device):
            flattened = [tf.reshape(grad, [-1]) for grad, _ in gradvars]
            fused_per_device.append(tf.concat(flattened, 0))
    fused_sums = nccl_ops.all_sum(fused_per_device)

    scale = 1. / len(devices)
    averaged_towers = []
    for device, fused_sum, gradvars in zip(devices, fused_sums,
                                           tower_gradvars):
        with tf.device(device):
            # Split the fused sum back into per-variable pieces and restore
            # each piece to the shape of the gradient it came from.
            sizes = [tf.size(grad) for grad, _ in gradvars]
            pieces = tf.split(fused_sum, sizes)
            device_gradvars = []
            for piece, (grad, var) in zip(pieces, gradvars):
                piece = tf.reshape(piece, tf.shape(grad))
                piece *= scale
                device_gradvars.append((piece, var))
        averaged_towers.append(device_gradvars)
    return averaged_towers
def apply_updates(self):
    """Construct the training op that applies all registered gradients.

    May be called only once per instance (guarded by `_updates_applied`) and
    requires at least one device and one registered gradient list.

    Steps, all built under `absolute_name_scope(self.scope)`:
      1. On each device, cast gradients to float32 and sum the gradient
         lists registered for the same variable.
      2. With more than one device, sum each variable's gradient across
         devices with `all_sum` (skipped when the recorded gradient shape
         has zero elements).
      3. On each device, scale the gradients, check them for non-finite
         values, and apply them through that device's optimizer only when
         all are finite; with loss scaling enabled, the loss-scaling
         variable is increased on success and decreased on overflow.
      4. On the last device, add the autosummary ops.

    Returns:
        The `tf.group` of all per-device ops, named 'TrainingOp'.
    """
    assert not self._updates_applied
    self._updates_applied = True
    devices = list(self._dev_grads.keys())
    # Total number of gradient lists registered over all devices; used below
    # as the divisor for gradient scaling.
    total_grads = sum(len(grads) for grads in self._dev_grads.values())
    assert len(devices) >= 1 and total_grads >= 1
    ops = []
    with absolute_name_scope(self.scope):

        # Cast gradients to FP32 and calculate partial sum within each device.
        dev_grads = OrderedDict()  # device => [(grad, var), ...]
        for dev_idx, dev in enumerate(devices):
            with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                sums = []
                # Each `gv` groups the (grad, var) pairs found at the same
                # position in every gradient list registered on this device.
                for gv in zip(*self._dev_grads[dev]):
                    # All pairs in a group must refer to the same variable.
                    assert all(v is gv[0][1] for g, v in gv)
                    g = [tf.cast(g, tf.float32) for g, v in gv]
                    g = g[0] if len(g) == 1 else tf.add_n(g)
                    sums.append((g, gv[0][1]))
                dev_grads[dev] = sums

        # Sum gradients across devices.
        if len(devices) > 1:
            with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                for var_idx, grad_shape in enumerate(self._grad_shapes):
                    g = [dev_grads[dev][var_idx][0] for dev in devices]
                    if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                        g = all_sum(g)
                    for dev, gg in zip(devices, g):
                        dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

        # Apply updates separately on each device.
        for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
            with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                # Scale gradients as needed.
                if self.use_loss_scaling or total_grads > 1:
                    with tf.name_scope('Scale'):
                        coef = tf.constant(np.float32(1.0 / total_grads), name='coef')
                        coef = self.undo_loss_scaling(coef)
                        grads = [(g * coef, v) for g, v in grads]

                # Check for overflows.
                with tf.name_scope('CheckOverflow'):
                    grad_ok = tf.reduce_all(
                        tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                # Update weights and adjust loss scaling.
                with tf.name_scope('UpdateWeights'):
                    opt = self._dev_opt[dev]
                    ls_var = self.get_loss_scaling_var(dev)
                    # The lambdas close over loop variables (`opt`, `grads`,
                    # `ls_var`); they are handed straight to tf.cond within
                    # the same iteration.
                    if not self.use_loss_scaling:
                        ops.append(
                            tf.cond(grad_ok, lambda: opt.apply_gradients(grads), tf.no_op))
                    else:
                        ops.append(
                            tf.cond(
                                grad_ok,
                                lambda: tf.group(
                                    tf.assign_add(ls_var, self.loss_scaling_inc),
                                    opt.apply_gradients(grads)),
                                lambda: tf.group(
                                    tf.assign_sub(ls_var, self.loss_scaling_dec))))

                # Report statistics on the last device.
                if dev == devices[-1]:
                    with tf.name_scope('Statistics'):
                        ops.append(
                            autosummary(self.id + '/learning_rate', self.learning_rate))
                        ops.append(
                            autosummary(self.id + '/overflow_frequency',
                                        tf.where(grad_ok, 0, 1)))
                        if self.use_loss_scaling:
                            ops.append(
                                autosummary(self.id + '/loss_scaling_log2', ls_var))

        # Initialize variables and group everything into a single op.
        self.reset_optimizer_state()
        init_uninited_vars(list(self._dev_ls_var.values()))
        return tf.group(*ops, name='TrainingOp')
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model.

    Builds one tower per GPU, collects per-tower gradients, reduces the
    "dense" gradients across GPUs with `nccl_ops.all_sum` (averaging by
    `num_gpus`), and applies them with one LazyAdam optimizer per GPU.

    Args:
        features: model inputs, forwarded to `create_tower_network`.
        labels: targets, forwarded to `create_tower_network` and to the
            evaluation metrics.
        mode: a `tf.estimator.ModeKeys` value selecting which
            `EstimatorSpec` is returned.
        params: dict of hyper-parameters; the keys read here are
            "learning_rate", "hidden_size", "learning_rate_warmup_steps",
            "optimizer_adam_beta1", "optimizer_adam_beta2",
            "optimizer_adam_epsilon" and "dtype".

    Returns:
        A `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT mode; no
        explicit return for any other mode.

    NOTE(review): this function reads several names that are not bound
    inside it (`flags_obj`, `device_index`, `num_devices`, `is_chief`).
    They must come from an enclosing/module scope not visible here —
    confirm, otherwise the "#SPARSE" section at the end raises NameError.
    """
    # NOTE(review): `flags_obj` is not a parameter; presumably a module-level
    # or enclosing-scope object — confirm.
    num_gpus = flags_core.get_num_gpus(flags_obj)
    print("num_gpus: ", num_gpus)
    # num_gpus=params["num_gpus"]
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
    # One optimizer instance per GPU tower.
    optimizers = [
        tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"]) for _ in range(num_gpus)
    ]
    if params["dtype"] == "fp16":
        optimizers = [
            tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer) for optimizer in optimizers
        ]
    # feature_shards, label_shards = replicate_model_fn._split_batch(features, labels, num_gpus, device=consolidation_device)
    # feature_shards, label_shards = split_batch(features, labels, num_gpus)
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    grad_list = []
    losses = []
    logits = []
    # Every tower is built from the same `features`/`labels` (the batch
    # splitting above is commented out).
    for gpu_idx in range(num_gpus):
        device_setter = local_device_setter(
            ps_device_type='cpu', worker_device='/gpu:{}'.format(gpu_idx))
        with tf.device(device_setter):
            # with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.variable_scope('tower%d'%gpu_idx):
            #with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster_spec)):
            logit, loss = create_tower_network(model, params, features, labels)
            # feature_shard, label_shard = next(iterator)
            # logit, loss = create_tower_network(model, params, features, labels)
            logits.append(logit)
            losses.append(loss)
            # Keep only the pairs whose first element is not None.
            grad_list.append([
                x for x in optimizers[gpu_idx].compute_gradients(loss)
                if x[0] is not None
            ])
    # output_train = tf.concat(logits, axis=0)
    output_train = tf.reduce_mean(logits, axis=0)
    loss_train = tf.reduce_mean(losses, name='loss')
    # grads = []
    # all_vars= []
    sparse_grads = []
    sparse_vars = []
    dense_grads = []
    dense_vars = []
    # Split each tower's pairs into a "sparse" and a "dense" group.
    for tower in grad_list:
        sp_grad = []
        sp_var = []
        dn_grad = []
        dn_var = []
        for x in tower:
            # NOTE(review): the type test is on x[1], while the None filter
            # above treats x[0] as the gradient. If compute_gradients yields
            # (gradient, variable) pairs, this checks the variable rather
            # than the gradient — confirm which element was intended.
            if isinstance(x[1], ops.IndexedSlices):
                sp_grad.append(x[0])
                sp_var.append(x[1])
            else:
                dn_grad.append(x[0])
                dn_var.append(x[1])
        if (len(sp_var) > 0):
            sparse_grads.append(sp_grad)
            sparse_vars.append(sp_var)
        if (len(dn_var) > 0):
            dense_grads.append(dn_grad)
            dense_vars.append(dn_var)
    #SPARSE
    # for var, grad in zip(sparse_vars, sparse_grads):
    #     if len(grad) == 1:
    #         avg_grad = grad
    #     else:
    #         avg_grad = tf.multiply(tf.add_n(grad), 1. /len(grad))
    #     gradvars.append((avg_grad, var))
    if len(sparse_vars) > 0:
        if num_gpus == 1:
            reduced_grad = sparse_grads
        else:
            new_all_grads = []
            for grad in sparse_grads:
                new_grads = []
                for tower_grad in grad:
                    new_grads.append(tower_grad)
                summed = tf.add_n(new_grads)
                grads_for_devices = []
                # NOTE(review): `summed` is the single result of tf.add_n,
                # yet it is iterated like a per-device list — confirm intent.
                for g in summed:
                    with tf.device(g.device):
                        g = tf.multiply(g,
                                        1.0 / num_gpus,
                                        name='allreduce_avg')
                    grads_for_devices.append(g)
                new_all_grads.append(grads_for_devices)
            reduced_grad = list(zip(*new_all_grads))
        gradvars = [
            list(zip(gs, vs)) for gs, vs in zip(reduced_grad, sparse_vars)
        ]
    #DENSE
    reduced_grad = []
    from tensorflow.python.ops import nccl_ops
    if num_gpus == 1:
        reduced_grad = dense_grads
    else:
        new_all_grads = []
        # NOTE(review): `dense_grads` holds one list per tower, so each
        # `grad` handed to all_sum is one tower's list — verify this is the
        # intended grouping for the all-reduce.
        for grad in dense_grads:
            summed = nccl_ops.all_sum(grad)
            grads_for_devices = []
            for g in summed:
                with tf.device(g.device):
                    g = tf.multiply(g, 1.0 / num_gpus, name='allreduce_avg')
                grads_for_devices.append(g)
            new_all_grads.append(grads_for_devices)
        reduced_grad = list(zip(*new_all_grads))
    grads = [list(zip(gs, vs)) for gs, vs in zip(reduced_grad, dense_vars)]
    #apply gradients to each GPU by broadcasting summed gradient
    train_ops = []
    for idx, grad_and_vars in enumerate(grads):
        with tf.name_scope('apply_gradients'), tf.device(
                tf.DeviceSpec(device_type="GPU", device_index=idx)):
            global_step = tf.train.get_global_step()
            # Rebound on every iteration; only the op from the last iteration
            # reaches tf.group below. If `grads` is empty, `update_ops` is
            # never bound.
            update_ops = tf.assign(global_step,
                                   global_step + 1,
                                   name='update_global_step')
            #update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='tower%d'%idx)
            #with tf.control_dependencies(update_ops):
            train_ops.append(optimizers[idx].apply_gradients(
                grad_and_vars, name='apply_grad_{}'.format(idx)))
    #SPARSE
    # NOTE(review): `device_index`, `num_devices` and `is_chief` are not
    # defined anywhere in this function — confirm where they come from.
    if device_index == 0 and len(sparse_vars) > 0:
        learning_rate = get_learning_rate(
            learning_rate=params["learning_rate"],
            hidden_size=params["hidden_size"],
            learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])
        optimizer = tf.train.SyncReplicasOptimizer(
            optimizer, replicas_to_aggregate=num_devices)
        # `sync_hook` is created but not used in the visible code.
        sync_hook = optimizer.make_session_run_hook(is_chief)
        minimize_op = optimizer.apply_gradients(
            gradvars, global_step=tf.train.get_global_step())
        train_ops.append(minimize_op)
    optimize_op = tf.group(update_ops, *train_ops, name='train_op')
    # `train_metrics` is assigned but not used in the visible code.
    train_metrics = {"learning_rate": learning_rate}
    # Adds a tensor named "cross_entropy" that mirrors the averaged loss.
    tf.identity(loss_train, "cross_entropy")
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss_train,
                                          train_op=optimize_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss_train,
            predictions={"predictions": output_train},
            eval_metric_ops=metrics.get_eval_metrics(
                output_train, labels, params))
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=output_train,
            export_outputs={
                "translate": tf.estimator.export.PredictOutput(output_train)
            })
def testErrors(self):
    """Check the two argument errors that `nccl_ops.all_sum` must raise.

    The first case passes a single identity tensor and expects a
    `ValueError` matching 'Device assignment required'. The second case
    passes an empty list and expects a `ValueError` matching
    'Must pass >0 tensors'.
    """
    # `assertRaisesRegexp` is a deprecated alias that newer Python versions
    # no longer provide; `assertRaisesRegex` is the supported spelling.
    with self.assertRaisesRegex(ValueError, 'Device assignment required'):
        nccl_ops.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
    with self.assertRaisesRegex(ValueError, 'Must pass >0 tensors'):
        nccl_ops.all_sum([])
def main():
    """Train one model replica per GPU with NCCL all-reduce, then save it.

    Command-line flags:
        --gpus: comma-separated GPU ids, exported as CUDA_VISIBLE_DEVICES
            (default '0,1'); the number of ids is the number of towers
            requested from `build_dataset`.
        --max_step: training stops once the fetched global step exceeds
            this value (default 10000).

    Each tower gets its own variable scope ("tower_<i>") and its own copy of
    the variables. Before training, every tower's variables are overwritten
    with those of tower 0; during training, gradients are summed across
    towers with `all_sum` and divided by the number of towers. The saver is
    built from tower 0's global variables with the tower prefix and the
    ":<n>" suffix stripped from their names, and writes to "./model".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpus', default='0,1', type=str)
    parser.add_argument('--max_step', default=10000, type=int)
    args = parser.parse_args()
    args.num_gpus = len(args.gpus.split(","))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # avoid unimplemented gpu kernel error
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        dataset = build_dataset(args.num_gpus)
        iterator = dataset.make_initializable_iterator()
        tower_batches = iterator.get_next()

        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        for index, tower_batch in enumerate(tower_batches):
            # by-device variable scope
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):
                tower_loss = build_tower(tower_batch)
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_loss, tower_tvars)

                tower_loss_list.append(tower_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)

                if index == 0:
                    # only one variable global saver
                    def clean(name):
                        # Raw strings: `\d` is not a valid escape sequence in
                        # a normal string literal.
                        name = re.sub(r'^tower_\d+/', '', name)
                        name = re.sub(r':\d+$', '', name)
                        return name

                    save_dict = {clean(var.name): var for var in tower_gvars}
                    saver = tf.train.Saver(save_dict)

        with tf.name_scope("tower_gvar_sync"):
            # different device is init with different random seed
            # need explicit synchronization before training!!!
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for replicas in zip(*tower_gvars_list):
                    # Copy tower 0's value into every other tower's variable.
                    for var in replicas[1:]:
                        sync_ops.append(tf.assign(var, replicas[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                # nccl.all_sum will automatically
                # convert sparse gradients into dense one
                avg_tower_grads_list.append(all_sum(grads_to_avg))
            # Transpose from per-variable back to per-tower layout.
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.name_scope('metrics'):
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)

        train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                    tf.device('/gpu:%d' % index):
                # all_sum produced sums; divide to obtain the average.
                tower_grads = [grad / len(tower_batches) for grad in tower_grads]
                if index == 0:
                    # only increment global step with the first worker
                    step = tf.train.get_or_create_global_step()
                tower_optimizer = tf.train.AdamOptimizer()
                tower_train_op = tower_optimizer.apply_gradients(
                    zip(tower_grads, tower_vars),
                    global_step=step if index == 0 else None)
                train_ops.append(tower_train_op)
        train_op = tf.group(train_ops)

        # start running
        sess.run(tf.global_variables_initializer())
        sess.run(iterator.initializer)
        # important to sync variables before training!
        sess.run(tower_gvar_sync)
        while True:
            try:
                fetch_loss, fetch_step, _ = sess.run([loss, step, train_op])
                if fetch_step % 20 == 0:
                    print("step: %d, loss: %.4f" % (fetch_step, fetch_loss))
                if fetch_step > args.max_step:
                    break
            except tf.errors.OutOfRangeError:
                # The dataset is exhausted.
                break
        saver.save(sess, "./model")
def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
    """Construct training op to update the registered variables based on their gradients.

    May be called only once per instance (guarded by `_updates_applied`).

    Steps:
      1. Clean: on each device, drop `None` gradients, cast to float32, sum
         the gradients registered for the same variable and scale them.
      2. Broadcast: with more than one device, sum each variable's gradient
         across devices with `nccl_ops.all_sum`.
      3. Apply: on each device, optionally accumulate gradients over
         `minibatch_multiplier` calls, apply them through the device's
         optimizer only when accumulation is complete and all values are
         finite, adjust loss scaling, and (last device only) add the
         autosummary ops.
      4. Initialize optimizer state, loss-scaling variables and accumulation
         variables.

    Args:
        allow_no_op: when True and no device has been registered, return a
            `tf.no_op` named 'TrainingOp' instead of building the graph.

    Returns:
        The `tf.group` of all collected ops, named "TrainingOp".
    """
    tfutil.assert_tf_initialized()
    assert not self._updates_applied
    self._updates_applied = True
    all_ops = []

    # Check for no-op.
    if allow_no_op and len(self._devices) == 0:
        with tfutil.absolute_name_scope(self.scope):
            return tf.no_op(name='TrainingOp')

    # Clean up gradients.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), tf.device(device.name):
            for var, grad in device.grad_raw.items():

                # Filter out disconnected gradients and convert to float32.
                grad = [g for g in grad if g is not None]
                grad = [tf.cast(g, tf.float32) for g in grad]

                # Sum within the device.
                if len(grad) == 0:
                    grad = tf.zeros(var.shape)  # No gradients => zero.
                elif len(grad) == 1:
                    grad = grad[0]  # Single gradient => use as is.
                else:
                    grad = tf.add_n(grad)  # Multiple gradients => sum.

                # Scale as needed.
                # The divisor counts every registered entry for this variable
                # (including the None ones filtered out above) times the
                # number of devices.
                scale = 1.0 / len(device.grad_raw[var]) / len(self._devices)
                scale = tf.constant(scale, dtype=tf.float32, name="scale")
                if self.minibatch_multiplier is not None:
                    scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                scale = self.undo_loss_scaling(scale)
                device.grad_clean[var] = grad * scale

    # Sum gradients across devices.
    if len(self._devices) > 1:
        with tfutil.absolute_name_scope(self.scope + "/Broadcast"), tf.device(None):
            # Variables are matched across devices by their position in each
            # device's `grad_clean` mapping.
            for all_vars in zip(*[device.grad_clean.keys() for device in self._devices.values()]):
                # Only the first device's variable shape is inspected here.
                if len(all_vars) > 0 and all(dim > 0 for dim in all_vars[0].shape.as_list()):  # NCCL does not support zero-sized tensors.
                    all_grads = [device.grad_clean[var] for device, var in zip(self._devices.values(), all_vars)]
                    all_grads = nccl_ops.all_sum(all_grads)
                    for device, var, grad in zip(self._devices.values(), all_vars, all_grads):
                        device.grad_clean[var] = grad

    # Apply updates separately on each device.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), tf.device(device.name):
            # pylint: disable=cell-var-from-loop

            # Accumulate gradients over time.
            if self.minibatch_multiplier is None:
                # No accumulation: every call is a complete step.
                acc_ok = tf.constant(True, name='acc_ok')
                device.grad_acc = OrderedDict(device.grad_clean)
            else:
                # Create variables.
                with tf.control_dependencies(None):
                    for var in device.grad_clean.keys():
                        device.grad_acc_vars[var] = tf.Variable(tf.zeros(var.shape), trainable=False, name="grad_acc_var")
                    device.grad_acc_count = tf.Variable(tf.zeros([]), trainable=False, name="grad_acc_count")

                # Track counter.
                count_cur = device.grad_acc_count + 1.0
                count_inc_op = lambda: tf.assign(device.grad_acc_count, count_cur)
                count_reset_op = lambda: tf.assign(device.grad_acc_count, tf.zeros([]))
                # True once `minibatch_multiplier` calls have been counted.
                acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier, tf.float32))
                all_ops.append(tf.cond(acc_ok, count_reset_op, count_inc_op))

                # Track gradients.
                for var, grad in device.grad_clean.items():
                    acc_var = device.grad_acc_vars[var]
                    acc_cur = acc_var + grad
                    device.grad_acc[var] = acc_cur
                    # The accumulated value is computed before the variable
                    # is reset or overwritten.
                    with tf.control_dependencies([acc_cur]):
                        acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                        acc_reset_op = lambda: tf.assign(acc_var, tf.zeros(var.shape))
                        all_ops.append(tf.cond(acc_ok, acc_reset_op, acc_inc_op))

            # No overflow => apply gradients.
            all_ok = tf.reduce_all(tf.stack([acc_ok] + [tf.reduce_all(tf.is_finite(g)) for g in device.grad_acc.values()]))
            # Gradients are cast back to each variable's dtype before use.
            apply_op = lambda: device.optimizer.apply_gradients([(tf.cast(grad, var.dtype), var) for var, grad in device.grad_acc.items()])
            all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

            # Adjust loss scaling.
            if self.use_loss_scaling:
                ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var, self.loss_scaling_inc)
                ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var, self.loss_scaling_dec)
                ls_update_op = lambda: tf.group(tf.cond(all_ok, ls_inc_op, ls_dec_op))
                # Loss scaling changes only on calls that complete an
                # accumulation cycle.
                all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

            # Last device => report statistics.
            if device_idx == len(self._devices) - 1:
                all_ops.append(autosummary.autosummary(self.id + "/learning_rate", self.learning_rate))
                all_ops.append(autosummary.autosummary(self.id + "/overflow_frequency", tf.where(all_ok, 0, 1), condition=acc_ok))
                if self.use_loss_scaling:
                    all_ops.append(autosummary.autosummary(self.id + "/loss_scaling_log2", device.loss_scaling_var))

    # Initialize variables.
    self.reset_optimizer_state()
    if self.use_loss_scaling:
        tfutil.init_uninitialized_vars([device.loss_scaling_var for device in self._devices.values()])
    if self.minibatch_multiplier is not None:
        tfutil.run([var.initializer for device in self._devices.values() for var in list(device.grad_acc_vars.values()) + [device.grad_acc_count]])

    # Group everything into a single op.
    with tfutil.absolute_name_scope(self.scope):
        return tf.group(*all_ops, name="TrainingOp")
def all_sum_gpu(g, *args, **kws):
    """Forward `g` and any extra arguments to `nccl_ops.all_sum`.

    Returns whatever `nccl_ops.all_sum` returns.
    """
    summed = nccl_ops.all_sum(g, *args, **kws)
    return summed