def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
  """Build a subgraph that does one full all-reduce, using NCCL.

  Args:
    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator. Must be one of {tf.add}
    un_op: optional unary elementwise Op to apply to fully-reduced values.

  Returns:
    list of T @{tf.Tensor} of reduced values.

  Raises:
    ValueError: red_op not supported.
  """
  if red_op == math_ops.add:
    output_tensors = nccl.all_sum(input_tensors)
  else:
    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
  if un_op:
    un_op_wrapped = []
    for t in output_tensors:
      with ops.colocate_with(t):
        un_op_wrapped.append(un_op(t))
    output_tensors = un_op_wrapped
  return output_tensors
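For reference, a minimal usage sketch (not from the original source): it assumes TF 1.x with at least two visible GPUs and that build_nccl_all_reduce above is defined in the current module; the un_op shown is a hypothetical halving step that turns the per-device sums into means.

import tensorflow as tf
from tensorflow.python.ops import math_ops

# Build one same-shaped tensor per GPU.
per_gpu = []
for i in range(2):
    with tf.device('/gpu:%d' % i):
        per_gpu.append(tf.random_normal([4, 4]))

# Sum across GPUs with NCCL, then halve each replica's copy to get the mean.
reduced = build_nccl_all_reduce(per_gpu, math_ops.add, un_op=lambda t: t * 0.5)

with tf.Session() as sess:
    outputs = sess.run(reduced)  # one reduced tensor per input device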
def allreduce_grads(all_grads, average):
    """
    All-reduce average the gradients among devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N x 2): A list of K lists. Each of the K lists is a list of N
            (grad, var) tuples. The variables have to be the same across the K lists.
        average (bool): average gradients or not.

    Returns:
        (K x N x 2): same as input, but each grad is replaced by the average over K lists.
    """
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # NVar * NGPU * 2
    with tf.name_scope('AvgGrad'):
        for grad_and_vars in zip(*all_grads):
            v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                with tf.device(g.device):
                    # tensorflow/benchmarks didn't average gradients
                    if average:
                        g = tf.multiply(g, 1.0 / nr_tower)
                    grads_for_a_var.append((g, v))
            new_all_grads.append(grads_for_a_var)

    # transpose
    ret = [k for k in zip(*new_all_grads)]
    return ret
def allreduce_grads(all_grads, average):
    """
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the average over K devices.
    """
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower)
                grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
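As a rough illustration of the K x N layout the docstring describes (assuming TF 1.x and two GPUs; the stand-in gradients below are hypothetical and would normally come from tf.gradients on each tower):

import tensorflow as tf

all_grads = []  # K = 2 towers, each a list of N per-variable gradients
for k in range(2):
    with tf.device('/gpu:%d' % k):
        all_grads.append([tf.random_normal([3, 3]), tf.random_normal([5])])

averaged = allreduce_grads(all_grads, average=True)  # still K x N, one averaged copy per GPU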
def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0',
                      usenccl=True):
    if len(devices) == 1:
        return tower_gradvars

    num_devices = len(devices)
    avg_gradvars = []
    for layer in zip(*tower_gradvars):
        grads_on_devices, vars_on_devices = zip(*layer)
        if have_nccl and usenccl:
            # Note: These nccl ops _must_ be run on all devices, else deadlock
            # print('ALL_AVG_GRADIENTS GRADS_ON_DEVICES:',
            #       grads_on_devices)  # DEBUG
            avg_grads_on_devices = nccl.all_sum(grads_on_devices)
            for d, device in enumerate(devices):
                with tf.device(device):
                    avg_grads_on_devices[d] *= 1. / num_devices
        else:
            with tf.device(param_server_device):
                avg_grad = tf.reduce_mean(tf.stack(grads_on_devices), 0)
            avg_grads_on_devices = [avg_grad] * num_devices

        avg_gradvars_on_devices = zip(*(avg_grads_on_devices, vars_on_devices))
        avg_gradvars.append(avg_gradvars_on_devices)

    return list(zip(*avg_gradvars))
def sum_grad_and_var_all_reduce(grad_and_vars, devices):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    scaled_grads = [g for _, (g, _) in zip(devices, grad_and_vars)]
    summed_grads = nccl.all_sum(scaled_grads)

    result = []
    for d, (_, v), g in zip(devices, grad_and_vars, summed_grads):
        with tf.device(d):
            result.append((g, v))
    return result
def aggregate_gradients_using_nccl(tower_grads):
  """Aggregate gradients using nccl allreduce."""
  agg_all_g_and_v = []
  for single_g_and_v in zip(*tower_grads):
    single_grads = [g for g, _ in single_g_and_v]
    agg_grads = nccl.all_sum(single_grads)
    agg_all_g_and_v.append(
        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])

  agg_all_g_and_v = list(zip(*agg_all_g_and_v))

  return agg_all_g_and_v
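A small, assumed usage of aggregate_gradients_using_nccl above (TF 1.x, two towers with replicated variables; the gradients below are placeholders for what an optimizer's compute_gradients would produce):

import tensorflow as tf

tower_grads = []  # one list of (grad, var) pairs per tower
for i in range(2):
    with tf.device('/gpu:%d' % i):
        v = tf.get_variable('w%d' % i, shape=[3, 3])
        g = tf.random_normal([3, 3])  # stand-in gradient for v
        tower_grads.append([(g, v)])

agg = aggregate_gradients_using_nccl(tower_grads)  # same nesting, grads now NCCL-summed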
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
  """Apply all-reduce algorithm over specified gradient tensors."""
  with tf.name_scope('allreduce'):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    scaled_grads = [g for g, _ in grad_and_vars]
    if alg == 'nccl':
      summed_grads = nccl.all_sum(scaled_grads)
    elif alg == 'simple':
      summed_grads = build_reduce_sum(scaled_grads)
    elif alg == 'trivial':
      summed_grads = build_trivial_sum(scaled_grads)
    elif alg == 'xring':
      summed_grads = all_reduce.build_ring_all_reduce(
          scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
    elif alg == 'nccl/xring':
      summed_grads = all_reduce.build_nccl_then_ring(
          scaled_grads, num_shards, tf.add)
    elif alg == 'nccl/rechd':
      summed_grads = all_reduce.build_nccl_then_recursive_hd(
          scaled_grads, tf.add)
    elif alg == 'nccl/pscpu':
      summed_grads = all_reduce.build_nccl_then_shuffle(
          scaled_grads, aux_devices, tf.add, tf.add_n)
    elif alg == 'pscpu/pscpu':
      summed_grads = all_reduce.build_shuffle_then_shuffle(
          scaled_grads, aux_devices,
          # TODO(tucker): devise a way of better specifying the device
          # for the second level.
          [aux_devices[0]], tf.add_n)
    elif alg in ['pscpu', 'psgpu']:
      summed_grads = all_reduce.build_shuffle_all_reduce(
          scaled_grads, aux_devices, tf.add_n)
    else:
      raise ValueError('unsupported all_reduce alg: ', alg)

  result = []
  for (_, v), g in zip(grad_and_vars, summed_grads):
    result.append([g, v])
  return result
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
  """Apply all-reduce algorithm over specified gradient tensors."""
  with tf.name_scope('allreduce'):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    scaled_grads = [g for g, _ in grad_and_vars]
    if alg == 'nccl':
      summed_grads = nccl.all_sum(scaled_grads)
    elif alg == 'xring':
      summed_grads = all_reduce.build_ring_all_reduce(
          scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
    elif alg == 'nccl/xring':
      summed_grads = all_reduce.build_nccl_then_ring(
          scaled_grads, num_shards, tf.add)
    elif alg == 'nccl/rechd':
      summed_grads = all_reduce.build_nccl_then_recursive_hd(
          scaled_grads, tf.add)
    elif alg == 'nccl/pscpu':
      summed_grads = all_reduce.build_nccl_then_shuffle(
          scaled_grads, aux_devices, tf.add, tf.add_n)
    elif alg == 'pscpu/pscpu':
      summed_grads = all_reduce.build_shuffle_then_shuffle(
          scaled_grads, aux_devices,
          # TODO(tucker): devise a way of better specifying the device set
          # for the second level.
          [aux_devices[0]], tf.add_n)
    elif alg in ['pscpu', 'psgpu']:
      summed_grads = all_reduce.build_shuffle_all_reduce(
          scaled_grads, aux_devices, tf.add_n)
    else:
      raise ValueError('unsupported all_reduce alg: ', alg)

  result = []
  for (_, v), g in zip(grad_and_vars, summed_grads):
    result.append([g, v])
  return result
def _allreduce_grads(tower_grads):
    from tensorflow.contrib import nccl
    nr_tower = len(tower_grads)
    if nr_tower == 1:
        return [[x] for x in tower_grads[0]]
    new_tower_grads = []
    with tf.name_scope('AvgGrad'):
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                with tf.device(g.device):
                    g = tf.multiply(g, 1.0 / nr_tower)
                    grads_for_a_var.append((g, v))
            new_tower_grads.append(grads_for_a_var)
    # NVar * NGPU * 2
    return new_tower_grads
def _allreduce_grads(tower_grads):
    from tensorflow.contrib import nccl
    nr_tower = len(tower_grads)
    if nr_tower == 1:
        return tower_grads[0]
    new_tower_grads = []
    with tf.name_scope('AvgGrad'):
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            if not MultiGPUTrainerBase.check_none_grads(v.op.name, grads):
                continue
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                grads_for_a_var.append((g, v))
            new_tower_grads.append(grads_for_a_var)
    # NVar * NGPU * 2
    return new_tower_grads
def testCombined(self):
  if not test.is_gpu_available():
    return  # Test requires access to a GPU

  for dtype in [np.float32, np.int32, np.int64, np.float64]:
    # Create session inside outer loop to test use of
    # same communicator across multiple sessions.
    with self.test_session(use_gpu=True) as sess:
      for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'],
                      ['/device:GPU:0', '/device:GPU:0']]:
        shape = (3, 4)

        # all-reduce
        np_ans = np.zeros(shape=shape, dtype=dtype)
        tensors = []
        for d in devices:
          with ops.device(d):
            t = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
            np_ans += t
            tensors.append(array_ops.identity(t))
        all_reduce_tensors = nccl.all_sum(tensors)

        sender = np.random.randint(0, len(devices) - 1)
        other_devices = devices[:sender] + devices[sender + 1:]
        send_op, received_tensors = nccl.broadcast(
            all_reduce_tensors[sender], other_devices)

        # sender doesn't need to be fetched as part of outputs of session.run.
        del all_reduce_tensors[sender]

        # Verify shape inference.
        for r in received_tensors:
          self.assertEqual(shape, r.get_shape())

        # Run and verify results.
        nccl_results = sess.run(
            received_tensors + [send_op] + all_reduce_tensors)
        for r in nccl_results[:len(received_tensors)]:
          self.assertAllClose(r, np_ans)
def allreduce_gradients_bak(tower_grads):
    from tensorflow.contrib import nccl
    nr_tower = len(tower_grads)
    new_all_grads = []  # NVar * NGPU * 2
    with tf.name_scope('gradient_allreduce'):
        for grad_and_vars in zip(*tower_grads):
            # v = grad_and_vars[0][1]
            grads = [g for g, _ in grad_and_vars]
            summed = nccl.all_sum(grads)

            grads_for_a_var = []
            for (_, v), g in zip(grad_and_vars, summed):
                with tf.device(g.device):
                    g = tf.multiply(g, 1.0 / nr_tower)
                    grads_for_a_var.append((g, v))
            new_all_grads.append(grads_for_a_var)

    # transpose
    ret = [list(k) for k in zip(*new_all_grads)]
    return ret
def allreduce_grads(all_grads, average=True):
    from tensorflow.contrib import nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads_and_vars in zip(*all_grads):
        grads = [g for g, _ in grads_and_vars]
        _vars = [v for _, v in grads_and_vars]
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower, name='allreduce_avg')
            grads_for_devices.append(g)
        new_all_grads.append(zip(grads_for_devices, _vars))

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
def testCombined(self):
  if not test.is_gpu_available():
    return  # Test requires access to a GPU

  for dtype in [np.float32, np.int32, np.int64, np.float64]:
    # Create session inside outer loop to test use of
    # same communicator across multiple sessions.
    with self.test_session(use_gpu=True) as sess:
      for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'],
                      ['/device:GPU:0', '/device:GPU:0']]:
        shape = (3, 4)

        # all-reduce
        np_ans = np.zeros(shape=shape, dtype=dtype)
        tensors = []
        for d in devices:
          with ops.device(d):
            t = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
            np_ans += t
            tensors.append(array_ops.identity(t))
        all_reduce_tensors = nccl.all_sum(tensors)

        sender = np.random.randint(0, len(devices) - 1)
        other_devices = devices[:sender] + devices[sender + 1:]
        send_op, received_tensors = nccl.broadcast(all_reduce_tensors[sender],
                                                   other_devices)

        # sender doesn't need to be fetched as part of outputs of session.run.
        del all_reduce_tensors[sender]

        # Verify shape inference.
        for r in received_tensors:
          self.assertEqual(shape, r.get_shape())

        # Run and verify results.
        nccl_results = sess.run(
            received_tensors + [send_op] + all_reduce_tensors)
        for r in nccl_results[:len(received_tensors)]:
          self.assertAllClose(r, np_ans)
def testErrors(self):
  with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
    nccl.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
  with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
    nccl.all_sum([])
def __call__(self, **inputs):
    # Inputs
    images_splits = tf.split(axis=0, num_or_size_splits=self.num_gpus, value=inputs['images'])
    labels_splits = tf.split(axis=0, num_or_size_splits=self.num_gpus, value=inputs['labels'])

    # Inference
    tower_grads = []
    tower_losses = []
    for device_id in xrange(self.num_gpus):
        with tf.variable_scope('replicated_%s' % device_id):
            with tf.name_scope('TOWER_%d' % device_id) as name_scope:
                with tf.device('/gpu:%d' % device_id):
                    # Forward
                    pre_logits = self.model(images_splits[device_id], is_training=True)
                    logits = fully_connected(pre_logits, num_outputs=inputs['num_classes'],
                                             activation_fn=None, biases_initializer=None,
                                             weights_regularizer=l2_regularizer(0.0005))
                    # Losses
                    losses, losses_name = loss_function(logits, labels_splits[device_id],
                                                        scope=name_scope)
                    total_loss = tf.add_n(losses, name='total_loss')
                    # Variables
                    params = [v for v in tf.trainable_variables()
                              if v.name.startswith('replicated_%s/' % device_id)]
                    # Gradients
                    grads = tf.gradients(total_loss, params,
                                         aggregation_method=tf.AggregationMethod.DEFAULT)
                    grads = [grad / self.num_gpus for grad in grads]
                    gradvars = list(zip(grads, params))
                    for grad, var in gradvars:
                        if grad is not None:
                            tf.summary.histogram(var.name, var)
                            tf.summary.histogram(var.op.name + '/gradients', grad)
                    # Tower grads, losses and updates
                    tower_grads.append(gradvars)
                    tower_losses.append(losses)
                    if device_id == 0:
                        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
                    print('Tower %d has been inferenced.' % device_id)

    # Allreduce losses
    allreduce_losses = [tf.add_n(losses) / self.num_gpus for losses in zip(*tower_losses)]

    # Allreduce gradients
    allreduce_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = [g for g, _ in grad_and_vars]
        summed_grads = nccl.all_sum(grads)
        new_grads_and_vars = [(g, v) for (_, v), g in zip(grad_and_vars, summed_grads)]
        allreduce_grads.append(new_grads_and_vars)
    grad_state = [list(x) for x in zip(*allreduce_grads)]

    # Optimizer
    tower_train_ops = []
    for device_id in xrange(self.num_gpus):
        with tf.device('/gpu:%d' % device_id):
            # Gradients of TOWER_(device_id)
            grads = grad_state[device_id]
            # Optimizer configure
            opt = tf.train.MomentumOptimizer(self.lr, 0.9)
            # Tower train_ops
            tower_train_ops.append(opt.apply_gradients(grads))
            print('Optimizer %d has been configured.' % device_id)

    global_step = tf.train.get_global_step()
    global_step_op = global_step.assign_add(1)
    train_ops = tf.group(*(tower_train_ops + update_ops + [global_step_op]))

    return train_ops, self.lr, allreduce_losses, losses_name
import tensorflow as tf
from tensorflow.contrib.nccl import all_sum
# from tensorflow.contrib.rccl import all_sum

with tf.device('/gpu:0'):
    a = tf.get_variable(
        "a", initializer=tf.constant(1.0, shape=(2, 2)))

with tf.device('/gpu:1'):
    b = tf.get_variable(
        "b", initializer=tf.constant(2.0, shape=(2, 2)))

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True))
init = tf.global_variables_initializer()
sess.run(init)

with tf.device('/gpu:0'):
    summed = sess.run(all_sum([a, b]))

print(summed[0])
print(summed[1])

# expected output
# [[3. 3.]
#  [3. 3.]]
# [[3. 3.]
#  [3. 3.]]
def model(X, S, Y, hps, train=False, ema=None):
    xs = tf.split(X, hps.ngpu, 1)
    ys = tf.split(Y, hps.ngpu, 1)
    ss = tf.split(S, hps.ngpu, 2 - hps.axis)

    losses = []
    states = []
    grads = []
    for gpu in range(hps.ngpu):
        with tf.device("/gpu:%d" % gpu), tf.variable_scope("model%d" % gpu, reuse=not train):
            lstm_model = LSTM_Model(hps, train)
            loss, state = lstm_model.forward(xs[gpu], ss[gpu], ys[gpu], ema=ema)
            losses.append(loss)
            states.append(state)
            if train:
                grads.append(lstm_model.backward())

    if train:
        ngrads = len(grads[0])

        if hps.ngpu > 1:
            # all reduce grads
            for i in range(ngrads):
                sum_grads = nccl.all_sum(
                    [grads[gpu][i][0] for gpu in range(hps.ngpu)])
                for gpu in range(hps.ngpu):
                    grads[gpu][i] = (sum_grads[gpu], grads[gpu][i][1])

        train = list()
        for gpu, gpu_grads in enumerate(grads):
            with tf.device("/gpu:%d" % gpu), tf.variable_scope("opt%d" % gpu):

                # compute average from sum
                if hps.ngpu > 1:
                    for i in range(ngrads):
                        # Note the scalar division must appear in a device context, otherwise
                        # it will do a whole lot of unnecessary gpu to gpu copying.
                        # Also rebuild the tuple.
                        gpu_grads[i] = (gpu_grads[i][0] / float(hps.ngpu), gpu_grads[i][1])

                if hps.optimizer == 'adam_old':
                    trainer = tf.train.AdamOptimizer(learning_rate=hps.lr, beta2=hps.beta2)
                    train.append(trainer.apply_gradients(gpu_grads))
                else:
                    param_grads = [gpu_grads[i][0] for i in range(ngrads)]
                    param_names = [gpu_grads[i][1] for i in range(ngrads)]
                    if hps.optimizer == 'adam':
                        train.append(layers.adam_updates(
                            param_names, param_grads, lr=hps.lr, mom2=hps.beta2,
                            gamma=hps.gamma))
                    if hps.optimizer == 'adamax':
                        train.append(layers.adamax_updates(
                            param_names, param_grads, lr=hps.lr, mom2=hps.beta2))

        train = tf.group(*train)
    else:
        train = None

    states = tf.concat(states, 2 - hps.axis)

    return train, tf.add_n(losses) / hps.ngpu, states
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_gpus', default=2, type=int)
    parser.add_argument('--max_step', default=1000, type=int)
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
        [str(i) for i in range(args.num_gpus)])

    # avoid unimplemented gpu kernel error
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        dataset = build_dataset(args.num_gpus)
        iterator = dataset.make_initializable_iterator()
        tower_batches = iterator.get_next()

        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        for index, tower_batch in enumerate(tower_batches):
            # by-device variable scope
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):
                tower_loss = build_tower(tower_batch)
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_loss, tower_tvars)

                tower_loss_list.append(tower_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)

                if index == 0:
                    # only one variable global saver
                    def clean(name):
                        name = re.sub('^tower_\d+/', '', name)
                        name = re.sub(':\d+$', '', name)
                        return name

                    save_dict = {clean(var.name): var for var in tower_gvars}
                    saver = tf.train.Saver(save_dict)

        with tf.name_scope("tower_gvar_sync"):
            # different device is init with different random seed
            # need explicit synchronization before training!!!
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for vars in zip(*tower_gvars_list):
                    for var in vars[1:]:
                        sync_ops.append(tf.assign(var, vars[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                # nccl.all_sum will automatically
                # convert sparse gradients into dense one
                avg_tower_grads_list.append(nccl.all_sum(grads_to_avg))
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.name_scope('metrics'):
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)

        train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                    tf.device('/gpu:%d' % index):
                tower_grads = [
                    grad / len(tower_batches) for grad in tower_grads
                ]
                if index == 0:
                    # only increment global step with the first worker
                    step = tf.train.get_or_create_global_step()
                tower_optimizer = tf.train.AdamOptimizer()
                tower_train_op = tower_optimizer.apply_gradients(
                    zip(tower_grads, tower_vars),
                    global_step=step if index == 0 else None)
                train_ops.append(tower_train_op)
        train_op = tf.group(train_ops)

        # start running
        sess.run(tf.global_variables_initializer())
        sess.run(iterator.initializer)
        # important to sync variables before training!
        sess.run(tower_gvar_sync)

        while True:
            try:
                fetch_loss, fetch_step, _ = sess.run([loss, step, train_op])
                if fetch_step % 20 == 0:
                    print("step: %d, loss: %.4f" % (fetch_step, fetch_loss))
                if fetch_step > args.max_step:
                    break
            except tf.errors.OutOfRangeError:
                break

        saver.save(sess, "./model")
def main(_):
    training = tf.Variable(True)
    accuracies = []
    training_steps = []
    optimisers = []
    device_grads = []
    losses = []
    for device_num in range(GPUS):
        with tf.variable_scope('v{}'.format(device_num)):
            with tf.device('/cpu:0'):
                train_path = os.path.join(FLAGS.data_dir, 'train')
                test_path = os.path.join(FLAGS.data_dir, 'test')
                x, y_ = get_iterators(train_path, test_path)

            with tf.device('/gpu:{}'.format(device_num)):
                y = get_model(x, training=training)

                cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                    labels=y_, logits=y)
                losses.append(cross_entropy)

                correct_prediction = tf.equal(
                    tf.cast(tf.argmax(y, 1), dtype=tf.int32), y_)
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))
                accuracies.append(accuracy)

                params = [
                    v for v in tf.get_collection('trainable_variables')
                    if v.name.startswith('v%s/' % device_num)
                ]
                opt = tf.train.GradientDescentOptimizer(0.1)
                optimisers.append(opt)
                grads = opt.compute_gradients(cross_entropy, params)
                device_grads.append(grads)

    new_device_grads = []
    for grad_and_vars in zip(*device_grads):
        scaled_grads = [g for g, _ in grad_and_vars]
        summed_grads = nccl.all_sum(scaled_grads)
        aggregated_device_grads = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            aggregated_device_grads.append([g, v])
        new_device_grads.append(aggregated_device_grads)

    aggregated_device_grads = [list(x) for x in zip(*new_device_grads)]

    training_ops = []
    for d, device in enumerate(['/gpu:{}'.format(x) for x in range(GPUS)]):
        with tf.device(device):
            opt = optimisers[d]
            avg_grads = aggregated_device_grads[d]
            training_ops.append(optimisers[d].apply_gradients(avg_grads))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if FLAGS.xla:
        # Turns on XLA JIT compilation.
        jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level
    run_metadata = tf.RunMetadata()
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    local_var_init_op = tf.local_variables_initializer()
    variable_mgr_init_ops = [local_var_init_op]
    with tf.control_dependencies([local_var_init_op]):
        variable_mgr_init_ops.extend(get_post_init_ops())
    local_var_init_op_group = tf.group(*variable_mgr_init_ops)
    sess.run(local_var_init_op_group)

    # Get handles to enable iterator feeding.
    sess.run([
        tf.get_collection('trn_iterator_inits'),
        tf.get_collection('val_iterator_inits')
    ])
    training_handles = sess.run(tf.get_collection('trn_iterator_handles'))
    test_handles = sess.run(tf.get_collection('test_iterator_handles'))
    feedable_handles = tf.get_collection('feedable_iterator_handles')
    training_feed_dict = dict(zip(feedable_handles, training_handles))
    test_feed_dict = dict(zip(feedable_handles, test_handles))

    # Train
    train_step = tf.group(training_ops)
    loss = tf.reduce_mean(losses)
    loss_window = 200
    loss_agg = np.zeros(loss_window)
    for i in range(FLAGS.train_loops):
        # Create a timeline for the last loop and export to json to view with
        # chrome://tracing/.
        if i == FLAGS.train_loops - 1:
            sess.run(
                [loss, train_step],
                feed_dict=training_feed_dict,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            with open('timeline.ctf.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format())
        else:
            l, _ = sess.run([loss, train_step], feed_dict=training_feed_dict)
            loss_agg[i % loss_window] = l
            print('Step: {}/{} Loss: {}'.format(i, FLAGS.train_loops,
                                                np.mean(loss_agg)), end="\r")

    # Print loss as it's overwritten in log
    print('Loss: {}'.format(np.mean(loss_agg)))

    # Change dataset to test version
    # Assign training = false
    sess.run(
        [tf.get_collection('test_iterator_inits'), training.assign(False)])

    print('Accuracy:', sess.run(accuracy, feed_dict=test_feed_dict))
    sess.close()
import tensorflow as tf
from itertools import repeat
from tensorflow.contrib.nccl import all_sum

with tf.device('/gpu:0'):
    g0 = tf.placeholder(tf.float32, (2, 2), "g0")

with tf.device('/gpu:1'):
    g1 = tf.placeholder(tf.float32, (2, 2), "g1")

all_reduce_sum = all_sum([g0, g1])

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True,
                                        allow_soft_placement=False))
init = tf.global_variables_initializer()
sess.run(init)

r = [[1, 1], [1, 1]], [[2, 2], [2, 2]]
for x, y in repeat(r):
    sess.run(all_reduce_sum, feed_dict={g0: x, g1: y})
import tensorflow as tf
from tensorflow import logging
import tokenization
import util
import os
from tensorflow.python.framework import ops
from tensorflow.contrib import nccl

dim = 10000

with tf.device('/gpu:0'):
    a = tf.get_variable("a", initializer=tf.constant(1.0, shape=(dim, dim)))

with tf.device('/gpu:1'):
    b = tf.get_variable("b", initializer=tf.constant(2.0, shape=(dim, dim)))

with tf.device('/gpu:0'):
    summed_node = nccl.all_sum([a, b])

for i in summed_node:
    print('before', i, i.device)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True))
init = tf.global_variables_initializer()
sess.run(init)

with tf.device('/gpu:0'):
    summed = sess.run(summed_node)

# print('summed: ', summed)
def train(config, restore=False):
    sess_config = tf.ConfigProto()
    sess_config.allow_soft_placement = True
    sess_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), \
            tf.Session(config=sess_config) as sess:
        logger.info("Attempt to load embedding.")
        embedding_init = np.load(config.embed_path).astype(np.float32)
        logger.info("Done.")

        logger.info("Prepare datasets...")
        with open(config.vocab_path, 'r') as fin:
            vocabulary = [line.strip() for line in fin.readlines()]
        vocab_table = tf.contrib.lookup.index_table_from_tensor(
            vocabulary, default_value=137)  # default is unk
        doc_table = tf.contrib.lookup.index_table_from_tensor(
            ['1', '0', '-1', '-2'], default_value=-1)
        train_set = get_csv_dataset([config.train_path],
                                    vocab_table,
                                    doc_table,
                                    config.batch_size,
                                    num_sub_batch=config.num_gpus,
                                    shuffle=True,
                                    bucket_width=100)
        train_eval_set = get_csv_dataset([config.train_eval_path],
                                         vocab_table,
                                         doc_table,
                                         config.eval_batch_size,
                                         config.num_gpus,
                                         shuffle=False)
        valid_eval_set = get_csv_dataset([config.valid_path],
                                         vocab_table,
                                         doc_table,
                                         config.eval_batch_size,
                                         config.num_gpus,
                                         shuffle=False)
        iterator = tf.data.Iterator.from_structure(train_set.output_types,
                                                   train_set.output_shapes)
        train_iter_init = iterator.make_initializer(train_set)
        train_eval_iter_init = iterator.make_initializer(train_eval_set)
        valid_eval_iter_init = iterator.make_initializer(valid_eval_set)
        logger.info("Done.")

        # build model
        logger.info("Build train graph...")
        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        tower_labels_list = []
        tower_oh_preds_list = []
        tower_batches = iterator.get_next()
        for index, tower_batch in enumerate(tower_batches):
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):
                tower_ids, tower_raw_seqs, tower_seqs, tower_lengths, tower_labels = tower_batch
                tower_train_loss, tower_eval_oh_preds, tower_elmo_saver = \
                    build_tower(config, tower_seqs, tower_lengths, tower_labels,
                                initializers={"embedding_init": embedding_init})
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_train_loss, tower_tvars)

                tower_loss_list.append(tower_train_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)
                tower_labels_list.append(tower_labels)
                tower_oh_preds_list.append(tower_eval_oh_preds)

                if index == 0:
                    saver = tf.train.Saver(tower_gvars)
                    elmo_saver = tower_elmo_saver

        with tf.name_scope("tower_gvar_sync"):
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for vars in zip(*tower_gvars_list):
                    for var in vars[1:]:
                        sync_ops.append(tf.assign(var, vars[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                if None in grads_to_avg:
                    avg_tower_grads_list.append(grads_to_avg)
                    continue
                avg_tower_grads_list.append(nccl.all_sum(grads_to_avg))
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.device('/gpu:0'), tf.name_scope('metrics'):
            # metrics
            labels = tf.concat(tower_labels_list, axis=0)
            # [batch_size, num_aspects, num_labels]
            oh_preds = tf.concat(tower_oh_preds_list, axis=0)
            # [batch_size, num_aspects, num_labels]
            oh_labels = tf.one_hot(labels, depth=4, on_value=True,
                                   off_value=False, dtype=tf.bool)
            tps = tf.get_local_variable("tps", shape=[20, 4], dtype=tf.float64)
            fps = tf.get_local_variable("fps", shape=[20, 4], dtype=tf.float64)
            fns = tf.get_local_variable("fns", shape=[20, 4], dtype=tf.float64)

            def cross_and_sum(pred_bool, label_bool):
                cross = tf.logical_and(tf.equal(oh_preds, pred_bool),
                                       tf.equal(oh_labels, label_bool))
                return tf.reduce_sum(tf.cast(cross, tf.float64), axis=0)

            f1_updates = tf.group(
                tf.assign_add(tps, cross_and_sum(pred_bool=True, label_bool=True)),
                tf.assign_add(fps, cross_and_sum(pred_bool=True, label_bool=False)),
                tf.assign_add(fns, cross_and_sum(pred_bool=False, label_bool=True)),
            )
            precisions = tps / (tps + fps + 1e-50)
            recalls = tps / (tps + fns + 1e-50)
            f1s = 2 * precisions * recalls / (precisions + recalls + 1e-50)
            macro_f1 = tf.reduce_mean(f1s)
            metrics_update = tf.group(f1_updates)

            # train loss
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)

        tower_train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                    tf.device('/gpu:%d' % index):
                tower_grads = [
                    grad / len(tower_batches) if grad is not None else None
                    for grad in tower_grads
                ]
                if index == 0:
                    global_step = tf.train.get_or_create_global_step()
                    lr = cyclic_learning_rate(global_step=global_step,
                                              min_lr=0.00005,
                                              max_lr=0.002,
                                              step_size=8205)
                tower_optimizer = tf.contrib.opt.NadamOptimizer(lr)
                tower_grads, _ = tf.clip_by_global_norm(
                    tower_grads, config.grad_clip_max_norm)
                tower_train_op = tower_optimizer.apply_gradients(
                    zip(tower_grads, tower_vars),
                    global_step=global_step if index == 0 else None)
                tower_train_ops.append(tower_train_op)
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_op = tf.group(tower_train_ops)
        logger.info("Done.")

        # start training
        logger.info("Init model...")
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            tf.tables_initializer()
        ])
        logger.info("Done.")

        if elmo_saver is not None:
            logger.info("Restoring elmo...")
            elmo_saver.restore(sess, config.elmo_path)
            logger.info("Done.")

        if restore:
            logger.info("Restore model from {}".format(config.model_path))
            saver.restore(sess, config.model_path)
            logger.info("Done.")

        logger.info("Synchronize towers...")
        sess.run(tower_gvar_sync)
        logger.info("Done.")

        fetch_dict = {
            'loss': loss,
            'train_op': train_op,
            'step': global_step,
            'lr': lr
        }
        loss_tracker = CSVTracker(fields=['epoch', 'step', 'loss', 'lr'],
                                  fmts=['%d', '%d', "%.4f", '%g'],
                                  start_time=config.start_time,
                                  log_dir=config.output_dir,
                                  filename='loss')
        acc_tracker = StatefulTracker(
            cmp_field="valid_f1",
            fields=["epoch", "train_f1", "valid_f1", "diff_f1"],
            log_dir=config.output_dir,
            start_time=config.start_time,
            filename='acc')

        def _train(iter_init, epoch):
            sess.run([iter_init])
            fetch = {"epoch": epoch}
            step = sess.run(global_step)
            while True:
                try:
                    if step % 50 == 0:
                        fetch.update(sess.run(fetch_dict))
                        loss_tracker.track(fetch)
                    else:
                        sess.run(train_op)
                    step += 1
                except tf.errors.OutOfRangeError:
                    break

        def _evaluate(iter_init):
            timer = Timer()
            sess.run([iter_init, tf.local_variables_initializer()])
            while True:
                try:
                    sess.run(metrics_update)
                except tf.errors.OutOfRangeError:
                    break
            logger.info("Time elapsed: %s" % timer.tock())
            fetch_macro_f1 = sess.run(macro_f1)
            return fetch_macro_f1

        logger.info("Start training.")
        for epoch in range(config.max_epoch):
            _train(train_iter_init, epoch)
            logger.info("Evaluate train set...")
            train_f1 = _evaluate(train_eval_iter_init)
            logger.info("Evaluate valid set...")
            valid_f1 = _evaluate(valid_eval_iter_init)
            acc_tracker.track(
                dict(epoch=epoch,
                     train_f1=train_f1,
                     valid_f1=valid_f1,
                     diff_f1=train_f1 - valid_f1))
            if acc_tracker.improved:
                logger.info("Save checkpoint to {}".format(repr(config.model_path)))
                saver.save(sess, config.model_path)
                logger.info("Done.")
            if acc_tracker.staled_tracks > config.early_stop_epoch:
                logger.warning("Stop improve for %d epoch, early stop." %
                               acc_tracker.staled_tracks)
                break
        logger.info("Finish training.")