def testGetOrCreateGlobalStep(self):
  """Checks that every global-step lookup returns the very same object.

  The step is requested alternately through
  `py_utils.GetOrCreateGlobalStep()` and `tf.train.get_global_step()`; all
  results must be identical to the first one, and its name must be exactly
  'global_step:0' (i.e. carry no scope prefix).
  """
  # NOTE(review): the original indentation was lost. The nesting below
  # (inside 's1'/'s2', inside 's1' only, outside all scopes) is a
  # reconstruction -- confirm against the upstream test.
  with tf.variable_scope('s1'):
    with tf.name_scope('s2'):
      gs1 = py_utils.GetOrCreateGlobalStep()
      gs2 = tf.train.get_global_step()
    gs3 = py_utils.GetOrCreateGlobalStep()
    gs4 = tf.train.get_global_step()
  gs5 = py_utils.GetOrCreateGlobalStep()
  gs6 = tf.train.get_global_step()
  for gs in (gs2, gs3, gs4, gs5, gs6):
    # assertIs reports both operands on failure, unlike assertTrue(a is b).
    self.assertIs(gs1, gs)
  self.assertEqual(gs1.name, 'global_step:0')
def _testGPipeTransformerStackTrainEncoderTransparentFProp(
    self, splits=1, num_micro_batches=1):
  """Checks FProp values of a transparent 2-encoder / 2-decoder GPipe stack.

  Args:
    splits: Forwarded to `self._TransformerParams` as `splits`.
    num_micro_batches: Forwarded to `self._TransformerParams` as
      `num_micro_batches`.
  """
  # time = 2
  # `batch` must be bound here: it is passed to `_random_inputs` and used to
  # replicate the expected rows below.
  batch = 4
  with self.session() as sess:
    params = self._TransformerParams(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=2,
        num_encoder_layers=2)
    params.is_transparent = True
    params.num_transparent_outputs = 1
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)

    inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
        batch=batch)

    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_output = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    dec_output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                               tgt_paddings)
    enc_out = sess.run(enc_output)
    dec_out = sess.run(dec_output)
    # Each expected row is repeated `batch` times.
    self.assertAllClose(enc_out, [[[-0.118476, 1.031626]] * batch,
                                  [[0.643884, -1.02581167]] * batch])
    self.assertAllClose(dec_out, [[[-2.8764534, 1.00808454]] * batch,
                                  [[1.02129495, -0.78406084]] * batch,
                                  [[1.02129495, -0.78406084]] * batch])
def _testGPipeTransformerStackTrainTransparentFProp(
    self, splits=1, num_micro_batches=1):
  """Checks FProp values of a transparent 1-encoder / 3-decoder GPipe stack.

  With `num_transparent_outputs = 3` the encoder FProp is unpacked into three
  outputs, which are asserted to be equal to each other before being compared
  to the golden values.

  Args:
    splits: Forwarded to `self._TransformerParams` as `splits`.
    num_micro_batches: Forwarded to `self._TransformerParams` as
      `num_micro_batches`.
  """
  # time = 2
  # `batch` must be bound here: it is passed to `_random_inputs` and used to
  # replicate the expected rows below.
  batch = 4
  with self.session() as sess:
    params = self._TransformerParams(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=3,
        num_encoder_layers=1)
    params.is_transparent = True
    params.num_transparent_outputs = 3
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)

    inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
        batch=batch)

    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    dec_output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                               tgt_paddings)
    enc_out_1, enc_out_2, enc_out_3 = sess.run(enc_outputs)
    dec_out = sess.run(dec_output)
    self.assertAllClose(enc_out_1, enc_out_2)
    self.assertAllClose(enc_out_2, enc_out_3)
    # Each expected row is repeated `batch` times.
    self.assertAllClose(enc_out_1, [[[-0.27896273, 1.46589136]] * batch,
                                    [[1.03141928, -0.847896]] * batch])
    self.assertAllClose(dec_out, [[[2.926736, -4.090812]] * batch,
                                  [[-1.69508219, 1.75891459]] * batch,
                                  [[-1.6950829, 1.75891507]] * batch])
def testTransformerAttentionLayerDeterministicDropout(self):
  """Compares attention context/probs against golden values.

  The layer is built with `DeterministicDropoutLayer` as its residual dropout
  template (prob 0.1), two attention heads and depth 4.
  """
  # pylint: disable=bad-whitespace
  # pyformat: disable
  golden_ctx = np.array([
      [[-1.45762944,  1.5337404 ,  0.34037334, -0.97208667],
       [-1.35992002, -1.06530988,  1.53705895,  2.79370689]],
      [[ 0.00657134,  1.12030125, -1.32564592, -1.73569465],
       [-0.80793667, -0.10877949, -0.80295694,  2.25494242]],
      [[ 1.76956046, -0.50777751, -1.19745886, -1.46751583],
       [-1.79178905, -0.77374339,  1.31586027,  2.98173356]],
      [[-0.85498607, -0.37413225,  1.25707364, -0.50043333],
       [ 1.62276983,  0.50820369, -1.52967572, -2.02076197]],
      [[-0.66754031, -0.68657839, -0.51643699,  1.96581018],
       [-1.4816376 ,  0.89419198, -0.57226259,  1.90177512]]
  ], dtype=np.float32)
  golden_probs = np.array([
      [[ 0.21387868,  0.22080734,  0.        ,  0.        ,  0.56531399],
       [ 0.        ,  0.30584112,  0.24723588,  0.44692296,  0.        ]],
      [[ 0.25358215,  0.50932312,  0.        ,  0.        ,  0.23709476],
       [ 0.        ,  0.56834149,  0.2632803 ,  0.16837817,  0.        ]],
      [[ 0.38519409,  0.55454361,  0.        ,  0.        ,  0.06026226],
       [ 0.        ,  0.33708778,  0.21976741,  0.4431448 ,  0.        ]],
      [[ 0.27139962,  0.12790371,  0.        ,  0.        ,  0.60069668],
       [ 0.        ,  0.31849149,  0.28174096,  0.39976761,  0.        ]],
      [[ 0.16272782,  0.15781289,  0.        ,  0.        ,  0.67945927],
       [ 0.        ,  0.55003977,  0.26049581,  0.18946445,  0.        ]]
  ], dtype=np.float32)
  # pyformat: enable
  # pylint: enable=bad-whitespace

  with self.session(use_gpu=True) as sess:
    # Needed to generate a seed pair.
    py_utils.ResetStepSeed()
    py_utils.GetOrCreateGlobalStep()

    depth = 4
    layer_params = layers_with_attention.TransformerAttentionLayer.Params()
    layer_params.name = 'transformer_atten'
    layer_params.source_dim = depth
    layer_params.is_masked = False
    layer_params.num_attention_heads = 2
    layer_params.residual_dropout_tpl = (
        layers.DeterministicDropoutLayer.Params())
    layer_params.residual_dropout_prob = 0.1
    atten_layer = layers_with_attention.TransformerAttentionLayer(layer_params)

    layer_inputs = self._testTransformerAttentionLayerInputs(depth=depth)
    source_vecs, source_padding, _, _ = layer_inputs

    ctx, probs = atten_layer.FProp(atten_layer.theta, source_vecs,
                                   source_padding)
    tf.global_variables_initializer().run()
    actual_ctx, actual_probs = sess.run([ctx, probs])

    # Dump the actual values so golden arrays can be refreshed from the log.
    print(np.array_repr(actual_ctx))
    print(np.array_repr(actual_probs))

    self.assertAllClose(golden_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
    self.assertAllClose(golden_probs, actual_probs, rtol=1e-05, atol=1e-05)
def _verify_timestep_counts(self, num_splits):
  """Builds the dummy pipeline CNN and checks gradient norm and accumulators.

  Args:
    num_splits: Number of splits passed to `_BuildDummyPipelineCnn`. When it
      is greater than 1 every accumulator is expected to equal the number of
      micro batches, otherwise 1.
  """
  num_micro_batches = 8
  batch_size = 16
  graph = tf.Graph()
  with graph.as_default():
    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1245)
    inputs = tf.random_uniform([batch_size, 8, 8, 1])
    net = _BuildDummyPipelineCnn(
        num_splits=num_splits, num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    # The network may return either a single tensor or a (logits, aux) pair.
    logits, aux_logits = endpoints, None
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

  # Accumulator values should be equal to number of time steps in pipeline.
  expected_ts = 1
  if num_splits > 1:
    expected_ts = num_micro_batches

  with self.session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    self.assertNear(grad_norm_val, 0.269997, err=1.0e-6)
    for ts_val in ts_vals:
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
def GetOverWriteGlobalStep(graph=None):
  """Returns the overwrite global-step tensor, or the regular global step.

  Args:
    graph: Graph whose `_OVERWRITE_GLOBAL_STEP_COLLECTION` is inspected.
      Falls back to the default graph when falsy.

  Returns:
    The single tensor stored in the collection when it holds exactly one
    entry; otherwise the result of `py_utils.GetOrCreateGlobalStep()`.
  """
  target_graph = graph or tf.get_default_graph()
  overrides = target_graph.get_collection_ref(
      _OVERWRITE_GLOBAL_STEP_COLLECTION)
  if len(overrides) != 1:
    return py_utils.GetOrCreateGlobalStep()
  return overrides[0]
def _InputBatchFromCKPT(self):
  """Builds one input batch by slicing a tensor restored from a checkpoint.

  The tensor named `p.data` is restored from `p.ckpt`, trimmed and reshaped
  to `[batch_size, total_batches]`, and a window of at most `p.num_steps`
  columns is gathered from it. The window index comes from a repeating
  dataset counter when `p.eval` is set, and from the global step minus one
  otherwise.

  Returns:
    A `py_utils.NestedMap` with `ids`, `labels`, `weights`, `paddings`,
    `word_count` and `take_last_state`.
  """
  p = self.params

  @function.Defun()
  def ReadData():
    restored, = io_ops.restore_v2(p.ckpt, [p.data], [''], [p.data_dtype])
    return restored

  # Loads data and label into memory and keep it around.
  data, = py_x_ops.cached_call(f=ReadData, T=[p.data_dtype])

  batch_size = p.batch_size
  num_tokens = p.data_shape[0]
  total_batches = num_tokens // batch_size
  total_steps = total_batches // p.num_steps
  if total_batches % p.num_steps > 0:
    # A final, shorter window covers the remainder.
    total_steps += 1

  if p.eval:
    step_counter = tf.data.Dataset.range(total_steps).repeat()
    global_step = step_counter.make_one_shot_iterator().get_next()
  else:
    global_step = py_utils.GetOrCreateGlobalStep() - 1

  batch_id = tf.to_int32(global_step % total_steps)
  data = data[:total_batches * batch_size]
  data = tf.reshape(data, [batch_size, total_batches])

  start = p.num_steps * batch_id
  end = tf.minimum(tf.to_int32(total_batches), start + p.num_steps)
  id_columns = tf.range(start, end, dtype=tf.int32)
  ids = tf.gather(data, id_columns, axis=1, name='ids')

  # Labels are the ids shifted by one column, clipped at the end of the data.
  label_end = tf.minimum(end + 1, tf.to_int32(total_batches))
  label_columns = tf.range(start + 1, label_end, dtype=tf.int32)
  labels = tf.gather(data, label_columns, axis=1, name='labels')

  ids = py_utils.PadOrTrimTo(ids, [batch_size, end - start])

  batch = py_utils.NestedMap()
  batch.ids = ids
  batch.labels = py_utils.PadOrTrimTo(labels, [batch_size, end - start])
  unpadded_weights = tf.ones([batch_size, label_end - start],
                             dtype=tf.float32)
  batch.weights = py_utils.PadOrTrimTo(unpadded_weights,
                                       [batch_size, end - start])
  batch.paddings = 1.0 - batch.weights
  batch.word_count = batch_size * (label_end - start - 1)
  batch.take_last_state = batch_id > 0
  return batch
def __init__(self, params):
  """Sets up the model's global step and, optionally, an EMA helper.

  Args:
    params: Model params; `params.cls` must be a subclass of `BaseModel`.
      `params.train.ema_decay` in (0, 1) enables an
      `ExponentialMovingAverage` driven by the global step.
  """
  assert issubclass(params.cls, BaseModel)
  super(BaseModel, self).__init__(params)
  self._global_step = py_utils.GetOrCreateGlobalStep()
  # tasks are not yet instantiated.
  self._total_examples_sum = None
  self._ema = None

  train_params = self.params.train
  tf.logging.info('Training parameters for %s: %s', params.cls, train_params)
  ema_decay = train_params.ema_decay
  if ema_decay > 0:
    assert ema_decay < 1.0
    self._ema = tf.train.ExponentialMovingAverage(
        decay=ema_decay, num_updates=self._global_step)
def testDecoderFPropDeterministicAttentionDropout(self):
  """Golden-value check of decoder FProp with deterministic attention dropout.

  The losses and the accumulated global/time step tensors are fetched twice,
  with the global step incremented by one in between.
  """
  accumulator_names = [
      'decoder_1/accumulated_global_steps:0',
      'decoder_1/accumulated_time_steps:0',
  ]
  with self.session(use_gpu=False) as sess:
    tf.set_random_seed(8372749040)
    vn_params = py_utils.VariationalNoiseParams(None, True, False, seed=1792)
    p = self._DecoderParams(vn_params)
    p.use_while_loop_based_unrolling = False
    p.attention.atten_dropout_prob = 0.5
    p.attention.atten_dropout_deterministic = True

    loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
    global_step = py_utils.GetOrCreateGlobalStep()
    tf.global_variables_initializer().run()

    def _FetchAndPrint():
      """Runs one fetch of losses and accumulators and logs the losses."""
      fetched = sess.run([loss, per_sequence_loss] + accumulator_names)
      print('loss = ', fetched[0], 'per sequence loss = ', fetched[1])
      return fetched

    loss_val, per_sequence_loss_val, global_steps_val, time_steps_val = (
        _FetchAndPrint())
    self.assertAllClose([3.473008, 15.0], loss_val)
    self.assertAllClose([13.563036, 10.053869, 10.362661, 18.115553],
                        per_sequence_loss_val)
    self.assertAllEqual([0, 0, 0, 0, 0], global_steps_val)
    self.assertAllEqual([1, 2, 3, 4, 5], time_steps_val)

    # Run another step to test global_step and time_step are incremented
    # correctly.
    sess.run(tf.assign_add(global_step, 1))
    loss_val, per_sequence_loss_val, global_steps_val, time_steps_val = (
        _FetchAndPrint())
    self.assertAllClose([3.567736, 15.0], loss_val)
    self.assertAllClose([14.730419, 10.176270, 10.73501, 17.87434578],
                        per_sequence_loss_val)
    self.assertAllEqual([1, 1, 1, 1, 1], global_steps_val)
    self.assertAllEqual([1, 2, 3, 4, 5], time_steps_val)
def testDecoderFPropDeterministicAttentionDropout(self):
  """Golden-value check of decoder FProp with deterministic attention dropout.

  Runs in a fresh `tf.Graph()`. The losses and the accumulated global/time
  step tensors are fetched twice, with the global step incremented by one in
  between.
  """
  accumulator_names = [
      'decoder_1/accumulated_global_steps:0',
      'decoder_1/accumulated_time_steps:0',
  ]
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    tf.set_random_seed(8372749040)
    vn_params = py_utils.VariationalNoiseParams(None, True, False, seed=1792)
    p = self._DecoderParams(vn_params)
    p.use_while_loop_based_unrolling = False
    p.attention.atten_dropout_prob = 0.5
    p.attention.atten_dropout_deterministic = True

    loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
    global_step = py_utils.GetOrCreateGlobalStep()
    tf.global_variables_initializer().run()

    def _FetchAndPrint():
      """Runs one fetch of losses and accumulators and logs the losses."""
      fetched = sess.run([loss, per_sequence_loss] + accumulator_names)
      print('loss = ', fetched[0], 'per sequence loss = ', fetched[1])
      return fetched

    loss_val, per_sequence_loss_val, global_steps_val, time_steps_val = (
        _FetchAndPrint())
    self.assertAllClose([3.567466, 15.0], loss_val)
    self.assertAllClose([13.762117, 10.278571, 10.660231, 18.811079],
                        per_sequence_loss_val)
    self.assertAllEqual([0, 0, 0, 0, 0], global_steps_val)
    self.assertAllEqual([2, 3, 4, 5, 6], time_steps_val)

    # Run another step to test global_step and time_step are incremented
    # correctly.
    sess.run(tf.assign_add(global_step, 1))
    loss_val, per_sequence_loss_val, global_steps_val, time_steps_val = (
        _FetchAndPrint())
    self.assertAllClose([3.56244, 15.0], loss_val)
    self.assertAllClose([14.180107, 10.391582, 10.460568, 18.40435],
                        per_sequence_loss_val)
    self.assertAllEqual([1, 1, 1, 1, 1], global_steps_val)
    self.assertAllEqual([2, 3, 4, 5, 6], time_steps_val)
def InputBatch(self):
  """Returns a fixed random batch, optionally alternating between two halves.

  Ids and labels are drawn with numpy after seeding it with 1, so they are
  the same on every call. When `self.params.split` is set, every field is
  split in two along axis 0 and the half is chosen by the parity of the
  global step (even -> first half, odd -> second half).

  Returns:
    A `py_utils.NestedMap` with `src.{ids, paddings}` and
    `tgt.{ids, labels, paddings, weights}`.
  """
  np.random.seed(1)
  bs, sl = 10, 7

  def _RandomIds():
    """Draws a [bs, sl] int32 id matrix from the seeded numpy RNG."""
    return tf.constant(
        np.random.randint(low=0, high=8192 - 1, size=[bs, sl], dtype=np.int32))

  def _PickByStepParity(halves):
    """Selects halves[0] on even global steps and halves[1] on odd ones."""
    is_even_step = tf.equal(tf.mod(py_utils.GetOrCreateGlobalStep(), 2), 0)
    return tf.cond(is_even_step, lambda: halves[0], lambda: halves[1])

  # The draw order (src ids, tgt ids, tgt labels) fixes the generated values.
  src_ids = _RandomIds()
  tgt_ids = _RandomIds()
  tgt_labels = _RandomIds()
  tgt_weights = tf.constant(np.ones(shape=[bs, sl], dtype=np.float32))
  src_paddings = tf.zeros([bs, sl])
  tgt_paddings = tf.zeros([bs, sl])

  ret = py_utils.NestedMap()
  ret.src = py_utils.NestedMap()
  ret.tgt = py_utils.NestedMap()

  if not self.params.split:
    ret.src.ids = src_ids
    ret.src.paddings = src_paddings
    ret.tgt.ids = tgt_ids
    ret.tgt.labels = tgt_labels
    ret.tgt.paddings = tgt_paddings
    ret.tgt.weights = tgt_weights
    return ret

  src_ids_halves = tf.split(src_ids, 2, 0)
  src_paddings_halves = tf.split(src_paddings, 2, 0)
  tgt_ids_halves = tf.split(tgt_ids, 2, 0)
  tgt_labels_halves = tf.split(tgt_labels, 2, 0)
  tgt_paddings_halves = tf.split(tgt_paddings, 2, 0)
  tgt_weights_halves = tf.split(tgt_weights, 2, 0)

  ret.src.ids = _PickByStepParity(src_ids_halves)
  ret.src.paddings = _PickByStepParity(src_paddings_halves)
  ret.tgt.ids = _PickByStepParity(tgt_ids_halves)
  ret.tgt.labels = _PickByStepParity(tgt_labels_halves)
  ret.tgt.paddings = _PickByStepParity(tgt_paddings_halves)
  ret.tgt.weights = _PickByStepParity(tgt_weights_halves)
  return ret
def testDeterministicDropoutInsideFunctionalWhile(self):
  """Checks the output of a one-cell pipeline containing a dropout layer.

  A [2, 3] tensor of ones is fed through a `PipeliningLayer` whose only cell
  holds a `DeterministicDropoutLayer` with keep_prob 0.7; the result is
  compared with a fixed mask scaled by 1 / 0.7.
  """
  kept = 1.0 / 0.7
  with self.session() as sess:
    dropout_params = DeterministicDropoutLayer.Params().Set(
        name='dropout', keep_prob=0.7)
    cell_params = FeatureExtractionLayer.Params().Set(
        name='cell', sub=[dropout_params])
    p = PipeliningLayer.Params().Set(name='pipe', cell_tpl=[cell_params])
    ones = tf.ones([2, 3], dtype=tf.float32)

    model = p.cls(p)
    output = model.FPropDefaultTheta(ones)
    py_utils.GetOrCreateGlobalStep()
    tf.global_variables_initializer().run()
    output_val = sess.run(output)

    expected = [
        [kept, kept, kept],
        [0.0, 0.0, kept],
    ]
    self.assertAllClose(expected, output_val)
    self.assertAllClose(5.7142859, np.sum(output_val))
def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
    self, splits=1, num_micro_batches=1):
  """Checks FProp values of a transparent GPipe stack built with embeddings.

  The encoder is run on vector inputs from `_random_inputs_vecs`, while the
  full FProp is run on id inputs from `_random_inputs_ids`. With
  `num_transparent_outputs = 3` the three encoder outputs are asserted to be
  equal to each other before being compared to the golden values.

  Args:
    splits: Forwarded to `self._TransformerParamsWithEmbeddings`.
    num_micro_batches: Forwarded to `self._TransformerParamsWithEmbeddings`.
  """
  # time = 2
  # `batch` must be bound here: it is passed to the input helpers and used to
  # replicate the expected rows below.
  batch = 4
  with self.session() as sess:
    params = self._TransformerParamsWithEmbeddings(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=3,
        num_encoder_layers=1)
    params.is_transparent = True
    params.num_transparent_outputs = 3
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)

    input_ids, id_paddings, tgt_inputs, tgt_paddings = (
        self._random_inputs_ids(batch=batch))
    inputs, paddings, _, _ = self._random_inputs_vecs(batch=batch)

    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    dec_output = xformer.FProp(xformer.theta, input_ids, id_paddings,
                               tgt_inputs, tgt_paddings)
    enc_out_1, enc_out_2, enc_out_3 = sess.run(enc_outputs)
    dec_out = sess.run(dec_output)
    self.assertAllClose(enc_out_1, enc_out_2)
    self.assertAllClose(enc_out_2, enc_out_3)
    # Each expected row is repeated `batch` times.
    self.assertAllClose(
        [[[0.68660116, 0.947429, 0.78953624, -1.20142817]] * batch,
         [[0.57919669, 1.12979364, 4.29336643, 0.45106331]] * batch],
        enc_out_1)
    self.assertAllClose(
        [[[-0.46651918, -1.62957835, 1.15657926, 1.08397353]] * batch,
         [[-0.34674695, -1.65999401, 1.08431196, 1.07384491]] * batch,
         [[-0.41073492, -1.60431314, 1.04607999, 1.08858371]] * batch],
        dec_out)
def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
  """Checks that pipelined dropout yields gradients equal to activations.

  Four deterministic dropout layers are distributed evenly over `splits`
  pipeline cells. The loss is the sum of the output, and the test asserts the
  gradient w.r.t. the all-ones weight `w` equals the output itself.

  Args:
    splits: Number of pipeline cells; must be 1, 2 or 4.
    num_micro_batches: Number of micro batches given to the pipeline.
  """
  assert splits in [1, 2, 4]
  with self.session() as sess:
    tf.set_random_seed(12345)
    num_layers = 4
    py_utils.GetOrCreateGlobalStep()

    # Build a model with 4 dropout layers.
    dropout_params = [
        DeterministicDropoutLayer.Params().Set(
            name='dropout_{}'.format(layer_idx), keep_prob=0.7)
        for layer_idx in range(num_layers)
    ]

    # Divide the model into splits partitions.
    layers_per_split = num_layers // splits
    cell_tpl = [
        FeatureExtractionLayer.Params().Set(
            name='cell_{}'.format(split_idx),
            sub=dropout_params[split_idx * layers_per_split:
                               (split_idx + 1) * layers_per_split])
        for split_idx in range(splits)
    ]

    # Parallelize partitions using pipeline.
    p = PipeliningLayer.Params().Set(
        name='pipeline',
        num_micro_batches=num_micro_batches,
        cell_tpl=cell_tpl)

    # Fake input
    x = tf.ones([2, 3])
    # Construct weights.
    w = tf.get_variable(
        'w', shape=[2, 3], initializer=tf.constant_initializer([[1] * 3] * 2))
    mdl = p.cls(p)
    y = mdl.FPropDefaultTheta(x * w)

    # Construct loss function such that gradients = final activation.
    loss = tf.reduce_sum(y)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    tf.global_variables_initializer().run()
    y_val = sess.run(y)
    grads_val = sess.run(grads)['w'][1]
    self.assertAllClose(y_val, grads_val)
def Apply(self, lr, var_grad):
  """Accumulates gradients and applies them once per `accum_steps` steps.

  Every call adds each gradient into a non-trainable 'grad_accumulator'
  variable created under the scope named after the variable's op. When
  `global_step % accum_steps == accum_steps - 1`, the wrapped optimizer is
  applied to the accumulators scaled by `1 / accum_steps`, after which the
  accumulators are reset to zero; on all other steps the returned op does
  nothing.

  Args:
    lr: Learning rate forwarded to the wrapped optimizer's `Apply`.
    var_grad: Structure of (variable, gradient) pairs supporting `Transform`
      and `Flatten`.

  Returns:
    The `tf.cond` op selecting between apply-and-reset and a no-op group.
  """
  p = self.params

  def _Accumulate(var_and_grad):
    """Adds the gradient into the variable's accumulator."""
    var, grad = var_and_grad
    with tf.variable_scope(var.op.name):
      accumulator_params = py_utils.WeightParams(
          var.get_shape(), py_utils.WeightInit.Constant(0.0),
          self.params.dtype)
      _, accumulator = py_utils.CreateVariable(
          'grad_accumulator', accumulator_params, trainable=False)
      accumulator = tf.assign_add(accumulator, grad)
    return var, accumulator

  var_grad = var_grad.Transform(_Accumulate)

  def _ApplyThenZero():
    """Applies the averaged accumulators, then clears them."""
    averaged = py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps)
    apply_op = self._opt.Apply(lr, averaged)
    # The reset must not run before the optimizer has consumed the values.
    with tf.control_dependencies([apply_op]):
      reset_ops = [
          tf.assign(acc, tf.zeros_like(acc)) for _, acc in var_grad.Flatten()
      ]
      return tf.group(*reset_ops)

  step_in_cycle = tf.mod(py_utils.GetOrCreateGlobalStep(), p.accum_steps)
  is_last_step_of_cycle = tf.equal(step_in_cycle, p.accum_steps - 1)
  return tf.cond(is_last_step_of_cycle, _ApplyThenZero,
                 lambda: tf.group(tf.no_op()))
def __init__(self, params):
  """Constructs the task: input child, bookkeeping fields and train children.

  Args:
    params: Task params; `params.cls` must be a subclass of `BaseTask`.
  """
  # NOTE(review): the original indentation was lost; the nesting of the
  # eval-input branch below is a reconstruction -- confirm against upstream.
  assert issubclass(params.cls, BaseTask)
  super(BaseTask, self).__init__(params)

  p = self.params

  if p.input:
    # TODO(zhifengc): Consider a simpler way to ensure the input
    # generator stops after one epoch.
    if p.is_eval and p.eval:
      # True when the input generator reads from files.
      seq_inp = issubclass(p.input.cls,
                           base_input_generator.BaseInputGeneratorFromFiles)
      if p.input.num_samples == 0:
        # Dataset size is unknown. Computes eval summary based on num_samples.
        assert p.eval.samples_per_summary > 0
      elif (p.eval.samples_per_summary == 0) or (p.input.num_samples <
                                                 p.eval.samples_per_summary):
        # If we know the dataset size and we want to evaluate the full
        # set, we need to coordinate the input generator to flush out
        # all samples so the evaler and decoder compute metrics on the
        # whole set for each summary step.
        if seq_inp:
          p.input.flush_every_n = p.input.num_samples
        p.eval.samples_per_summary = p.input.num_samples
      if seq_inp and p.input.num_batcher_threads > 1:
        tf.logging.warning('input.num_batcher_threads > 1 inside eval mode. '
                           'The input generator may not iterate over exactly '
                           'one epoch per run')

    # The input generator is created on the cluster's input device, outside
    # of all rewrites.
    with tf.device(
        self.cluster.input_device), py_utils.outside_all_rewrites():
      self.CreateChild('input', p.input)

  # Fields below start unset; nothing in this constructor fills them in.
  self._var_grads = None
  self._encoder = None
  self._online_encoder = None
  self._decoder = None

  self._total_examples = None
  self._total_nans_and_infs = None
  self._loss = None
  self._num_predictions = None
  self._train_op = None
  self._eval_metrics = {}
  self._trainer_verbose_tensors = {}

  # Create the gradient mask,
  self._per_input_gradient_mask = None
  self._shared_global_step = py_utils.GetOrCreateGlobalStep()
  tp = p.train
  # NOTE(review): when `p.train` is falsy, `_task_global_step` and
  # `_global_step` are never assigned here -- confirm callers tolerate that.
  if tp:
    if tp.task_global_step:
      # The task keeps its own step instead of the shared one.
      self._task_global_step = CreateTaskGlobalStep(p, p.name)
      self._global_step = self._task_global_step
    else:
      self._task_global_step = None
      self._global_step = self._shared_global_step
    if tp.grad_norm_tracker:
      with tf.variable_scope(p.name):
        self.CreateChild('grad_norm_tracker', tp.grad_norm_tracker)

    self.CreateChild('lr_schedule', tp.lr_schedule)
    self.CreateChild('optimizer', tp.optimizer)
  self._UpdateVnConfig()
def FProp(self, theta, *args):
  """Run multiple cells in different devices in a pipelining manner.

  In eval mode the before-layers and cells are simply applied one after
  another. Otherwise the inputs are split into `p.num_micro_batches` micro
  batches along `p.batch_dim`, fed through `recurrent.StackedRecurrent` with
  one cell function per cell, and the per-micro-batch outputs are reshaped
  back to the full batch.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    *args: Non-keyworded variable length argument list of input tensors.

  Returns:
    In eval mode, whatever the last applied layer's FProp returns. Otherwise a
    single output tensor when there is exactly one output, else a tuple of
    output tensors (entries may be None).
  """
  # NOTE(review): the original indentation was lost; block nesting below is a
  # reconstruction -- confirm against upstream.
  # TODO(huangyp): handle optional None inputs.
  p = self.params
  if p.is_eval:
    # No pipelining in eval: chain before-layers, then cells.
    outputs = _ToTuple(args)
    for (name, l) in self._before_layers:
      outputs = _ToTuple(outputs)
      outputs = l.FProp(theta[name], *outputs)
    for (name, l) in self._cells:
      outputs = _ToTuple(outputs)
      outputs = l.FProp(theta[name], *outputs)
    return outputs

  num_cells = len(p.cell_tpl)
  cluster = self.cluster

  # Compute shapes of input and output tensors.
  input_tenors = _ToTuple(args)
  mini_batch_size = input_tenors[0].get_shape().as_list()[p.batch_dim]
  input_dtype = input_tenors[0].dtype
  # Clamp the number of micro batches to the batch size. Note that this
  # writes back into the layer's params.
  if p.num_micro_batches > mini_batch_size:
    p.num_micro_batches = mini_batch_size
  micro_batch_size = mini_batch_size // p.num_micro_batches
  input_shapes = ()
  for input_tensor in input_tenors:
    if input_tensor is not None:
      input_shape = input_tensor.get_shape().as_list()
      input_shape[p.batch_dim] = micro_batch_size
      input_shapes += (tf.TensorShape(input_shape),)
    else:
      input_shapes += (None,)

  # state_shapes[i] is indexed below as the input shapes of cell i and
  # state_shapes[i + 1] as its output shapes.
  state_shapes = self._CalculateOutputShapes(input_shapes)

  def GetCellFn(i):
    """Get the ith feature extraction layer."""

    def CellFn(theta, state0, inputs):
      """A cell fn is executed inside of StackedRecurrent."""
      del state0
      frop_inputs = []
      for input_idx in range(len(state_shapes[i])):
        name = 's{}'.format(input_idx)
        if state_shapes[i][input_idx] is not None:
          inputs[name].set_shape(state_shapes[i][input_idx])
          frop_inputs.append(inputs[name])
        else:
          frop_inputs.append(None)

      with CellFnFropOpReplacementWrapper():
        tf.logging.info('cell {} input {}'.format(i, frop_inputs))
        # The micro-batch state is installed as the overwrite global step
        # before the cell runs.
        mb_tensor = inputs[_MICRO_BATCH_STATE_NAME]
        SetOverWriteGlobalStep(mb_tensor)
        _, cell = self._cells[i]
        outputs = cell.FProp(theta, *frop_inputs)

      state1 = py_utils.NestedMap()
      state1[_MICRO_BATCH_STATE_NAME] = mb_tensor
      outputs = _ToTuple(outputs)
      assert len(outputs) == len(state_shapes[i + 1])
      # None outputs are simply left out of the state.
      for output_idx in range(len(outputs)):
        if outputs[output_idx] is not None:
          name = 's{}'.format(output_idx)
          state1[name] = outputs[output_idx]
      return state1, py_utils.NestedMap()

    return CellFn

  cell_fns = []
  accumulator_layers = []
  thetas = []
  init_states = []
  devices = []
  for cell_idx in range(num_cells):
    cell_name, cell = self._cells[cell_idx]
    accumulator_layers.append(cell)
    cell_fns.append(GetCellFn(cell_idx))
    thetas.append(theta[cell_name])
    # Initial state: zero micro-batch id plus zeros for every non-None output.
    init_state = py_utils.NestedMap()
    init_state[_MICRO_BATCH_STATE_NAME] = tf.cast(0, dtype=input_dtype)
    for output_idx in range(len(state_shapes[cell_idx + 1])):
      name = 's{}'.format(output_idx)
      if state_shapes[cell_idx + 1][output_idx] is not None:
        init_state[name] = tf.zeros(
            state_shapes[cell_idx + 1][output_idx], dtype=input_dtype)
    init_states.append(init_state)
    devices.append(cluster.WorkerDeviceInModelSplit(cell_idx))

  cell_grads = [None] * num_cells
  cell_outs = [lambda x: x] * num_cells
  cell_out_grads = [lambda x: x] * num_cells

  with tf.device(devices[0]):
    previous = input_tenors
    for (name, l) in self._before_layers:
      previous = l.FProp(theta[name], *previous)
      previous = _ToTuple(previous)
    inputs = py_utils.NestedMap()
    gs_tensor = py_utils.GetOrCreateGlobalStep()
    # One id per micro batch: global_step * num_micro_batches + t.
    inputs[_MICRO_BATCH_STATE_NAME] = tf.stack([
        tf.cast(gs_tensor * p.num_micro_batches + t, dtype=input_dtype)
        for t in range(p.num_micro_batches)
    ])

    # TODO(huangyp, dehao): apply dehao's trick to reshape the input tensor
    # to [p.num_micro_batches, -1, 128].
    for output_idx, output_tenor in enumerate(previous):
      name = 's{}'.format(output_idx)
      if output_tenor is not None:
        # Split along the batch dim and stack the pieces on a new axis 0.
        output_tenor = tf.stack(
            tf.split(output_tenor, p.num_micro_batches, axis=p.batch_dim))
        inputs[name] = output_tenor

  output, _ = recurrent.StackedRecurrent(
      devices=devices,
      cell_fns=cell_fns,
      cell_grads=cell_grads,
      cell_outs=cell_outs,
      cell_out_grads=cell_out_grads,
      thetas=thetas,
      init_states=init_states,
      inputs=inputs,
      accumulator_layers=accumulator_layers,
      unused_acc_state=True)

  with tf.device(devices[-1]):
    output_tensors = []
    for output_idx in range(len(state_shapes[-1])):
      state_shape = state_shapes[-1][output_idx]
      if state_shape is None:
        output_tensors.append(None)
        continue
      output_name = 's{}'.format(output_idx)
      output_tensor = output[output_name]
      if p.batch_dim != 0:
        # Move the leading micro-batch axis to sit right before the batch
        # dim, so the reshape below merges the two.
        perm = list(range(1, p.batch_dim + 1)) + [0]
        perm += list(range(p.batch_dim + 1, len(state_shape) + 1))
        output_tensor = tf.transpose(output_tensor, perm=perm)
      # Restore the full batch size in the target shape (mutates in place).
      state_shape[p.batch_dim] *= p.num_micro_batches
      output_tensor = tf.reshape(output_tensor, state_shape)
      output_tensors.append(output_tensor)
    tf.logging.info('pipeline output = {}'.format(output_tensors))
    if len(output_tensors) == 1:
      return output_tensors[0]
    return tuple(output_tensors)
def testBPropWithAccumComparison(self):
  """Compares Accumulator(SGD) over two half batches with SGD on the full one.

  The model is trained for two steps on a split input with an `Accumulator`
  optimizer (accum_steps=2), and for one step on the unsplit input with plain
  SGD. The resulting softmax 'weight_0' values must agree within 1e-2.
  """

  def _SetDefaults(p):
    """Disables dropout/noise, fixes init and switches to plain SGD."""
    p.random_seed = 12345
    p.decoder.input_dropout_prob = 0.0
    merger_params = p.encoder.transformer_stack.transparent_merger_tpl
    merger_params.weighted_merger_dropout_prob = 0.0
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    for layer_params in base_layer.RecursiveFindLayerParams(p):
      # TODO(lepikhin): lp.dtype = dtype
      layer_params.params_init = py_utils.WeightInit.Gaussian(0.1, 12345)
      layer_params.vn = disable_vn
    train_params = p.train
    assert train_params.l2_regularizer_weight is None
    train_params.clip_gradient_norm_to_value = False
    train_params.grad_norm_to_clip_to_zero = False
    train_params.optimizer = optimizer.SGD.Params()
    train_params.learning_rate = 1e-2
    train_params.lr_schedule = (
        lr_schedule.ContinuousLearningRateSchedule.Params())
    for line in p.ToText().split('\n'):
      print(line)
    return p

  def _TrainAndReadSoftmaxWeight(split_input, accumulate, num_steps):
    """Trains a fresh model in a new graph and returns softmax 'weight_0'."""
    with self.session(use_gpu=False, graph=tf.Graph()) as sess:
      tf.set_random_seed(_TF_RANDOM_SEED)
      p = self._testParams()
      p.input = TestInputGenerator.Params()
      p.input.split = split_input
      p = _SetDefaults(p)
      if accumulate:
        p.train.optimizer = optimizer.Accumulator.Params().Set(
            accum_steps=2, optimizer_tpl=p.train.optimizer)
      mdl = p.cls(p)
      mdl.FPropDefaultTheta()
      mdl.BProp()
      loss = mdl.loss
      logp = mdl.eval_metrics['log_pplx'][0]
      tf.global_variables_initializer().run()
      for _ in range(num_steps):
        sess.run((py_utils.GetOrCreateGlobalStep(), loss, logp, mdl.train_op))
      return sess.run(mdl.dec.softmax.vars['weight_0'])

  expected = _TrainAndReadSoftmaxWeight(
      split_input=True, accumulate=True, num_steps=2)
  actual = _TrainAndReadSoftmaxWeight(
      split_input=False, accumulate=False, num_steps=1)
  self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)
def FPropTower(self, theta, input_batch):
  """Runs the language model on one batch and assembles the metrics dict.

  Besides the cross-entropy metrics, this optionally carries RNN state across
  batches (`p.contiguous`), adds an isometric penalty for
  `HRREmbeddingLayer` embeddings (`p.train.isometric > 0`), and adds annealed
  chunk losses (`p.lm.use_chunks`).

  Args:
    theta: Weights of this task; `theta.lm` is passed to the LM's FProp.
    input_batch: Batch providing `ids`, `paddings`, `labels`, `weights` and
      `word_count`; `chunk_ids` is read when `p.lm.use_chunks` and
      `take_last_state` when `p.contiguous`.

  Returns:
    A dict mapping metric names to 2-tuples (presumably (value, weight) --
    TODO confirm).
  """
  # NOTE(review): the original indentation was lost; block nesting below is a
  # reconstruction -- confirm against upstream.
  p = self.params
  chunk_ids = input_batch.chunk_ids if p.lm.use_chunks else None
  ids, paddings, labels_ids, weights, chunk_ids = self._TrimIfPossibleThenTranspose(
      input_batch.ids, input_batch.paddings, input_batch.labels,
      input_batch.weights, chunk_ids=chunk_ids)

  # NOTE(review): `seqlen` is not used anywhere below.
  seqlen = tf.shape(ids)[0]
  batch_size = tf.shape(ids)[1]
  zero_state = self.lm.zero_state(batch_size)

  with tf.name_scope('prepare_state'):
    if p.contiguous:
      # Start from the stored last state when the batch asks for it,
      # otherwise from the zero state.
      state0 = py_utils.NestedMap(rnn=[])
      for i in range(p.lm.rnns.num_layers):
        if p.is_eval:
          last_m = tf.reshape(self.theta['last_state_%d_m' %i], [p.batch_size, p.lm.emb.embedding_dim])
          last_c = tf.reshape(self.theta['last_state_%d_c' %i], [p.batch_size, p.lm.emb.embedding_dim])
        else:
          last_m = self.theta['last_state_%d_m' %i]
          last_c = self.theta['last_state_%d_c' %i]
        m = tf.cond(input_batch.take_last_state,
                    lambda: last_m,
                    lambda: zero_state.rnn[i].m)
        c = tf.cond(input_batch.take_last_state,
                    lambda: last_c,
                    lambda: zero_state.rnn[i].c)
        # c = tf.Print(c, [c])
        state0.rnn.append(py_utils.NestedMap(c=c, m=m))
    else:
      state0 = zero_state
  labels = py_utils.NestedMap(class_ids=labels_ids, class_weights=weights)
  xent_output, state1 = self.lm.FProp(theta.lm, ids, paddings, state0, labels=labels, chunk_ids=chunk_ids)

  # self.state1 = state1
  if p.contiguous:
    # Save the final RNN state into the 'last_state_*' variables; the grouped
    # assign op is exposed as `self.last_state_group_op`.
    assign_ops = list()
    for i in range(p.lm.rnns.num_layers):
      m = tf.reshape(state1.rnn[i].m, [1, p.batch_size, p.lm.emb.embedding_dim])
      c = tf.reshape(state1.rnn[i].c, [1, p.batch_size, p.lm.emb.embedding_dim])
      if not p.is_eval:
        state1.rnn[i].m = m
        state1.rnn[i].c = c
      assign_ops.append(tf.assign(self.vars['last_state_%i_m' %i], m))
      assign_ops.append(tf.assign(self.vars['last_state_%i_c' %i], c))
    self.last_state_group_op = tf.group(*assign_ops)

  # +1 to account for the end of sequence symbol.
  div = 2 if p.input.use_chunks else 1 # tags shouldn't be counted as words
  num_words = tf.cast(
      tf.reduce_sum(input_batch.word_count // div + tf.constant(1, dtype=tf.int32) * (1 - p.contiguous)),
      tf.float32)
  predicted_labels = tf.cast(xent_output.per_example_argmax, labels_ids.dtype)

  num_preds = xent_output.total_weight
  # Weighted accuracy; the 1e-4 keeps the division defined when num_preds is 0.
  mean_acc = tf.reduce_sum(
      tf.cast(tf.equal(labels_ids, predicted_labels), tf.float32) * weights) / (
          num_preds + 1e-4)
  if p.lm.emb.cls == layers.HRREmbeddingLayer:
    if p.train.isometric > 0.0:
      # Sum of squared deviations of several weight products from identity.
      isometric_constraint = 0.0
      nr = p.lm.emb.num_roles
      # TODO(jmluo) rearrange it to divide the code according to three modes
      if 'F' in theta.lm.emb:
        F_wm = theta.lm.emb.F
        nr, nf, d = F_wm.get_shape().as_list()
        # F2d leads to overspecification of parameters in F
        F2d = tf.reshape(F_wm, [nr * nf, d])
        diff = tf.matmul(F2d, tf.transpose(F2d)) - tf.eye(nr * nf)
        # diff = tf.matmul(F_wm, tf.transpose(F_wm, perm=[0, 2, 1])) - tf.eye(nf)
        isometric_constraint += tf.reduce_sum(diff**2)
      if 'A' in theta.lm:
        d = theta.lm.A.get_shape().as_list()[0]
        A = tf.reshape(theta.lm.A, [d, 2, d])
        A1 = A[:, 0]
        A2 = A[:, 1]
        # NOTE(review): this `diff` is computed but never added to the
        # constraint (the accumulation below is commented out).
        diff = tf.matmul(A1, tf.transpose(A2)) / 2
        # isometric_constraint += tf.reduce_sum(diff ** 2)
      if nr > 1 and 'r' in theta.lm.emb:
        r_wm = theta.lm.emb.r
        diff = tf.matmul(r_wm, tf.transpose(r_wm)) - tf.eye(nr)
        isometric_constraint += tf.reduce_sum(diff**2)
      if 'R' in theta.lm:
        R_wm = theta.lm.R
        diff = tf.matmul(R_wm, tf.transpose(R_wm)) - tf.eye(p.lm.num_sent_roles)
        isometric_constraint += tf.reduce_sum(diff**2)
      if p.lm.emb.mode == 'rs':
        assert 'rR' in theta.lm.emb
        rR = theta.lm.emb.rR
        diff = tf.matmul(rR, tf.transpose(rR)) - tf.eye(2)
        isometric_constraint += tf.reduce_sum(diff ** 2)

        rs_all = theta.lm.emb.rs.wm
        for rs in rs_all:
          rs = tf.reshape(rs, [-1, 2, 2])
          norm = tf.reduce_sum(rs ** 2, axis=-1)
          isometric_constraint += tf.reduce_sum((norm - 1.0) ** 2) + tf.reduce_sum((rs ** 2) * ((1 - rs) ** 2))
          normalized_rs = tf.nn.l2_normalize(rs, axis=-1)
          dot = tf.matmul(normalized_rs, tf.transpose(normalized_rs, perm=[0, 2, 1]))
          # Only off-diagonal entries of the 2x2 products are penalised.
          isometric_constraint += tf.reduce_sum(((dot * (tf.ones([2, 2]) - tf.eye(2))) ** 2) * 0.5)
        tf.summary.histogram('rs', tf.stack(rs_all))
      isometric_loss = isometric_constraint * p.train.isometric

  if p.lm.use_chunks:# and not p.is_eval:
    with tf.name_scope('global_decode'):
      assert p.lm.num_sent_roles > 0
      total_chunk_loss = -tf.reduce_sum(xent_output.chunk_log_probs)
      avg_chunk_loss = total_chunk_loss / xent_output.num_chunks
      global_step = tf.to_float(py_utils.GetOrCreateGlobalStep())
      # Ramps linearly from 0 to 1 over `chunk_loss_anneal` steps.
      temperature = tf.minimum(tf.constant(p.train.chunk_loss_anneal), global_step) / p.train.chunk_loss_anneal
      tf.summary.scalar('chunk/temperature', temperature)
      annealed_total_chunk_loss = temperature * total_chunk_loss
      annealed_avg_chunk_loss = temperature * avg_chunk_loss
      chunk_loss = annealed_avg_chunk_loss

  loss = xent_output.avg_xent
  if p.train.sum_loss_across_tokens_in_batch:
    loss = xent_output.total_xent
    # `locals()` is probed because `chunk_loss` only exists when the
    # use_chunks branch above ran.
    if 'chunk_loss' in locals():
      chunk_loss = annealed_total_chunk_loss

  metrics = {
      'fraction_of_correct_next_step_preds': (mean_acc, num_preds),
      'log_pplx': (xent_output.avg_xent, num_preds),
      'log_pplx_per_word': (xent_output.total_xent / num_words, num_words),
      'num_predictions': (num_preds, 1),
      'num_words': (num_words, 1)
  }
  #tmp_loss = loss# + theta.dummy * theta.dummy
  # The optional metrics are added only if the corresponding local was bound.
  if 'isometric_loss' in locals():
    #tmp_loss += isometric_loss
    metrics['isometric'] = (isometric_loss, 1)
  if 'chunk_loss' in locals():
    #tmp_loss += chunk_loss
    metrics['chunk_loss'] = (chunk_loss, 1)
    metrics['annealed_total_chunk_loss'] = (annealed_total_chunk_loss, 1)
    metrics['annealed_avg_chunk_loss'] = (annealed_avg_chunk_loss, xent_output.num_chunks)
    metrics['total_chunk_loss'] = (total_chunk_loss, 1)
    metrics['avg_chunk_loss'] = (avg_chunk_loss, xent_output.num_chunks)
    metrics['num_chunks'] = (xent_output.num_chunks, 1)
  #metrics['loss'] = (tmp_loss, num_preds)
  if p.train.sum_loss_across_tokens_in_batch:
    metrics['loss'] = (loss, 1)
  else:
    metrics['loss'] = (loss, num_preds)
  metrics['batch_size'] = (tf.cast(batch_size, tf.float32), 1)

  return metrics
def testAccumulator(self):
  # Compares two ways of consuming the same pair of input batches:
  #   - a reference graph where plain SGD applies the gradients of both
  #     batches, each scaled by 1/2;
  #   - a graph where Accumulator(SGD) with accum_steps=2 is run for two
  #     steps, one batch per step.
  # The test expects both to end with the same variable values.
  np.random.seed(12345)
  batch_a = np.random.normal(0.1, 0.5, [2, 4, 3])
  np.random.seed(12346)
  batch_b = np.random.normal(0.1, 0.5, [2, 4, 3])

  learning_rate = 1e-1

  def _ProjectionParams():
    # Both graphs build their layer from these identical, explicitly seeded
    # params; the test asserts below that the initial variables match.
    return layers.ProjectionLayer.Params().Set(
        name='proj',
        dtype=tf.float64,
        input_dim=3,
        output_dim=2,
        params_init=py_utils.WeightInit.Gaussian(0.01, 123456),
        is_eval=False,
        batch_norm=False)

  # ---- Reference graph: plain SGD over two half-weighted gradients. ----
  sgd_graph = tf.Graph()
  with sgd_graph.as_default():
    tf.set_random_seed(123456)
    sgd_proj = layers.ProjectionLayer(_ProjectionParams())
    x_a = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    pad_a = tf.zeros([2, 4, 1], dtype=tf.float64)
    x_b = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    pad_b = tf.zeros([2, 4, 1], dtype=tf.float64)
    out_a = sgd_proj.FPropDefaultTheta(x_a, pad_a)
    out_b = sgd_proj.FPropDefaultTheta(x_b, pad_b)
    loss_a = tf.reduce_sum(out_a)
    loss_b = tf.reduce_sum(out_b)
    grads_a = py_utils.ComputeGradients(loss_a, sgd_proj.vars)
    grads_b = py_utils.ComputeGradients(loss_b, sgd_proj.vars)
    sgd_params = optimizer.SGD.Params().Set(add_summary=False)
    sgd = sgd_params.cls(sgd_params)
    # The second update is chained behind the first one, so fetching
    # update_b alone triggers both updates.
    with tf.control_dependencies([loss_a, loss_b]):
      update_a = sgd.Apply(learning_rate,
                           py_utils.ApplyGradMultiplier(grads_a, 0.5))
      with tf.control_dependencies([update_a]):
        update_b = sgd.Apply(learning_rate,
                             py_utils.ApplyGradMultiplier(grads_b, 0.5))
    sgd_init = tf.global_variables_initializer()

  with self.session(use_gpu=True, graph=sgd_graph) as sess:
    sess.run(sgd_init)
    sgd_vars_before = sess.run(sgd_proj.vars.Flatten())
    both_batches = {x_a: batch_a, x_b: batch_b}
    sgd_loss_a, sgd_grads_a, sgd_loss_b, sgd_grads_b = sess.run(
        [loss_a, grads_a, loss_b, grads_b], feed_dict=both_batches)
    sess.run([update_b], feed_dict=both_batches)
    sgd_vars_after = sess.run(sgd_proj.vars.Flatten())

  # ---- Graph under test: Accumulator(SGD) with accum_steps=2. ----
  accum_graph = tf.Graph()
  with accum_graph.as_default():
    tf.set_random_seed(123456)
    acc_proj = layers.ProjectionLayer(_ProjectionParams())
    acc_pad = tf.zeros([2, 4, 1], dtype=tf.float64)
    acc_x = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    acc_out = acc_proj.FPropDefaultTheta(acc_x, acc_pad)
    acc_loss = tf.reduce_sum(acc_out)
    acc_grads = py_utils.ComputeGradients(acc_loss, acc_proj.vars)
    accum_params = optimizer.Accumulator.Params().Set(
        accum_steps=2,
        dtype=tf.float64,
        optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
    accumulator = accum_params.cls(accum_params)
    accum_update = accumulator.Apply(learning_rate, acc_grads)
    accum_init = tf.global_variables_initializer()
    step_var = py_utils.GetOrCreateGlobalStep()
    bump_step = tf.assign_add(step_var, 1)

  with self.session(use_gpu=True, graph=accum_graph) as sess:

    def _FirstGradAccumulator():
      # Current value of the first global variable whose name contains
      # 'grad_accumulator'.
      return sess.run([
          v for v in tf.global_variables() if 'grad_accumulator' in v.name
      ])[0]

    sess.run(accum_init)
    acc_vars_before, unused_step = sess.run(
        [acc_proj.vars.Flatten(), step_var])
    acc_loss_a, acc_grads_a = sess.run(
        [acc_loss, acc_grads], feed_dict={acc_x: batch_a})
    acc_loss_b, acc_grads_b = sess.run(
        [acc_loss, acc_grads], feed_dict={acc_x: batch_b})
    accum_at_start = _FirstGradAccumulator()
    # First accumulator step, fed the first batch.
    sess.run([accum_update], feed_dict={acc_x: batch_a})
    accum_after_first = _FirstGradAccumulator()
    acc_vars_mid = sess.run(acc_proj.vars.Flatten())
    # The global step is advanced by hand between the two update runs.
    sess.run(bump_step)
    # Second accumulator step, fed the second batch.
    sess.run([accum_update], feed_dict={acc_x: batch_b})
    accum_after_second = _FirstGradAccumulator()
    acc_vars_after = sess.run(acc_proj.vars.Flatten())

  # Both graphs start from the same variable values.
  self.assertAllClose(sgd_vars_before, acc_vars_before)

  # Accumulator contents: zero at the start, the first batch's gradient
  # after one step, zero again after the second step.
  self.assertAllClose(accum_at_start, np.zeros_like(accum_at_start))
  self.assertAllClose(accum_after_first, acc_grads_a['w'][1])
  self.assertAllClose(accum_after_second, np.zeros_like(accum_at_start))

  # Losses and gradients agree between the two graphs for each batch.
  self.assertAllClose(sgd_loss_a, acc_loss_a)
  self.assertAllClose(sgd_loss_b, acc_loss_b)
  self.assertAllClose(sgd_grads_a, acc_grads_a)
  self.assertAllClose(sgd_grads_b, acc_grads_b)

  # The first accumulator step is expected to leave the variables at their
  # initial values.
  self.assertAllClose(sgd_vars_before, acc_vars_mid)

  # Entry 0 of each fetched 'w' pair matches the initial variable value.
  self.assertAllClose(acc_vars_before[0], acc_grads_a['w'][0])
  self.assertAllClose(acc_vars_before[0], acc_grads_b['w'][0])

  # In each graph the final value equals the initial value minus
  # learning_rate times the mean of the two gradients.
  sgd_grad_sum = sgd_grads_a['w'][1] + sgd_grads_b['w'][1]
  self.assertAllClose(
      sgd_vars_before[0] - 0.5 * learning_rate * sgd_grad_sum,
      sgd_vars_after[0])
  acc_grad_sum = acc_grads_a['w'][1] + acc_grads_b['w'][1]
  self.assertAllClose(
      acc_vars_before[0] - 0.5 * learning_rate * acc_grad_sum,
      acc_vars_after[0])

  self.assertAllClose(acc_vars_before, acc_vars_mid)
  # End result: accumulated SGD matches the reference SGD.
  self.assertAllClose(sgd_vars_after, acc_vars_after)