def _update_distributed_as_chief(self, version_step=None):
    """ Builds the chief's synchronized update: applies the gradients, enqueues one sync token per worker,
        then refreshes the local step from a dequeued token (and optionally bumps the model version).

        :param version_step: Optional. A variable that represents the model's version (incremented by 1)
        :return: The update operation to run

        Note: This method is called by the chief when synchronization is required.
    """
    global_step = self._global_step

    # The token queue is created on the global step's device, inside an empty name scope
    with ops.device(global_step.device), ops.name_scope(''):
        self._sync_token_queue = data_flow_ops.FIFOQueue(capacity=-1,
                                                         dtypes=global_step.dtype.base_dtype,
                                                         shapes=(),
                                                         name='sync_token_q',
                                                         shared_name='sync_token_q')
    token_queue = self._sync_token_queue

    # Tokens (each holding the global step) are only enqueued once the gradients have been applied
    with ops.control_dependencies([self._apply_grad_op]):
        worker_tokens = gen_array_ops.fill([self._num_workers], global_step)
        enqueue_op = token_queue.enqueue_many((worker_tokens, ))

    # Sync point - A token is taken back from the queue and becomes the new local step
    with ops.control_dependencies([enqueue_op]):
        dequeued_token = token_queue.dequeue()
        grouped_ops = [state_ops.assign(self._local_step, dequeued_token)]

        # The version step is optional
        if version_step is not None:
            grouped_ops.append(state_ops.assign_add(version_step, 1))

        return control_flow_ops.group(*grouped_ops)
def __init__(self, cell, order_embedding, candidate_embedding, candidates, sequence_length, initial_state,
             beam_width, input_layer=None, output_layer=None, time_major=False):
    """ Sets up the CustomBeamHelper (beam search over per-step candidates)

        :param cell: An `RNNCell` instance.
        :param order_embedding: The order embedding vector - Size: (batch, ord_emb_size)
        :param candidate_embedding: The candidate embedding vector - Size: (batch, cand_emb_size)
        :param candidates: The candidates at each time step -- Size: (batch, nb_cand, max_candidates)
        :param sequence_length: The length of each sequence (batch,)
        :param initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
        :param beam_width: Python integer, the number of beams.
        :param input_layer: Optional. A layer to apply on the inputs
        :param output_layer: Optional. An instance of `tf.layers.Layer`, i.e., `tf.layers.Dense`. Optional layer
                             to apply to the RNN output prior to storing the result or sampling.
        :param time_major: If true indicates that the first dimension is time, otherwise it is batch size.
        :raises ValueError: If sequence_length is not a vector.
    """
    # pylint: disable=super-init-not-called,too-many-arguments
    rnn_cell_impl.assert_like_rnncell('cell', cell)         # pylint: disable=protected-access
    assert isinstance(beam_width, int), 'beam_width should be a Python integer'

    # The sequence lengths must be a rank-1 tensor
    seq_length_tensor = ops.convert_to_tensor(sequence_length, name='sequence_length')
    self._sequence_length = seq_length_tensor
    if seq_length_tensor.get_shape().ndims != 1:
        raise ValueError("Expected vector for sequence_length. Shape: %s" % seq_length_tensor.get_shape())

    # Batch-major candidates are transposed before being used
    candidates = ops.convert_to_tensor(candidates, name='candidates')
    if not time_major:
        candidates = nest.map_structure(_transpose_batch_time, candidates)

    self._cell = cell
    self._order_embedding_fn = _get_embedding_fn(order_embedding)
    self._candidate_embedding_fn = _get_embedding_fn(candidate_embedding)
    self._candidate_tas = nest.map_structure(_unstack_ta, candidates)
    self._output_layer = output_layer

    # The input size is the order embedding size, unless an input layer was provided
    embedding_size = order_embedding.shape[-1]
    if input_layer is None:
        self._input_layer = lambda x: x
        self._input_size = embedding_size
    else:
        self._input_layer = input_layer
        self._input_size = input_layer.compute_output_shape([None, embedding_size])[-1]

    self._batch_size = array_ops.size(sequence_length)
    self._start_tokens = gen_array_ops.fill([self._batch_size * beam_width], GO_ID)
    self._end_token = -1
    self._beam_width = beam_width
    self._initial_cell_state = nest.map_structure(self._maybe_split_batch_beams,
                                                  initial_state,
                                                  self._cell.state_size)

    # Only the beam at index 0 of each batch item starts as not finished
    first_beam_ix = array_ops.zeros([self._batch_size], dtype=dtypes.int32)
    self._finished = array_ops.one_hot(first_beam_ix,
                                       depth=self._beam_width,
                                       on_value=False,
                                       off_value=True,
                                       dtype=dtypes.bool)

    # Compute input shape - Zeros shaped like the inputs built from the start tokens and first candidates
    start_inputs = self._input_layer(self._order_embedding_fn(self._start_tokens))
    zero_inputs = array_ops.zeros_like(self._split_batch_beams(start_inputs, self._input_size))
    zero_candidates = array_ops.zeros_like(candidates[0, :])
    zero_candidates_emb = array_ops.zeros_like(self._candidate_embedding_fn(candidates[0, :]))
    self._zero_inputs = CandidateInputs(inputs=zero_inputs,
                                        candidates=zero_candidates,
                                        candidates_emb=zero_candidates_emb)
def __init__(self, decoder_type, inputs, order_embedding, candidate_embedding, sequence_length, candidates,
             input_layer=None, time_major=False, softmax_temperature=None, seed=None, name=None):
    """ Sets up the helper (decoder inputs, per-step candidates, and sampling settings)

        :param decoder_type: An uint8 representing TRAINING_DECODER, GREEDY_DECODER, or SAMPLE_DECODER
        :param inputs: The decoder input (b, dec_len)
        :param order_embedding: The order embedding vector
        :param candidate_embedding: The candidate embedding vector
        :param sequence_length: The length of each input (b,)
        :param candidates: The candidates at each time step -- Size: (b, nb_cand, max_candidates)
        :param input_layer: Optional. A layer to apply on the inputs
        :param time_major: If true indicates that the first dimension is time, otherwise it is batch size
        :param softmax_temperature: Optional. Softmax temperature. None, scalar, or size: (batch_size,)
        :param seed: Optional. The sampling seed
        :param name: Optional scope name.
        :raises ValueError: If sequence_length is not a vector.
    """
    # pylint: disable=too-many-arguments
    with ops.name_scope(name, "CustomHelper", [inputs, sequence_length, order_embedding, candidate_embedding]):
        inputs = ops.convert_to_tensor(inputs, name="inputs")
        candidates = ops.convert_to_tensor(candidates, name="candidates")
        self._inputs = inputs
        self._order_embedding_fn = _get_embedding_fn(order_embedding)
        self._candidate_embedding_fn = _get_embedding_fn(candidate_embedding)

        # Batch-major tensors are transposed before being unstacked
        if not time_major:
            inputs = nest.map_structure(_transpose_batch_time, inputs)
            candidates = nest.map_structure(_transpose_batch_time, candidates)
        self._input_tas = nest.map_structure(_unstack_ta, inputs)
        self._candidate_tas = nest.map_structure(_unstack_ta, candidates)
        self._decoder_type = decoder_type

        # The sequence lengths must be a rank-1 tensor
        seq_length_tensor = ops.convert_to_tensor(sequence_length, name="sequence_length")
        self._sequence_length = seq_length_tensor
        if seq_length_tensor.get_shape().ndims != 1:
            raise ValueError("Expected vector for sequence_length. Shape: %s" % seq_length_tensor.get_shape())

        # Without an input layer, the inputs are used as-is
        if input_layer is None:
            self._input_layer = lambda x: x
        else:
            self._input_layer = input_layer
        self._batch_size = array_ops.size(sequence_length)
        self._start_inputs = gen_array_ops.fill([self._batch_size], GO_ID)
        self._softmax_temperature = softmax_temperature
        self._seed = seed

        # Compute input shape - Zeros shaped like the inputs built from the start tokens and first candidates
        zero_inputs = array_ops.zeros_like(self._input_layer(self._order_embedding_fn(self._start_inputs)))
        zero_candidates = array_ops.zeros_like(candidates[0, :])
        zero_candidates_emb = array_ops.zeros_like(self._candidate_embedding_fn(candidates[0, :]))
        self._zero_inputs = CandidateInputs(inputs=zero_inputs,
                                            candidates=zero_candidates,
                                            candidates_emb=zero_candidates_emb)

        # The temperature is floored at 1e-10 to prevent a division by zero
        # A vector temperature gets an extra dim, so it can broadcast with the outputs shape
        if softmax_temperature is not None:
            temperature = gen_math_ops.maximum(1e-10, softmax_temperature)
            if temperature.get_shape().ndims == 1:
                temperature = temperature[:, None]
            self._softmax_temperature = temperature
def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
    """ Internal while_loop body - Runs one decoder step and returns the updated loop variables. """
    step_outputs, step_state, step_inputs, step_finished = decoder.step(time, inputs, state)

    # A decoder that tracks its own finished flags is trusted as-is
    if decoder.tracks_own_finished:
        next_finished = step_finished
    else:
        next_finished = gen_math_ops.logical_or(step_finished, finished)

    # Entries that were not finished before this step now have a length of time + 1
    still_running = gen_math_ops.logical_not(finished)
    current_lengths = gen_array_ops.fill(array_ops.shape(sequence_lengths), time + 1)
    next_sequence_lengths = array_ops.where(still_running, current_lengths, sequence_lengths)

    nest.assert_same_structure(state, step_state)
    nest.assert_same_structure(outputs_ta, step_outputs)
    nest.assert_same_structure(inputs, step_inputs)

    def _zero_if_finished(out, zero):
        """ Replaces the output with zeros for entries that were already finished """
        return array_ops.where(finished, zero, out)

    def _maybe_copy_state(new, cur):
        """ Keeps the current state for finished entries, when the state can be selected per entry """
        # TensorArrays, multiple dynamic dims, and scalar states get passed through.
        if isinstance(cur, tensor_array_ops.TensorArray):
            return new
        if None in new.shape.as_list()[1:]:
            return new
        new.set_shape(cur.shape)
        if new.shape.ndims == 0:
            return new
        return array_ops.where(finished, cur, new)

    def _write_output(output_ta, out):
        """ Stores the output of the current time step """
        return output_ta.write(time, out)

    # Zero out output values past finish, and copy through states past finish
    if impute_finished:
        emit = nest.map_structure(_zero_if_finished, step_outputs, zero_outputs)
        next_state = nest.map_structure(_maybe_copy_state, step_state, state)
    else:
        emit = step_outputs
        next_state = step_state

    outputs_ta = nest.map_structure(_write_output, outputs_ta, emit)
    return (time + 1, outputs_ta, next_state, step_inputs, next_finished, next_sequence_lengths)
def __init__(self, cell, embedding, mask, sequence_length, initial_state, beam_width, input_layer=None,
             output_layer=None, time_major=False):
    """ Sets up the CustomBeamHelper (beam search with a mask applied at each time step)

        :param cell: An `RNNCell` instance.
        :param embedding: The embedding vector
        :param mask: [SparseTensor] Mask to apply at each time step -- Size: (b, dec_len, vocab_size, vocab_size)
        :param sequence_length: The length of each input (b,)
        :param initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
        :param beam_width: Python integer, the number of beams.
        :param input_layer: Optional. A layer to apply on the inputs
        :param output_layer: Optional. An instance of `tf.layers.Layer`, i.e., `tf.layers.Dense`. Optional layer
                             to apply to the RNN output prior to storing the result or sampling.
        :param time_major: If true indicates that the first dimension is time, otherwise it is batch size.
        :raises ValueError: If sequence_length is not a vector.
    """
    # pylint: disable=super-init-not-called,too-many-arguments
    rnn_cell_impl.assert_like_rnncell('cell', cell)         # pylint: disable=protected-access
    assert isinstance(mask, SparseTensor), 'The mask must be a SparseTensor'
    assert isinstance(beam_width, int), 'beam_width should be a Python integer'

    # The sequence lengths must be a rank-1 tensor
    seq_length_tensor = ops.convert_to_tensor(sequence_length, name='sequence_length')
    self._sequence_length = seq_length_tensor
    if seq_length_tensor.get_shape().ndims != 1:
        raise ValueError("Expected vector for sequence_length. Shape: %s" % seq_length_tensor.get_shape())

    self._cell = cell
    self._embedding_fn = _get_embedding_fn(embedding)
    self._mask = mask
    self._time_major = time_major
    self.vocab_size = VOCABULARY_SIZE
    self._output_layer = output_layer

    # The input size is the embedding size, unless an input layer was provided
    embedding_size = embedding.shape[-1]
    if input_layer is None:
        self._input_layer = lambda x: x
        self._input_size = embedding_size
    else:
        self._input_layer = input_layer
        self._input_size = input_layer.compute_output_shape([None, embedding_size])[-1]

    self._batch_size = array_ops.size(sequence_length)
    self._start_tokens = gen_array_ops.fill([self._batch_size * beam_width], GO_ID)
    self._end_token = -1
    self._beam_width = beam_width
    self._initial_cell_state = nest.map_structure(self._maybe_split_batch_beams,
                                                  initial_state,
                                                  self._cell.state_size)

    # Only the beam at index 0 of each batch item starts as not finished
    first_beam_ix = array_ops.zeros([self._batch_size], dtype=dtypes.int32)
    self._finished = array_ops.one_hot(first_beam_ix,
                                       depth=self._beam_width,
                                       on_value=False,
                                       off_value=True,
                                       dtype=dtypes.bool)

    # zero_mask is (batch, beam, vocab_size)
    # It is the mask slice for GO_ID at the first time step, repeated for every beam
    first_step_mask = _slice_mask(self._mask,
                                  slicing=[-1, 0, GO_ID, -1],
                                  squeeze=True,
                                  time_major=self._time_major)
    self._zero_mask = first_step_mask
    self._zero_mask = gen_array_ops.tile(array_ops.expand_dims(first_step_mask, axis=1),
                                         [1, self._beam_width, 1])

    # Zeros shaped like the inputs built from the start tokens
    start_inputs = self._input_layer(self._embedding_fn(self._start_tokens))
    zero_inputs = array_ops.zeros_like(self._split_batch_beams(start_inputs, self._input_size))
    self._zero_inputs = MaskedInputs(inputs=zero_inputs, mask=self._zero_mask)
def training():
    """ Training / teacher forcing branch - Returns a vector of -1 with one entry per row of `outputs` """
    nb_rows = array_ops.shape(outputs)[0]
    filled_ids = gen_array_ops.fill([nb_rows], -1)

    # The returned tensor is an identity of the fill op, created under a control dependency on it
    with ops.control_dependencies([filled_ids]):
        selected_ids = array_ops.identity(filled_ids)
    return selected_ids
def __init__(self, decoder_type, inputs, embedding, sequence_length, mask, input_layer=None, time_major=False,
             softmax_temperature=None, seed=None, name=None):
    """ Sets up the helper (decoder inputs, per-step mask, and sampling settings)

        :param decoder_type: An uint8 representing TRAINING_DECODER, GREEDY_DECODER, or SAMPLE_DECODER
        :param inputs: The decoder input (b, dec_len)
        :param embedding: The embedding vector
        :param sequence_length: The length of each input (b,)
        :param mask: [SparseTensor] Mask to apply at each time step -- Size: (b, dec_len, vocab_size, vocab_size)
        :param input_layer: Optional. A layer to apply on the inputs
        :param time_major: If true indicates that the first dimension is time, otherwise it is batch size
        :param softmax_temperature: Optional. Softmax temperature. None or size: (batch_size,)
        :param seed: Optional. The sampling seed
        :param name: Optional scope name.
        :raises ValueError: If sequence_length is not a vector.
    """
    # pylint: disable=too-many-arguments
    with ops.name_scope(name, "CustomHelper", [inputs, sequence_length, embedding]):
        assert isinstance(mask, SparseTensor), 'The mask must be a SparseTensor'
        inputs = ops.convert_to_tensor(inputs, name="inputs")
        self._inputs = inputs
        self._mask = mask
        self._time_major = time_major

        # A callable embedding is used directly, otherwise ids are looked up in the embedding
        if callable(embedding):
            self._embedding_fn = embedding
        else:
            self._embedding_fn = lambda ids: embedding_lookup(embedding, ids)

        # Batch-major inputs are transposed before being unstacked
        if not time_major:
            inputs = nest.map_structure(_transpose_batch_time, inputs)
        self._input_tas = nest.map_structure(_unstack_ta, inputs)
        self._decoder_type = decoder_type

        # The sequence lengths must be a rank-1 tensor
        seq_length_tensor = ops.convert_to_tensor(sequence_length, name="sequence_length")
        self._sequence_length = seq_length_tensor
        if seq_length_tensor.get_shape().ndims != 1:
            raise ValueError("Expected vector for sequence_length. Shape: %s" % seq_length_tensor.get_shape())

        # An input layer that is not callable is ignored, and the inputs are used as-is
        if callable(input_layer):
            self._input_layer = input_layer
        else:
            self._input_layer = lambda x: x
        self._batch_size = array_ops.size(sequence_length)
        self._start_inputs = gen_array_ops.fill([self._batch_size], GO_ID)
        self._softmax_temperature = softmax_temperature
        self._seed = seed
        self.vocab_size = VOCABULARY_SIZE

        # Zeros shaped like the inputs built from the start tokens, with the mask slice for GO_ID at step 0
        zero_inputs = array_ops.zeros_like(self._input_layer(self._embedding_fn(self._start_inputs)))
        first_step_mask = _slice_mask(self._mask,
                                      slicing=[-1, 0, GO_ID, -1],
                                      squeeze=True,
                                      time_major=self._time_major)
        self._zero_inputs = MaskedInputs(inputs=zero_inputs, mask=first_step_mask)

        # The temperature is floored at 1e-10 to prevent a division by zero
        # A vector temperature gets an extra dim, so it can broadcast with the outputs shape
        if softmax_temperature is not None:
            temperature = gen_math_ops.maximum(1e-10, softmax_temperature)
            if temperature.get_shape().ndims == 1:
                temperature = temperature[:, None]
            self._softmax_temperature = temperature
def _step(self, inputs, past_attns, time, feeder_cell, feeder_state):
    """ Runs the inputs through the n-layers blocks (position embedding, optional feeder conditioning, blocks, norm)

        :param inputs: The tensor inputs (embedding of each word) - [batch, seq_len, emb_size]
        :param past_attns: The past attentions - [batch, nb_layers, 2, nb_heads, past_length, emb_size // nb_heads]
        :param time: A tensor representing the current time step
        :param feeder_cell: None or A feeder cell that returns a RNN cell output to use for conditioning
        :param feeder_state: None or the initial state of the feeder cell
        :return: A tuple consisting of:
                    1) The cell outputs - [batch, seq_len, emb_size]
                    2) The present attention - [batch, nb_layers, 2, nb_heads, seq_len, emb_size // nb_heads]
                    3) The new state of the feeder cell
    """
    with variable_scope.variable_scope(self._scope, default_name='step'):
        nb_past_steps = array_ops.shape(past_attns)[-2]     # How many past attention steps we have
        seq_len = array_ops.shape(inputs)[-2]               # How many steps are we computing for the current time
        emb_size = inputs.shape[-1].value                   # The size of the embedding
        assert emb_size == self._emb_size, 'Expected an embedding size of %d' % self._emb_size

        # 1) The inputs are already the word embedding of each token
        assert inputs.shape.ndims == 3, 'Expected [batch, seq_len, emb_size]'       # [bz, seq, emb]
        out_h = inputs

        # 2) Computing the position embedding of each token
        # If we know the context was padded, the effective past length is the context length + nb of time steps
        if self._past_seq_lengths is None:
            past_offsets = gen_array_ops.fill([self._batch_size, 1], value=nb_past_steps)       # [bz, 1]
        else:
            past_offsets = gen_math_ops.minimum(nb_past_steps,
                                                self._past_seq_lengths + time)[:, None]         # [bz, 1]
        step_offsets = math_ops.range(seq_len)[None, :]                                         # [1, seq_len]
        token_positions = gen_math_ops.add(past_offsets, step_offsets)                          # [batch, seq_len]

        # Positions are capped at the last index of the position embedding
        token_positions = gen_math_ops.minimum(self._position_emb_size - 1, token_positions)    # [batch, seq_len]
        position_emb = self._position_embedding_fn(token_positions)                             # [bz, seq, emb]
        out_h = out_h + position_emb

        # 3) If we have a feeder cell, we also need to condition 'h' on it.
        next_feeder_state = feeder_state
        if feeder_cell is not None:
            assert feeder_state is not None, 'A feeder state is required if a feeder cell is provided.'
            assert inputs.shape[1].value == 1, 'The seq dimension must be 1 to use a feeder_cell'
            squeezed_inputs = array_ops.squeeze(inputs, axis=1)
            feeder_outputs, next_feeder_state = feeder_cell(squeezed_inputs, feeder_state)
            h_feed = feeder_outputs                                                             # [bz, feeder_sz]

            # The feeder output is projected when its size differs from the embedding size
            if feeder_outputs.shape[-1].value != emb_size:
                projection = core.Dense(emb_size, activation=None, name='h_feed')
                h_feed = projection(h_feed)                                                     # [bz, emb]
            h_feed = gen_array_ops.tile(h_feed[:, None, :], [1, seq_len, 1])                    # [bz, seq, emb]
            out_h = out_h + h_feed

        # Transformer - Each layer receives its own slice of the past attentions
        layer_pasts = array_ops.unstack(past_attns, axis=1)     # list of [batch, 2, heads, past_len, head_sz]
        assert len(layer_pasts) == self._nb_layers, \
            'Expected the past attention to have %d layers.' % self._nb_layers
        layer_presents = []
        for layer_ix, layer_past in enumerate(layer_pasts):
            out_h, layer_present = self._block(out_h, layer_past, 'layer.%d' % layer_ix)
            layer_presents.append(layer_present)
        presents = array_ops.stack(layer_presents, axis=1)

        # Normalizing and returning
        cell_outputs = self._norm(out_h, 'norm_h')                                              # [batch, seq, emb]
        return cell_outputs, presents, next_feeder_state