def ndlstm_base_unrolled(inputs, noutput, scope=None, reverse=False):
    """Apply a statically unrolled 1D LSTM along the leading axis.

    A single `BasicLSTMCell` is built and applied once per step; variable
    reuse is switched on after the first step so all steps share weights.

    Args:
      inputs: input sequence (length, batch_size, ninput)
      noutput: depth of output
      scope: optional scope name
      reverse: run LSTM in reverse

    Returns:
      Output sequence (length, batch_size, noutput)
    """
    with variable_scope.variable_scope(scope, "SeqLstmUnrolled", [inputs]):
        length, batch_size, _ = _shape(inputs)
        cell = core_rnn_cell_impl.BasicLSTMCell(noutput, state_is_tuple=False)
        state = array_ops.zeros([batch_size, cell.state_size])
        steps = array_ops.unstack(inputs)
        if reverse:
            steps = steps[::-1]
        step_outputs = []
        for t in xrange(length):
            if t > 0:
                variable_scope.get_variable_scope().reuse_variables()
            step_output, state = cell(steps[t], state)
            step_outputs.append(step_output)
        if reverse:
            # Restore the original time order of the outputs.
            step_outputs = step_outputs[::-1]
        return array_ops.stack(step_outputs)
def __call__(self, inputs, state, scope=None):
    """Embed every input feature, concatenate the results and run the cell.

    One embedding table named "embedding<i>" is created per entry of
    `self._embedding_classes`; `inputs[i]` is flattened and looked up in
    table `i`. The looked-up embeddings are concatenated along axis 1 and
    passed to the wrapped cell together with `state`.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper2"
        with ops.device("/cpu:0"):
            initializer = (self._initializer
                           or vs.get_variable_scope().initializer)
            if not initializer:
                # Default initializer for embeddings should have variance=1:
                # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                bound = math.sqrt(3)
                initializer = init_ops.random_uniform_initializer(-bound, bound)
            tables = [
                vs.get_variable(
                    "embedding" + str(idx),
                    [num_classes, self._embedding_sizes[idx]],
                    initializer=initializer)
                for idx, num_classes in enumerate(self._embedding_classes)
            ]
            embedded = [
                embedding_ops.embedding_lookup(
                    table, array_ops.reshape(inputs[idx], [-1]))
                for idx, table in enumerate(tables)
            ]
            finalEmbedded = tf.concat(1, embedded)
    return self._cell(finalEmbedded, state)
def rnn_decoder(decoder_inputs, initial_state, cell, softmax_w, softmax_b,
                scope=None):
    """Build a teacher-forced ("train") and a self-fed ("valid") decoder.

    Both decoders share `cell` and start from `initial_state`. The train
    decoder consumes `decoder_inputs` at every step. The valid decoder
    shares the first step with the train decoder and afterwards feeds
    `matmul(previous_valid_output, softmax_w) + softmax_b` back as input.

    Returns:
      (outputs_train, state_train, outputs_valid, state_valid), where the
      outputs are per-step lists and the states are the final states.
    """
    # Currently only supports Mean Squared Error. Cross Entropy would need
    # the linear activation replaced by an argmax over the logits.
    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state_train = state_valid = initial_state
        outputs_train, outputs_valid = [], []
        for step, inp in enumerate(decoder_inputs):
            if step > 0:
                variable_scope.get_variable_scope().reuse_variables()
            output_train, state_train = cell(inp, state_train)
            outputs_train.append(output_train)
            if step == 0:
                # Both decoders are fed decoder_inputs[0] on the first step,
                # so the valid decoder simply shares the train result.
                output_valid, state_valid = output_train, state_train
            else:
                # From the second step on, the valid decoder is driven by
                # the projection of its own previous output.
                fed_back = tf.matmul(outputs_valid[-1], softmax_w) + softmax_b
                output_valid, state_valid = cell(fed_back, state_valid)
            outputs_valid.append(output_valid)
    return outputs_train, state_train, outputs_valid, state_valid
def testReturnsExistingConcatenatedValueIfReuse(self):
    """A second get_variable under reuse returns the same partitioned value."""
    with variable_scope.variable_scope(
            "scope0", partitioner=axis0_into2_partitioner):
        first = variable_scope.get_variable("name0", shape=(3, 1, 1))
        variable_scope.get_variable_scope().reuse_variables()
        second = variable_scope.get_variable("name0", shape=(3, 1, 1))
        self.assertEqual(first, second)
def testAtrousFullyConvolutionalValues(self):
    """Verify dense feature extraction with atrous convolution."""
    nominal_stride = 32
    for output_stride in [4, 8, 16, 32, None]:
        with arg_scope(resnet_utils.resnet_arg_scope()):
            with ops.Graph().as_default(), self.test_session() as sess:
                random_seed.set_random_seed(0)
                inputs = create_test_input(2, 81, 81, 3)
                # Dense feature extraction followed by subsampling.
                dense, _ = self._resnet_small(
                    inputs,
                    None,
                    is_training=False,
                    global_pool=False,
                    output_stride=output_stride)
                factor = (1 if output_stride is None
                          else nominal_stride // output_stride)
                output = resnet_utils.subsample(dense, factor)
                # Make the two networks use the same weights.
                variable_scope.get_variable_scope().reuse_variables()
                # Feature extraction at the nominal network rate.
                expected, _ = self._resnet_small(
                    inputs, None, is_training=False, global_pool=False)
                sess.run(variables.global_variables_initializer())
                self.assertAllClose(
                    output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
def __call__(self, inputs, state, scope=None):
    """Look up embeddings for `inputs` and run the wrapped cell on them.

    The embedding table is created (or reused) under the wrapper's scope on
    the CPU, with the dtype taken from `state`. `inputs` is flattened before
    the lookup.
    """
    with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
        with ops.device("/cpu:0"):
            initializer = (self._initializer
                           or vs.get_variable_scope().initializer)
            if not initializer:
                # Default initializer for embeddings should have variance=1:
                # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                bound = math.sqrt(3)
                initializer = init_ops.random_uniform_initializer(-bound, bound)

            # Exact type check on purpose: only a plain tuple is indexed.
            data_type = state[0].dtype if type(state) is tuple else state.dtype

            table = vs.get_variable(
                "embedding",
                [self._embedding_classes, self._embedding_size],
                initializer=initializer,
                dtype=data_type)
            flat_ids = array_ops.reshape(inputs, [-1])
            embedded = embedding_ops.embedding_lookup(table, flat_ids)
    return self._cell(embedded, state)
def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs.

    If the wrapper was given an embedding (`self._embedding`), it is used
    as the lookup table. Otherwise an "embedding" variable of shape
    [self._embedding_classes, self._cell.input_size] is created (or reused)
    on the CPU. `inputs` is flattened before the lookup, and the looked-up
    vectors are passed to the wrapped cell together with `state`.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
        with ops.device("/cpu:0"):
            # Compare against None explicitly: truth-testing a tensor or an
            # array is either an error or ambiguous.
            if self._embedding is not None:
                embedding = self._embedding
            else:
                if self._initializer:
                    initializer = self._initializer
                elif vs.get_variable_scope().initializer:
                    initializer = vs.get_variable_scope().initializer
                else:
                    # Default initializer for embeddings should have
                    # variance=1: Uniform(-sqrt(3), sqrt(3)) has variance=1.
                    sqrt3 = math.sqrt(3)
                    initializer = init_ops.random_uniform_initializer(
                        -sqrt3, sqrt3)
                embedding = vs.get_variable(
                    "embedding",
                    [self._embedding_classes, self._cell.input_size],
                    initializer=initializer)
            embedded = embedding_ops.embedding_lookup(
                embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)
def rnn_decoder(decoder_inputs, initial_state, cell, scope=None):
    """RNN Decoder that creates training and sampling sub-graphs.

    Args:
      decoder_inputs: Inputs for decoder, list of tensors.
        This is used only in training sub-graph.
      initial_state: Initial state for the decoder.
      cell: RNN cell to use for decoder.
      scope: Scope to use, if None new will be produced.

    Returns:
      List of tensors for outputs and states for training and sampling
      sub-graphs.
    """
    with vs.variable_scope(scope or "dnn_decoder"):
        states = [initial_state]
        sampling_states = [initial_state]
        outputs = []
        sampling_outputs = []

        # Training sub-graph: every step is fed the provided input.
        with ops.op_scope([decoder_inputs, initial_state], "training"):
            for step, inp in enumerate(decoder_inputs):
                if step > 0:
                    vs.get_variable_scope().reuse_variables()
                step_output, step_state = cell(inp, states[-1])
                outputs.append(step_output)
                states.append(step_state)

        # Sampling sub-graph: the first step copies the training results,
        # later steps feed the previous sampled output back into the cell.
        with ops.op_scope([initial_state], "sampling"):
            for step, _ in enumerate(decoder_inputs):
                if step > 0:
                    sampled, sampled_state = cell(
                        sampling_outputs[-1], sampling_states[-1])
                else:
                    sampled, sampled_state = outputs[step], states[step]
                sampling_outputs.append(sampled)
                sampling_states.append(sampled_state)
    return outputs, states, sampling_outputs, sampling_states
def _TestCreateOrGetQuantizationStep(self, use_resource):
    """Checks creation, lookup and incrementing of the quantization step."""
    graph = ops.Graph()
    with session.Session(graph=graph) as sess:
        variable_scope.get_variable_scope().set_use_resource(use_resource)
        step_tensor = common.CreateOrGetQuantizationStep()

        # Check that operations are added to the graph.
        op_count = len(graph.get_operations())
        self.assertGreater(op_count, 0)

        # Check that getting the quantization step doesn't change the graph.
        fetched_again = common.CreateOrGetQuantizationStep()
        self.assertEqual(step_tensor, fetched_again)
        self.assertEqual(op_count, len(graph.get_operations()))

        # Ensure that running the graph increments the quantization step.
        sess.run(variables.global_variables_initializer())
        self.assertEqual(sess.run(step_tensor), 1)

        # Ensure that even running a graph that depends on the quantization
        # step multiple times only executes it once.
        plus_one = step_tensor + 1
        combined = plus_one + step_tensor
        _, step_val = sess.run([combined, step_tensor])
        self.assertEqual(step_val, 2)
def testBasicLSTMCellStateTupleType(self):
    """State sizes and returned states of stacked LSTMs are LSTMStateTuples."""
    state_tuple = rnn_cell_impl.LSTMStateTuple
    with self.test_session():
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            x = array_ops.zeros([1, 2])
            m0 = (array_ops.zeros([1, 2]),) * 2
            m1 = (array_ops.zeros([1, 2]),) * 2
            cell = rnn_cell_impl.MultiRNNCell(
                [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
                state_is_tuple=True)
            self.assertIsInstance(cell.state_size, tuple)
            self.assertIsInstance(cell.state_size[0], state_tuple)
            self.assertIsInstance(cell.state_size[1], state_tuple)

            # Pass in regular tuples
            _, (out_m0, out_m1) = cell(x, (m0, m1))
            self.assertIsInstance(out_m0, state_tuple)
            self.assertIsInstance(out_m1, state_tuple)

            # Pass in LSTMStateTuples
            variable_scope.get_variable_scope().reuse_variables()
            zero_state = cell.zero_state(1, dtypes.float32)
            self.assertIsInstance(zero_state, tuple)
            self.assertIsInstance(zero_state[0], state_tuple)
            self.assertIsInstance(zero_state[1], state_tuple)
            _, (out_m0, out_m1) = cell(x, zero_state)
            self.assertIsInstance(out_m0, state_tuple)
            self.assertIsInstance(out_m1, state_tuple)
def testResidualWrapperWithSlice(self):
    """ResidualWrapper with a custom residual_fn that slices the input."""
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            x = array_ops.zeros([1, 5])
            m = array_ops.zeros([1, 3])
            base_cell = rnn_cell_impl.GRUCell(3)
            g, m_new = base_cell(x, m)
            variable_scope.get_variable_scope().reuse_variables()

            def residual_with_slice_fn(inp, out):
                # Keep only the first three input columns so the shapes match.
                return array_ops.slice(inp, [0, 0], [-1, 3]) + out

            wrapped_cell = rnn_cell_impl.ResidualWrapper(
                base_cell, residual_with_slice_fn)
            g_res, m_new_res = wrapped_cell(x, m)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                x: np.array([[1., 1., 1., 1., 1.]]),
                m: np.array([[0.1, 0.1, 0.1]]),
            }
            res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
                [g, g_res, m_new, m_new_res], feed)
            # Residual connections
            self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
            # States are left untouched
            self.assertAllClose(res_m_new, res_m_new_res)
def sequence_to_final(inputs, noutput, scope=None, name=None, reverse=False):
    """Run an LSTM across all steps and return only the last step's output.

    Args:
      inputs: (length, batch_size, depth) tensor
      noutput: size of output vector
      scope: optional scope name
      name: optional name for output tensor
      reverse: run in reverse

    Returns:
      Batch of size (batch_size, noutput).
    """
    with variable_scope.variable_scope(scope, "SequenceToFinal", [inputs]):
        length, batch_size, _ = _shape(inputs)
        cell = core_rnn_cell_impl.BasicLSTMCell(noutput, state_is_tuple=False)
        state = array_ops.zeros([batch_size, cell.state_size])
        steps = array_ops.unstack(inputs)
        if reverse:
            steps = steps[::-1]
        for t in xrange(length):
            if t > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # Only the output of the last processed step survives the loop.
            output, state = cell(steps[t], state)
        return array_ops.reshape(output, [batch_size, noutput], name=name)
def embed(self, func, embedding_classes, embedding_size, inputs, dtype=None,
          scope=None, keep_prob=1.0, initializer=None):
    """Embed each input and gate it, without running the RNN cell itself.

    For every element of `inputs` the embedder built by `func` is called,
    optional dropout is applied, and the result is turned into a weighted
    annotation `sigmoid(E + input_b) * tanh(E + annotation_b)`.

    Returns:
      (outputs, state): the list of weighted annotations and a zero state
      from `self._cell` sized by the batch dimension of `inputs[0]`.
    """
    embedder_cell = func(self._cell, embedding_classes, embedding_size,
                         initializer=initializer)

    # Like rnn(..) in rnn.py, but we call only the Embedder, not the RNN cell
    outputs = []
    with vs.variable_scope(scope or "Embedder") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        for step, step_input in enumerate(inputs):
            if step > 0:
                vs.get_variable_scope().reuse_variables()

            embedding = embedder_cell(step_input, scope)
            if keep_prob < 1:
                embedding = tf.nn.dropout(embedding, keep_prob)

            # annotation = C~_t = tanh ( E(x_t) + b_c)
            annotation_bias = tf.get_variable("annotation_b", [embedding_size])
            annotation = tanh(tf.nn.bias_add(embedding, annotation_bias))

            # weighted annotation = i_t * C~_t
            # i = sigmoid ( E(x_t) + b_i)
            gate_bias = tf.get_variable("input_b", [embedding_size])
            gate = sigmoid(tf.nn.bias_add(embedding, gate_bias))

            outputs.append(gate * annotation)

    # return empty state, will be initialized by decoder
    batch_size = array_ops.shape(inputs[0])[0]
    return (outputs, self._cell.zero_state(batch_size, dtype))
def testResidualWrapper(self):
    """ResidualWrapper adds the input to the output and keeps the state."""
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            x = array_ops.zeros([1, 3])
            m = array_ops.zeros([1, 3])
            base_cell = rnn_cell_impl.GRUCell(3)
            g, m_new = base_cell(x, m)
            variable_scope.get_variable_scope().reuse_variables()

            wrapper_object = rnn_cell_impl.ResidualWrapper(base_cell)
            # Exactly one checkpoint dependency is expected: the base cell.
            (name, dep), = wrapper_object._checkpoint_dependencies
            wrapper_object.get_config()  # Should not throw an error
            self.assertIs(dep, base_cell)
            self.assertEqual("cell", name)

            g_res, m_new_res = wrapper_object(x, m)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                x: np.array([[1., 1., 1.]]),
                m: np.array([[0.1, 0.1, 0.1]]),
            }
            res = sess.run([g, g_res, m_new, m_new_res], feed)
            # Residual connections
            self.assertAllClose(res[1], res[0] + [1., 1., 1.])
            # States are left untouched
            self.assertAllClose(res[2], res[3])
def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
    """Helper function for creating a slot variable.

    The slot is created with `variable_scope.get_variable` under the name
    `scope`, non-trainable, initialized from `val`, and as a resource
    variable iff `primary` is one. The current scope's partitioner is
    disabled while the slot is created and restored afterwards, even if
    creation fails. If `primary` carries save-slice info, matching
    save-slice info is attached to the slot.

    Returns:
      The created slot variable.
    """
    # TODO(lukaszkaiser): Consider allowing partitioners to be set in the
    # current scope.
    current_partitioner = variable_scope.get_variable_scope().partitioner
    variable_scope.get_variable_scope().set_partitioner(None)
    try:
        # When init from val instead of callable initializer, the shape is
        # expected to be None, not <unknown> or any fully defined shape.
        shape = shape if callable(val) else None
        slot = variable_scope.get_variable(
            scope,
            initializer=val,
            trainable=False,
            use_resource=resource_variable_ops.is_resource_variable(primary),
            shape=shape,
            dtype=dtype,
            validate_shape=validate_shape)
    finally:
        # Always restore the partitioner so a failed creation does not leave
        # the enclosing scope with partitioning silently disabled.
        variable_scope.get_variable_scope().set_partitioner(
            current_partitioner)

    # pylint: disable=protected-access
    if isinstance(primary, variables.Variable) and primary._save_slice_info:
        # Primary is a partitioned variable, so we need to also indicate that
        # the slot is a partitioned variable. Slots have the same partitioning
        # as their primaries.
        # For examples when using AdamOptimizer in linear model, slot.name
        # here can be "linear//weights/Adam:0", while primary.op.name is
        # "linear//weight". We want to get 'Adam' as real_slot_name, so we
        # remove "'linear//weight' + '/'" and ':0'.
        real_slot_name = slot.name[len(primary.op.name + "/"):-2]
        slice_info = primary._save_slice_info
        slot._set_save_slice_info(variables.Variable.SaveSliceInfo(
            slice_info.full_name + "/" + real_slot_name,
            slice_info.full_shape[:],
            slice_info.var_offset[:],
            slice_info.var_shape[:]))
    # pylint: enable=protected-access
    return slot
def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                     loop_function=None, dtype=dtypes.float32, scope=None):
    """RNN sequence-to-sequence model with tied encoder and decoder parameters.

    The encoder RNN is run over encoder_inputs and its final state seeds the
    decoder, which runs over decoder_inputs. Both are built under the same
    inner scope name with variable reuse enabled in between, so encoder and
    decoder share the same RNN cell parameters.

    Args:
      encoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
      decoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, this function will be applied to i-th output
        in order to generate i+1-th input, and decoder_inputs will be ignored,
        except for the first element ("GO" symbol), see rnn_decoder for
        details.
      dtype: The dtype of the initial state of the rnn cell
        (default: tf.float32).
      scope: VariableScope for the created subgraph;
        default: "tied_rnn_seq2seq".

    Returns:
      Whatever `rnn_decoder` returns for the decoder run; see rnn_decoder
      for the exact structure.
    """
    with variable_scope.variable_scope("combined_tied_rnn_seq2seq"):
        shared_scope = scope or "tied_rnn_seq2seq"
        _, enc_state = rnn.rnn(
            cell, encoder_inputs, dtype=dtype, scope=shared_scope)
        # From here on the decoder reuses the variables the encoder created.
        variable_scope.get_variable_scope().reuse_variables()
        return rnn_decoder(
            decoder_inputs,
            enc_state,
            cell,
            loop_function=loop_function,
            scope=shared_scope)
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None,output_projection=None, beam_size=10):
    """RNN decoder with beam search for the sequence-to-sequence model.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, this function will be applied to the i-th
        output in order to generate the i+1-st input, and decoder_inputs will
        be ignored, except for the first element ("GO" symbol).
        It is called as
        loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
          * prev is the previous cell output,
          * i is an integer, the step number,
          * log_beam_probs, beam_path, beam_symbols are lists owned by this
            function and handed to loop_function on every call; this function
            never appends to them itself, so they stay empty unless
            loop_function fills them.
      scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
      output_projection: Pair (W, B) applied to each cell output with
        xw_plus_b before taking the argmax. Required despite the None default.
      beam_size: Number of copies of the state stacked after the first step,
        and the width used to reshape the beam path/symbol tensors.

    Returns:
      A tuple (outputs, state, beam_path, beam_symbols), where:
        outputs: A list with one entry per decoder input holding the argmax
          (along dimension 1) of the projected cell output.
        state: The state after the final time-step.
        beam_path: concatenation of the beam_path list reshaped to
          [-1, beam_size].
        beam_symbols: concatenation of the beam_symbols list reshaped to
          [-1, beam_size].

    Raises:
      ValueError: if output_projection is None.
    """
    if output_projection is None:
        # The projection is indexed unconditionally below; fail early with a
        # clear message instead of a TypeError from inside the loop.
        raise ValueError("beam_rnn_decoder requires output_projection=(W, B).")

    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        outputs = []
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        state_size = int(initial_state.get_shape().with_rank(2)[1])

        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Keeps the rank-2 validation of the step input.
            inp.get_shape().with_rank(2)

            output, state = cell(inp, state)
            if loop_function is not None:
                prev = output
            if i == 0:
                # Stack beam_size copies of the first state along axis 0.
                state = tf.reshape(
                    tf.concat(0, [state] * beam_size), [-1, state_size])

            outputs.append(tf.argmax(
                nn_ops.xw_plus_b(
                    output, output_projection[0], output_projection[1]),
                dimension=1))
    return (outputs,
            state,
            tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
            tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
def testBowEncodersSharingEmbeddingsSharedScope(self):
    """Two bow encoders built in the same reused scope give equal outputs."""
    with self.cached_session() as sess:
        docs = [[0, 1], [2, 3]]
        first = encoders.bow_encoder(docs, 4, 3, scope='bow')
        variable_scope.get_variable_scope().reuse_variables()
        second = encoders.bow_encoder(docs, 4, 3, scope='bow')
        sess.run(variables.global_variables_initializer())
        avg_first, avg_second = sess.run([first, second])
        self.assertAllEqual(avg_first, avg_second)
def decoder(cell, dec_outputs, states, scope):
    """Run `seq2seq.rnn_decoder` once per initial state and flatten outputs.

    Every entry of `states` seeds a separate decoder run over `dec_outputs`
    with the same `cell`; variables are reused from the second run on.

    Returns:
      A flat list with the outputs of all runs, in order.
    """
    outputs = []
    with variable_scope.variable_scope(scope):
        for run, start_state in enumerate(states):
            if run > 0:
                variable_scope.get_variable_scope().reuse_variables()
            run_outputs, _ = seq2seq.rnn_decoder(dec_outputs, start_state, cell)
            outputs.extend(run_outputs)
    return outputs
def encoder(cell, inputs, n_steps, batch_size=1, dtype=tf.float32, scope=None):
    """Encode `inputs` in consecutive chunks of `n_steps` and collect states.

    Every chunk `inputs[i:i + n_steps]` is run through `rnn` with the same
    `cell`, each time starting from the cell's zero state. Variables are
    reused from the second chunk on.

    Args:
      cell: RNN cell passed to `rnn`; also provides the zero state.
      inputs: sequence of per-step inputs; sliced into chunks.
      n_steps: chunk length (the last chunk may be shorter).
      batch_size: batch size used for the zero state.
      dtype: dtype used for the zero state and passed to `rnn`.
      scope: variable scope name or object; defaults to "encoder".

    Returns:
      List with the final state of every chunk.
    """
    states = []
    # variable_scope rejects a None scope when no default name is given, so
    # fall back to a fixed name when the caller relies on the default.
    with variable_scope.variable_scope(scope or "encoder"):
        init_state = cell.zero_state(batch_size, dtype)
        for i in range(0, len(inputs), n_steps):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            _, state = rnn(cell, inputs[i: i + n_steps], init_state, dtype)
            states.append(state)
    return states
def my_rnn(alphabetEnc, cell, inputs, initial_state=None, dtype=None,
           sequence_length=None, scope=None):
    """Unrolled RNN whose cell receives `[input_, alphabetEnc[time]]` per step.

    Args:
      alphabetEnc: indexable by time step; `alphabetEnc[time]` is passed to
        the cell next to the regular input of that step.
      cell: an instance of RNNCell.
      inputs: non-empty list of per-step inputs.
      initial_state: optional initial state; if None, `cell.zero_state` is
        used and `dtype` is required.
      dtype: dtype of the zero state when `initial_state` is None.
      sequence_length: optional per-example lengths; when given, each step
        goes through `_rnn_step` instead of calling the cell directly.
      scope: variable scope; defaults to "RNN".

    Returns:
      (outputs, state): list of per-step outputs and the final state.

    Raises:
      TypeError: if `cell` is not an RNNCell or `inputs` is not a list.
      ValueError: if `inputs` is empty, or neither `initial_state` nor
        `dtype` is provided.
    """
    if not isinstance(cell, rnn_cell.RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    outputs = []
    with vs.variable_scope(scope or "RNN"):
        fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]
        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(inputs[0])[0]

        if initial_state is not None:
            state = initial_state
        else:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)

        # Decide once, by identity: after to_int32 the lengths are a Tensor,
        # and a Tensor must not be used as a Python bool.
        use_sequence_length = sequence_length is not None
        if use_sequence_length:
            sequence_length = math_ops.to_int32(sequence_length)
            # Prepare variables
            zero_output = array_ops.zeros(
                array_ops.pack([batch_size, cell.output_size]),
                inputs[0].dtype)
            zero_output.set_shape(
                tensor_shape.TensorShape(
                    [fixed_batch_size.value, cell.output_size]))
            min_sequence_length = math_ops.reduce_min(sequence_length)
            max_sequence_length = math_ops.reduce_max(sequence_length)

        for time, input_ in enumerate(inputs):
            if time > 0:
                vs.get_variable_scope().reuse_variables()
            # The lambda is only invoked within this iteration, so capturing
            # the loop variables is safe here.
            # pylint: disable=cell-var-from-loop
            call_cell = lambda: cell([input_, alphabetEnc[time]], state)
            # pylint: enable=cell-var-from-loop
            if use_sequence_length:
                (output, state) = _rnn_step(
                    time, sequence_length, min_sequence_length,
                    max_sequence_length, zero_output, state, call_cell)
            else:
                (output, state) = call_cell()
            outputs.append(output)
        return (outputs, state)
def testModelWithBucketsScopeAndLoss(self):
    """Test that variable scope reuse is not reset after model_with_buckets."""
    classes = 10
    buckets = [(4, 4), (8, 8)]
    num_steps = 8

    with self.test_session():

        # Here comes a sample Seq2Seq model using GRU cells.
        def SampleGRUSeq2Seq(enc_inp, dec_inp, weights, per_example_loss):
            """Example sequence-to-sequence model that uses GRU cells."""

            def GRUSeq2Seq(enc_inp, dec_inp):
                layers = [core_rnn_cell_impl.GRUCell(24) for _ in range(2)]
                cell = core_rnn_cell_impl.MultiRNNCell(
                    layers, state_is_tuple=True)
                return seq2seq_lib.embedding_attention_seq2seq(
                    enc_inp,
                    dec_inp,
                    cell,
                    num_encoder_symbols=classes,
                    num_decoder_symbols=classes,
                    embedding_size=24)

            # Targets are the decoder inputs shifted by one, padded with 0.
            targets = list(dec_inp[1:]) + [0]
            return seq2seq_lib.model_with_buckets(
                enc_inp,
                dec_inp,
                targets,
                weights,
                buckets,
                GRUSeq2Seq,
                per_example_loss=per_example_loss)

        # Now we construct the copy model.
        inp = [
            array_ops.placeholder(dtypes.int32, shape=[None])
            for _ in range(num_steps)
        ]
        out = [
            array_ops.placeholder(dtypes.int32, shape=[None])
            for _ in range(num_steps)
        ]
        weights = [
            array_ops.ones_like(inp[0], dtype=dtypes.float32)
            for _ in range(num_steps)
        ]
        with variable_scope.variable_scope("root"):
            _, losses1 = SampleGRUSeq2Seq(
                inp, out, weights, per_example_loss=False)
            # Now check that we did not accidentally set reuse.
            self.assertEqual(False, variable_scope.get_variable_scope().reuse)
            # Construct one more model with per-example loss.
            variable_scope.get_variable_scope().reuse_variables()
            _, losses2 = SampleGRUSeq2Seq(
                inp, out, weights, per_example_loss=True)
            # First loss is scalar, the second one is a 1-dimensional tensor.
            self.assertEqual([], losses1[0].get_shape().as_list())
            self.assertEqual([None], losses2[0].get_shape().as_list())
def testExceptions(self):
    """Calling the wrapped function under reuse raises a 'not found' error."""
    with self.test_session():
        x = constant_op.constant(self.dtype([0.1, 0.2]))
        externalized = variable_utils.externalize_variables_as_args(
            test_fn, [x], possible_ancestor_vars=[],
            assert_variable_override=True)
        wrapped_fn, _ = externalized
        varscope_ops.get_variable_scope().reuse_variables()
        with self.assertRaisesRegexp(ValueError, r"not found"):
            wrapped_fn(self.dtype(2))
def testBasic(self):
    """Compares three AdaMax steps against the numpy reference update."""
    for i, dtype in enumerate(self.float_types):
        with self.cached_session(), self.test_scope():
            variable_scope.get_variable_scope().set_use_resource(True)

            # Initialize variables for numpy implementation.
            m0 = v0 = m1 = v1 = 0.0
            var0_np = np.array([1.0, 2.0], dtype=dtype)
            grads0_np = np.array([0.1, 0.1], dtype=dtype)
            var1_np = np.array([3.0, 4.0], dtype=dtype)
            grads1_np = np.array([0.01, 0.01], dtype=dtype)

            var0 = resource_variable_ops.ResourceVariable(
                var0_np, name="var0_%d" % i)
            var1 = resource_variable_ops.ResourceVariable(
                var1_np, name="var1_%d" % i)
            grads0 = constant_op.constant(grads0_np)
            grads1 = constant_op.constant(grads1_np)

            opt = adamax.AdaMaxOptimizer()
            update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            opt_variables = opt.variables()
            beta1_power = opt._get_beta_accumulators()
            self.assertIsNotNone(beta1_power)
            self.assertIn(beta1_power, opt_variables)

            with ops.Graph().as_default():
                # Shouldn't return non-slot variables from other graphs.
                self.assertEqual(0, len(opt.variables()))

            variables.global_variables_initializer().run()

            # Fetch params to validate initial values
            self.assertAllClose([1.0, 2.0], self.evaluate(var0))
            self.assertAllClose([3.0, 4.0], self.evaluate(var1))

            beta1_power = opt._get_beta_accumulators()

            # Run 3 steps of AdaMax
            for t in range(1, 4):
                update.run()

                self.assertAllCloseAccordingToType(
                    0.9**(t + 1), self.evaluate(beta1_power))

                var0_np, m0, v0 = adamax_update_numpy(
                    var0_np, grads0_np, t, m0, v0)
                var1_np, m1, v1 = adamax_update_numpy(
                    var1_np, grads1_np, t, m1, v1)

                # Validate updated params
                self.assertAllCloseAccordingToType(
                    var0_np, self.evaluate(var0), rtol=1e-2)
                self.assertAllCloseAccordingToType(
                    var1_np, self.evaluate(var1), rtol=1e-2)

                slot_name = opt.get_slot(var=var0, name="m").name
                self.assertEqual("var0_%d/AdaMax:0" % (i,), slot_name)
def _maybe_get_unique(name):
    """Get name for a unique variable, if not `reuse=True`.

    Under reuse, or when `<scope name>/<name>` is not yet the op name of a
    global variable of the current scope, `name` is returned unchanged.
    Otherwise the first free `<name>_<idx>` (idx counting from 1) is returned.
    """
    scope = variable_scope.get_variable_scope()
    if scope.reuse:
        return name
    taken = {var.op.name for var in scope.global_variables()}
    qualified = scope.name + "/" + name
    if qualified not in taken:
        return name
    suffix = 1
    while "%s_%d" % (qualified, suffix) in taken:
        suffix += 1
    return "%s_%d" % (name, suffix)
def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output,
               new_state, first_call):
    """Propagates through all the cells in dim_indices dimensions.

    Results are written in place into `new_output` and `new_state`; the
    function itself returns None.
    """
    if len(dim_indices) == 0:
        return

    def _current(idx):
        # Prefer an output already produced in this pass, else the previous m.
        fresh = new_output[idx]
        return fresh if fresh is not None else m_prev[idx]

    # Because of the way RNNCells are implemented, we take the last dimension
    # (H_{N-1}) out and feed it as the state of the RNN cell
    # (in `last_dim_output`).
    # The input of the cell (H_0 to H_{N-2}) are concatenated into
    # `cell_inputs`
    if conf.num_dims > 1:
        ls_cell_inputs = [None] * (conf.num_dims - 1)
        for dim in conf.dims[:-1]:
            ls_cell_inputs[dim.idx] = _current(dim.idx)
        cell_inputs = array_ops.concat(ls_cell_inputs, 1)
    else:
        batch = m_prev[0].get_shape().as_list()[0]
        cell_inputs = array_ops.zeros([batch, 0], m_prev[0].dtype)

    last_dim_output = _current(-1)

    for i in dim_indices:
        d = conf.dims[i]
        # With tied weights, everything but the very first cell built on the
        # first call reuses the existing variables.
        reuse = conf.tied and not (first_call and i == dim_indices[0])
        if d.non_recurrent_fn:
            if conf.num_dims > 1:
                linear_args = array_ops.concat(
                    [cell_inputs, last_dim_output], 1)
            else:
                linear_args = last_dim_output
            scope_name = ('non_recurrent' if conf.tied
                          else 'non_recurrent/cell_{}'.format(i))
            with vs.variable_scope(scope_name):
                if reuse:
                    vs.get_variable_scope().reuse_variables()
                new_output[d.idx] = layers.legacy_fully_connected(
                    linear_args,
                    num_output_units=conf.num_units,
                    activation_fn=d.non_recurrent_fn,
                    weight_init=(vs.get_variable_scope().initializer
                                 or layers.initializers.xavier_initializer))
        else:
            if c_prev[i] is None:
                # for GRU/RNN, the state is just the previous output
                cell_state = last_dim_output
            else:
                cell_state = array_ops.concat([c_prev[i], last_dim_output], 1)
            scope_name = ('recurrent' if conf.tied
                          else 'recurrent/cell_{}'.format(i))
            with vs.variable_scope(scope_name):
                if reuse:
                    vs.get_variable_scope().reuse_variables()
                new_output[d.idx], new_state[d.idx] = cells[i](
                    cell_inputs, cell_state)
def dialog_attention_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size,
                             num_heads=1, output_projection=None,
                             feed_previous=False, dtype=dtypes.float32,
                             scope=None, initial_state_attention=False):
    """Attention seq2seq over a sequence of turns with a dialog-level RNN.

    For every turn i, `encoder_inputs[i]` is encoded with an embedding-wrapped
    `cell`, the final encoder state advances a dialog-level state (`cell`
    applied under the "DRNN" scope), `decoder_inputs[i]` is decoded with
    attention over the encoder outputs starting from that dialog state, and
    the decoder's final state advances the dialog state once more.

    Args:
      encoder_inputs: list (one entry per turn) of encoder input lists.
      decoder_inputs: list (one entry per turn) of decoder input lists; must
        have the same length as `encoder_inputs`.
      cell: RNN cell used for the encoder, the dialog-level step and (possibly
        wrapped with an output projection) the decoder.
      vocab_size: number of symbols; used for the embedding and, when
        `output_projection` is None, as the decoder output size.
      num_heads: passed through to `embedding_attention_decoder`.
      output_projection: passed through to `embedding_attention_decoder`; when
        None the decoder cell is wrapped in an OutputProjectionWrapper.
      feed_previous: passed through to `embedding_attention_decoder`.
      dtype: dtype of the initial states.
      scope: variable scope; defaults to "dialog_attention_seq2seq".
      initial_state_attention: passed through to
        `embedding_attention_decoder`.

    Returns:
      (outputs, drnn_state): the list of per-turn decoder outputs and the
      final dialog-level state.

    Raises:
      ValueError: if `encoder_inputs` and `decoder_inputs` differ in length.
    """
    if len(encoder_inputs) != len(decoder_inputs):
        raise ValueError(
            "encoder_inputs and decoder_inputs must have the same length, "
            "got %d and %d." % (len(encoder_inputs), len(decoder_inputs)))

    with variable_scope.variable_scope(scope or "dialog_attention_seq2seq"):
        encoder_cell = rnn_cell.EmbeddingWrapper(cell, vocab_size)

        # Build the decoder cell once. Rebinding `cell` inside the loop would
        # stack a new projection wrapper on every turn and would also feed the
        # wrapped cell into the dialog-level ("DRNN") step.
        decoder_cell = cell
        output_size = None
        if output_projection is None:
            decoder_cell = rnn_cell.OutputProjectionWrapper(cell, vocab_size)
            output_size = vocab_size

        outputs = []
        fixed_batch_size = (
            encoder_inputs[0][0].get_shape().with_rank_at_least(1)[0])
        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(encoder_inputs[0][0])[0]
        drnn_state = cell.zero_state(batch_size, dtype)

        for i in range(len(encoder_inputs)):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            encoder_outputs, encoder_state = rnn.rnn(
                encoder_cell, encoder_inputs[i], dtype=dtype)

            # First calculate a concatenation of encoder outputs to put
            # attention on.
            top_states = [
                array_ops.reshape(e, [-1, 1, cell.output_size])
                for e in encoder_outputs
            ]
            attention_states = array_ops.concat(1, top_states)

            with variable_scope.variable_scope("DRNN"):
                drnn_out, drnn_state = cell(encoder_state, drnn_state)

            # Decoder.
            answer_output, answer_state = embedding_attention_decoder(
                decoder_inputs[i],
                drnn_state,
                attention_states,
                decoder_cell,
                vocab_size,
                num_heads=num_heads,
                output_size=output_size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                initial_state_attention=initial_state_attention)
            outputs.append(answer_output)

            with variable_scope.variable_scope("DRNN", reuse=True):
                drnn_out, drnn_state = cell(answer_state, drnn_state)

    return outputs, drnn_state
def test_unique_name_and_reuse(self):
    """A second template with the same unique name reuses the variable."""
    first_template = template.make_template(
        "_", variable_scoped_function, unique_name_="s1")
    v1 = first_template()
    v2 = first_template()

    variable_scope.get_variable_scope().reuse_variables()
    second_template = template.make_template(
        "_", variable_scoped_function, unique_name_="s1")
    v3 = second_template()

    for other in (v2, v3):
        self.assertEqual(v1, other)
    self.assertEqual("s1/dummy:0", v1.name)
def _TestQuantize_AtrousConvWithBatchNorm(
        self, activation, activation_op_name, with_bypass, delay,
        fused_batch_norm, use_resource):
    """Tests quantization: inputs -> atrous conv with batch norm -> Activation.

    Args:
      activation: Callable that returns an Operation, a factory method for the
        Activation.
      activation_op_name: String, name of the Activation operation.
      with_bypass: Bool, when true there is an extra connection added from
        inputs to just before Activation.
      delay: Int (optional), delay in number of steps until quantization
        starts.
      fused_batch_norm: Bool, when true use FusedBatchNorm.
      use_resource: Bool, when true uses resource variables.
    """
    graph = ops.Graph()
    with graph.as_default():
        variable_scope.get_variable_scope().set_use_resource(use_resource)
        batch_size, height, width, depth = 5, 128, 128, 3
        inputs = array_ops.zeros((batch_size, height, width, depth))
        dilation_rate = 2
        if with_bypass:
            scope = 'test/test2'
        else:
            scope = 'test'
        conv_out = separable_conv2d(
            inputs,
            None, [3, 3],
            rate=dilation_rate,
            depth_multiplier=1.0,
            padding='SAME',
            weights_initializer=self._WeightInit(0.09),
            activation_fn=None,
            normalizer_fn=batch_norm,
            normalizer_params=self._BatchNormParams(fused_batch_norm),
            scope=scope)

        # Manually add a bypass (optional) and an activation.
        pre_activation = conv_out
        if with_bypass:
            pre_activation = math_ops.add(inputs, conv_out, name='test/Add')
        activated = activation(
            pre_activation, name='test/' + activation_op_name)

        update_barrier = control_flow_ops.no_op(name='update_barrier')
        with ops.control_dependencies([update_barrier]):
            array_ops.identity(activated, name='control_dependency')

        fold_batch_norms.FoldBatchNorms(graph, is_training=True)
        quantize.Quantize(graph, True, quant_delay=delay)

    self._AssertCorrectQuantizedGraphWithBatchNorm(
        graph, scope, 'DepthwiseConv2dNative', activation_op_name,
        with_bypass, delay, use_resource)
def build(self, input_shape):
    """Create beta/gamma and the moving statistics for `input_shape`.

    Args:
      input_shape: Anything accepted by `tensor_shape.TensorShape`.

    Raises:
      ValueError: If the rank is undefined, `self.axis` is out of range for
        that rank, or the dimension selected by `self.axis` is undefined.
    """
    input_shape = tensor_shape.TensorShape(input_shape)
    if not input_shape.ndims:
        raise ValueError('Input has undefined rank:', input_shape)

    ndim = len(input_shape)
    # Negative axes count from the end.
    axis = ndim + self.axis if self.axis < 0 else self.axis
    if not 0 <= axis < ndim:
        raise ValueError('Value of `axis` argument ' + str(self.axis) +
                         ' is out of range for input with rank ' + str(ndim))

    param_dim = input_shape[axis]
    if not param_dim.value:
        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
                         input_shape)
    param_shape = (param_dim,)

    self.beta = (
        vs.get_variable('beta',
                        shape=param_shape,
                        initializer=self.beta_initializer,
                        regularizer=self.beta_regularizer,
                        trainable=True)
        if self.center else None)
    self.gamma = (
        vs.get_variable('gamma',
                        shape=param_shape,
                        initializer=self.gamma_initializer,
                        regularizer=self.gamma_regularizer,
                        trainable=True)
        if self.scale else None)

    # Disable variable partitioning when creating the moving mean and
    # variance, and restore the previous partitioner afterwards even if
    # variable creation fails.
    current_scope = vs.get_variable_scope()
    saved_partitioner = current_scope.partitioner
    try:
        current_scope.set_partitioner(None)
        self.moving_mean = vs.get_variable(
            'moving_mean',
            shape=param_shape,
            initializer=self.moving_mean_initializer,
            trainable=False)
        self.moving_variance = vs.get_variable(
            'moving_variance',
            shape=param_shape,
            initializer=self.moving_variance_initializer,
            trainable=False)
    finally:
        current_scope.set_partitioner(saved_partitioner)
def train_mode(self, input_dim, input_states, input_mask, concept_dim,
               concept_states, concept_mask, init_state, decoder_inputs,
               decoder_refs, decoder_feats, decoder_wids, decoder_cids,
               loss_weights, mode_gen='ce_train'):
    '''Unroll the decoder for `options.max_answer_len` steps and build the loss.

    Shapes below are taken from the original author's notes and are not
    checked by this method.

    Args:
      input_dim: int; width of the zero-initialised input context vector.
      input_states: presumably [batch_size, passage_len, input_dim]; only
        its first dimension is read here (as the batch size).
      input_mask: presumably [batch_size, passage_len] int32; passed through
        to `one_step_decoder`.
      concept_dim: int; width of the zero-initialised concept context vector.
      concept_states: passed through to `one_step_decoder`.
      concept_mask: passed through to `one_step_decoder`.
      init_state: initial decoder state (author's note: tuple of
        [batch_size, gen_hidden_size]).
      decoder_inputs: [batch_size, max_dec_steps]; unstacked along axis 1.
      decoder_refs: [batch_size, max_dec_steps]; reference outputs for the
        loss and accuracy.
      decoder_feats: unstacked along axis 1 (author's notes disagree on
        whether each step is [batch_size] or [batch_size, feat_num]).
      decoder_wids: [batch_size, max_dec_steps]; unstacked along axis 1.
      decoder_cids: [batch_size, max_dec_steps]; unstacked along axis 1.
      loss_weights: weights forwarded to `_CE_loss` / `_mask_and_accuracy`.
      mode_gen: one of
        'ce_train' - feed the given decoder inputs at every step, build the
          cross-entropy loss and the accuracy;
        'loss'     - as 'ce_train' but the per-example loss is multiplied by
          `self.placeholders.reward` and no accuracy is computed;
        'greedy'   - feed the argmax of the previous step's scores;
        'sample'   - feed a sample drawn from the previous step's scores.

    Returns:
      (accuracy, loss, sampled_words). `accuracy` is None unless mode_gen is
      'ce_train'; `loss` (also stored on `self.loss`) is None for 'greedy'
      and 'sample'; `sampled_words` is the per-step word ids stacked along
      axis 1, or an empty list if no step was unrolled.

    Raises:
      AssertionError: if `mode_gen` is not one of the four modes above.
    '''
    # Validate before touching the graph. An explicit raise (rather than an
    # `assert` statement inside the loop) still fires under `python -O` and
    # when `max_answer_len` is 0.
    if mode_gen not in ('ce_train', 'loss', 'greedy', 'sample'):
        raise AssertionError('unknown generating mode %s' % mode_gen)
    teacher_forced = mode_gen in ('ce_train', 'loss')

    options = self.options
    batch_size = tf.shape(input_states)[0]

    # Each becomes a list with one tensor per decoding step.
    decoder_inputs = tf.unstack(decoder_inputs, axis=1)
    decoder_feats = tf.unstack(decoder_feats, axis=1)
    decoder_wids = tf.unstack(decoder_wids, axis=1)
    decoder_cids = tf.unstack(decoder_cids, axis=1)

    # Recurrent quantities carried from one step to the next.
    state_t_1 = init_state
    context_input_t_1 = tf.zeros([batch_size, input_dim])
    context_concept_t_1 = tf.zeros([batch_size, concept_dim])

    # Per-step results.
    vocab_scores = []
    sampled_words = []

    with variable_scope.variable_scope("attention_decoder"):
        wordidx_t = decoder_inputs[0]
        featidx_t = decoder_feats[0]
        wid_t = decoder_wids[0]
        cid_t = decoder_cids[0]
        for i in range(options.max_answer_len):
            if teacher_forced:
                # Feed the provided inputs for this step.
                wordidx_t = decoder_inputs[i]
                featidx_t = decoder_feats[i]
                wid_t = decoder_wids[i]
                cid_t = decoder_cids[i]
            word_t = _embedding_lookup(wordidx_t, self.action_embedding)
            if i > 0:
                # Later steps share the variables created by step 0.
                variable_scope.get_variable_scope().reuse_variables()

            (state_t, context_input_t, context_concept_t,
             output_t) = self.one_step_decoder(
                 state_t_1, context_input_t_1, context_concept_t_1, word_t,
                 featidx_t, wid_t, cid_t, input_states, input_mask,
                 concept_states, concept_mask)
            vocab_scores.append(output_t)

            state_t_1 = state_t
            context_input_t_1 = context_input_t
            context_concept_t_1 = context_concept_t

            if mode_gen == 'greedy':
                # TODO update featidx_t
                wordidx_t = tf.argmax(output_t, 1)
                wordidx_t = tf.reshape(wordidx_t, [-1])
            elif mode_gen == 'sample':
                # TODO update featidx_t
                log_score_t = tf.log(output_t)
                wordidx_t = tf.multinomial(log_score_t, 1)
                wordidx_t = tf.reshape(wordidx_t, [-1])
            else:
                # 'ce_train' / 'loss' (mode_gen was validated above).
                wordidx_t = tf.argmax(output_t, axis=1)
            sampled_words.append(wordidx_t)

    if len(sampled_words) != 0:
        sampled_words = tf.stack(sampled_words, axis=1)
    vocab_scores = tf.stack(vocab_scores, axis=1)

    # Loss is only defined when reference outputs are being followed.
    self.loss = None
    if teacher_forced:
        xent = _CE_loss(vocab_scores, decoder_refs, loss_weights)
        if mode_gen == 'loss':
            xent *= self.placeholders.reward  # multiply with rewards
        self.loss = tf.reduce_mean(xent)

    # Accuracy is calculated only under 'ce_train', where the true answer is
    # given.
    if mode_gen == 'ce_train':
        accuracy = _mask_and_accuracy(vocab_scores, decoder_refs,
                                      loss_weights)
        return accuracy, self.loss, sampled_words
    return None, self.loss, sampled_words
def embedding_lookup(
    params,
    ids,
    partition_strategy=None,  # pylint: disable=unused-argument
    name=None,
    validate_indices=None,  # pylint: disable=unused-argument
    max_norm=None,
    return_trainable=False,
):
    """Dynamic-embedding counterpart of `tf.nn.embedding_lookup`.

    A `TrainableWrapper` is created (or, in eager mode, fetched from
    `params._trainable_store` and re-pointed at the new ids) and its value
    is reshaped to the shape of `ids` followed by `params.dim`.

    Args:
      params: A `dynamic_embedding.Variable` instance, or a list/tuple
        holding exactly one.
      ids: A tensor of any shape whose dtype equals `params.key_dtype`.
      partition_strategy: Not used; accepted for API compatibility with
        `nn.embedding_lookup`.
      name: A name for the operation. Optional in graph mode, required in
        eager mode.
      validate_indices: Not used; accepted for API compatibility with
        `nn.embedding_lookup`.
      max_norm: Forwarded to `TrainableWrapper`.
      return_trainable: If True, also return the `TrainableWrapper`.

    Returns:
      A tensor with shape [shape of ids] + [params.dim]; when
      `return_trainable` is True, a `(tensor, trainable_wrapper)` pair.

    Raises:
      ValueError: If more than one `params` is given, or no `name` is given
        while executing eagerly.
      TypeError: If `params` is not a `de.Variable` or the key dtype does
        not match `ids.dtype`.
    """
    if isinstance(params, (list, tuple)):
        if len(params) > 1:
            raise ValueError("Only one params is allowed.")
        params = params[0]
    if not isinstance(params, de.Variable):
        raise TypeError("params should be a Variable instance.")
    if params.key_dtype != ids.dtype:
        raise TypeError(
            "params.key_dtype should be same with ids.dtype: {} vs. {}".format(
                params.key_dtype, ids.dtype))
    if name is None and context.executing_eagerly():
        raise ValueError(
            'Must specify a name for dynamic_embedding.embedding_lookup when running eagerly.'
        )

    # Name scope is "<current variable scope>/<name or embedding_lookup>/".
    current_scope = variable_scope.get_variable_scope()
    scope_prefix = current_scope.name + "/" if current_scope.name else ""
    scope_suffix = (name + "/") if name else "embedding_lookup/"
    with ops.name_scope(scope_prefix + scope_suffix):
        ids = ops.convert_to_tensor(ids, name="ids")
        static_ids_shape = ids.get_shape()
        if static_ids_shape.is_fully_defined():
            # use static shape
            initial_shape = [static_ids_shape.num_elements(), params.dim]
            embeddings_shape = ids.get_shape().concatenate([params.dim])
        else:
            # use dynamic shape
            initial_shape = (1, params.dim)
            embeddings_shape = array_ops.concat(
                [array_ops.shape(ids), [params.dim]], axis=0)

        initial_value = array_ops.zeros(shape=initial_shape,
                                        dtype=params.value_dtype)
        inside_function_graph = (
            isinstance(initial_value, ops.Tensor) and
            hasattr(initial_value, "graph") and
            initial_value.graph.building_function)
        if inside_function_graph:
            # Hand over a callable instead of the tensor built above.

            def _deferred_zeros():
                return array_ops.zeros(initial_shape, dtype=params.value_dtype)

            initial_value = _deferred_zeros

        with ops.colocate_with(None, ignore_existing=True):
            collections = [ops.GraphKeys.LOCAL_VARIABLES]
            if params.trainable:
                collections += [ops.GraphKeys.TRAINABLE_VARIABLES]

            def _create_trainable(trainable_name):
                return de.TrainableWrapper(params,
                                           ids,
                                           max_norm=max_norm,
                                           initial_value=initial_value,
                                           dtype=params.value_dtype,
                                           trainable=params.trainable,
                                           collections=collections,
                                           model_mode=ModelMode.CURRENT_SETTING,
                                           name=trainable_name)

            with ops.colocate_with(ids, ignore_existing=True):
                if not context.executing_eagerly():
                    trainable_ = _create_trainable(name)
                    params._trainable_store[name] = trainable_
                else:
                    # Eager mode keeps one wrapper per name and re-targets it.
                    trainable_ = params._trainable_store.get(name, None)
                    if trainable_ is not None:
                        trainable_._reset_ids(ids)
                    else:
                        trainable_ = _create_trainable(name)
                        params._trainable_store[name] = trainable_
                embeddings = array_ops.identity(trainable_)
                embeddings = array_ops.reshape(embeddings,
                                               shape=embeddings_shape)

    if return_trainable:
        return (embeddings, trainable_)
    return embeddings
def attention_decoder(decoder_inputs,
                      initial_state,
                      encoder_states,
                      enc_padding_mask,
                      cell,
                      initial_state_attention=False,
                      pointer_gen=True,
                      use_coverage=False,
                      prev_coverage=None):
    """Unroll an attention decoder over `decoder_inputs`.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: Initial decoder state passed to `cell`. When
        `pointer_gen` is True the state returned by `cell` must expose `.c`
        and `.h`, because the p_gen computation reads both.
      encoder_states: 3D Tensor [batch_size x attn_length x attn_size]. The
        batch size and attn_size must be statically known.
      enc_padding_mask: 2D Tensor [batch_size x attn_length] containing 1s
        and 0s; indicates which of the encoder locations are padding (0) or
        a real token (1).
      cell: rnn_cell.RNNCell defining the cell function and size.
      initial_state_attention: Note that this attention decoder passes each
        decoder input through a linear layer with the previous step's
        context vector to get a modified version of the input. If
        initial_state_attention is False, on the first decoder step the
        "previous context vector" is just a zero vector. If
        initial_state_attention is True, we use initial_state to
        (re)calculate the previous step's context vector. We set this to
        False for train/eval mode (because we call attention_decoder once
        for all decoder steps) and True for decode mode (because we call
        attention_decoder once for each decoder step).
      pointer_gen: boolean. If True, calculate the generation probability
        p_gen for each decoder step.
      use_coverage: boolean. If True, use coverage mechanism.
      prev_coverage: If not None, a tensor with shape (batch_size,
        attn_length). The previous step's coverage vector. This is only not
        None in decode mode when using coverage.

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x cell.output_size]. The output vectors.
      state: The final state of the decoder.
      attn_dists: A list containing tensors of shape
        (batch_size, attn_length). The attention distributions for each
        decoder step.
      p_gens: A list with one tensor of shape [batch_size, 1] per decoder
        step: the values of p_gen. Empty list if pointer_gen=False.
      coverage: Coverage vector on the last step computed, reshaped to
        (batch_size, attn_length). None if it was never initialised (no
        coverage and no prev_coverage).

    Raises:
      ValueError: If the size of a decoder input cannot be inferred from its
        static shape.
    """
    with variable_scope.variable_scope("attention_decoder") as scope:
        # If this line fails, it's because the batch size isn't defined.
        batch_size = encoder_states.get_shape()[0].value
        # If this line fails, it's because attn_size (the last dimension of
        # encoder_states) isn't defined.
        attn_size = encoder_states.get_shape()[2].value

        # Reshape encoder_states (need to insert a dim); now is shape
        # (batch_size, attn_len, 1, attn_size).
        encoder_states = tf.expand_dims(encoder_states, axis=2)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and
        # (W_s s_t). We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # Get the weight matrix W_h and apply it to each encoder state to get
        # (W_h h_i), the encoder features. The 1x1 convolution applies W_h at
        # every encoder position.
        W_h = variable_scope.get_variable(
            "W_h", [1, 1, attn_size, attention_vec_size])
        # shape (batch_size, attn_length, 1, attention_vec_size)
        encoder_features = nn_ops.conv2d(encoder_states, W_h, [1, 1, 1, 1],
                                         "SAME")

        # Get the weight vectors v and w_c (w_c is for coverage).
        v = variable_scope.get_variable("v", [attention_vec_size])
        if use_coverage:
            with variable_scope.variable_scope("coverage"):
                w_c = variable_scope.get_variable(
                    "w_c", [1, 1, 1, attention_vec_size])

        if prev_coverage is not None:  # for beam search mode with coverage
            # Reshape from (batch_size, attn_length) to
            # (batch_size, attn_len, 1, 1).
            prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3)

        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution.

            Args:
              decoder_state: state of the decoder
              coverage: Optional. Previous timestep's coverage vector, shape
                (batch_size, attn_len, 1, 1).

            Returns:
              context_vector: weighted sum of encoder_states
              attn_dist: attention distribution
              coverage: new coverage vector, shape
                (batch_size, attn_len, 1, 1); returned unchanged (possibly
                None) when use_coverage is False.
            """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is
                # W_s s_t + b_attn in the paper).
                # shape (batch_size, attention_vec_size)
                decoder_features = linear(decoder_state, attention_vec_size,
                                          True)
                # Reshape to (batch_size, 1, 1, attention_vec_size) so it
                # broadcasts against encoder_features.
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1), 1)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    # take softmax. shape (batch_size, attn_length)
                    attn_dist = nn_ops.softmax(e)
                    attn_dist *= enc_padding_mask  # apply mask
                    # shape (batch_size)
                    masked_sums = tf.reduce_sum(attn_dist, axis=1)
                    # re-normalize
                    return attn_dist / tf.reshape(masked_sums, [-1, 1])

                if use_coverage and coverage is not None:
                    # Non-first step of coverage.
                    # Multiply coverage vector by w_c to get
                    # coverage_features, which has shape
                    # (batch_size, attn_length, 1, attention_vec_size).
                    coverage_features = nn_ops.conv2d(coverage, w_c,
                                                      [1, 1, 1, 1], "SAME")

                    # Calculate
                    #   v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    # shape (batch_size, attn_length)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features), [2, 3])

                    # Calculate attention distribution.
                    attn_dist = masked_attention(e)

                    # Update coverage vector.
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn).
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])

                    # Calculate attention distribution.
                    attn_dist = masked_attention(e)

                    if use_coverage:  # first step of training
                        # Initialize coverage from this step's attention.
                        coverage = tf.expand_dims(
                            tf.expand_dims(attn_dist, 2), 2)

                # Calculate the context vector from attn_dist and
                # encoder_states; shape (batch_size, attn_size).
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage

        outputs = []
        attn_dists = []
        p_gens = []
        state = initial_state
        # Initialize coverage to None or whatever was passed in.
        coverage = prev_coverage
        context_vector = array_ops.zeros([batch_size, attn_size])
        # Ensure the second shape of attention vectors is set.
        context_vector.set_shape([None, attn_size])
        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that
            # we can pass it through a linear layer with this step's input to
            # get a modified version of the input. In decode mode, this is
            # what updates the coverage vector.
            context_vector, _, coverage = attention(initial_state, coverage)

        for i, inp in enumerate(decoder_inputs):
            tf.logging.info("Adding attention_decoder timestep %i of %i", i,
                            len(decoder_inputs))
            if i > 0:
                # Steps after the first share the variables created at step 0.
                variable_scope.get_variable_scope().reuse_variables()

            # Merge input and previous attentions into one vector x of the
            # same size as inp.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            # Run the decoder RNN cell. cell_output = decoder state
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                # Always true in decode mode. The scope is re-entered with
                # reuse=True because the initial attention(...) call above
                # has already created the attention variables.
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    # Don't allow coverage to update here.
                    context_vector, attn_dist, _ = attention(state, coverage)
            else:
                context_vector, attn_dist, coverage = attention(
                    state, coverage)
            attn_dists.append(attn_dist)

            # Calculate p_gen.
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    # Tensor shape (batch_size, 1)
                    p_gen = linear([context_vector, state.c, state.h, x], 1,
                                   True)
                    p_gen = tf.sigmoid(p_gen)
                    p_gens.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context
            # vector, and pass them through a linear layer.
            # This is V[s_t, h*_t] + b in the paper.
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            outputs.append(output)

        # If using coverage, reshape it back to (batch_size, attn_length).
        if coverage is not None:
            coverage = array_ops.reshape(coverage, [batch_size, -1])

        return outputs, state, attn_dists, p_gens, coverage
def seq2seq(feed_previous=False,
            input_dim=1,
            output_dim=1,
            input_length=120,
            output_length=48,
            hidden_dim=64,
            stacked_layers=2,
            GRADIENT_CLIPPING=2.5):
    """Build a stacked-LSTM encoder/decoder graph in a fresh default graph.

    The default graph is reset first. The encoder reads `input_length`
    placeholders; the decoder is unrolled for `output_length` steps and is
    always fed a zero "GO" input followed by the target sequence shifted by
    one step. Each decoder output is projected to `output_dim` with a shared
    weight matrix and bias.

    Args:
      feed_previous: Not used by this function; kept so existing callers
        keep working.
      input_dim: Width of each encoder input placeholder.
      output_dim: Width of each target placeholder and of each projected
        output.
      input_length: Number of encoder steps (placeholders).
      output_length: Number of decoder steps (target placeholders).
      hidden_dim: Number of units in every LSTM layer.
      stacked_layers: Number of LSTM layers in the stacked cell.
      GRADIENT_CLIPPING: Not used by this function; kept so existing callers
        keep working.

    Returns:
      A tuple `(encoder_input, target_sequence, reshaped, global_step)`:
      the list of encoder placeholders, the list of target placeholders, the
      list of projected decoder outputs, and the global step variable.
    """
    tf.reset_default_graph()

    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    # Output projection shared by all decoder steps.
    weights = {
        'out': tf.get_variable(
            'Weights_out',
            shape=[hidden_dim, output_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer()),
    }
    biases = {
        'out': tf.get_variable(
            'Biases_out',
            shape=[output_dim],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.)),
    }

    with tf.variable_scope('Seq2seq'):
        encoder_input = [
            tf.placeholder(tf.float32,
                           shape=(None, input_dim),
                           name="input_{}".format(t))
            for t in range(input_length)
        ]
        # The name previously read "y".format(t), which has no placeholder
        # and therefore dropped the step index. Targets are now named
        # y_0, y_1, ... to mirror input_0, input_1, ...
        target_sequence = [
            tf.placeholder(tf.float32,
                           shape=(None, output_dim),
                           name="y_{}".format(t))
            for t in range(output_length)
        ]
        # Decoder inputs: a zero "GO" step, then the targets shifted by one.
        decoder_input = [
            tf.zeros_like(target_sequence[0], dtype=tf.float32, name="GO")
        ] + target_sequence[:-1]

        with tf.variable_scope('LSTMCell'):
            cells = []
            for layer in range(stacked_layers):
                with tf.variable_scope('RNN_{}'.format(layer)):
                    cells.append(tf.contrib.rnn.LSTMCell(hidden_dim))
            cell = tf.contrib.rnn.MultiRNNCell(cells)

        with variable_scope.variable_scope('basic_rnn_seq2seq'):
            # The encoder runs on a deep copy of the cell object.
            encoder_cell = copy.deepcopy(cell)
            _, encoder_state = rnn.static_rnn(encoder_cell,
                                              encoder_input,
                                              dtype=dtypes.float32)

            with variable_scope.variable_scope('rnn_decoder'):
                state = encoder_state
                outputs = []
                for step, input_ in enumerate(decoder_input):
                    if step > 0:
                        # Later steps share the variables made at step 0.
                        variable_scope.get_variable_scope().reuse_variables()
                    output, state = cell(input_, state)
                    outputs.append(output)

        reshaped = [
            tf.matmul(decoder_output, weights['out']) + biases['out']
            for decoder_output in outputs
        ]

    return encoder_input, target_sequence, reshaped, global_step
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # only forwarded to embedding_lookup_sparse
    max_norm=None,
    return_trainable=False,
):
    """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

    Looks up embeddings while accounting for empty rows and invalid weights.
    Unless `combiner` is "sum", ids and weights are first passed through
    `_prune_invalid_weights`. Rows left without any id are filled with
    `default_id` (or 0); when `default_id` is None the result for such rows
    is replaced by the zero vector.

    The ids and weights may be multi-dimensional. They are flattened to two
    dimensions for the lookup and the leading dimensions are restored on the
    result.

    Args:
      embedding_weights: A single `dynamic_embedding.Variable` instance.
      sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing
        the ids; its dtype must equal `embedding_weights.key_dtype`.
      sparse_weights: `SparseTensor` of the same shape as `sparse_ids` whose
        dtype equals `embedding_weights.value_dtype`, or `None`.
      combiner: Forwarded to `embedding_lookup_sparse`; any value other than
        "sum" also enables weight pruning.
      default_id: The id to use for a row with no ids.
      name: A name for this operation.
      partition_strategy: Forwarded to `embedding_lookup_sparse`.
      max_norm: Forwarded to `embedding_lookup_sparse`.
      return_trainable: If True, also return the `TrainableWrapper`.

    Returns:
      A dense tensor of shape `[d_0, ..., d_{n-1}, e_1, ..., e_m]`; when
      `return_trainable` is True, a `(tensor, trainable_wrapper)` pair.

    Raises:
      ValueError: If `embedding_weights` is None.
      TypeError: If the key dtype differs from `sparse_ids.dtype`, or the
        value dtype differs from `sparse_weights.dtype`.
    """
    if embedding_weights is None:
        raise ValueError("Missing embedding_weights %s." % embedding_weights)

    if embedding_weights.key_dtype != sparse_ids.dtype:
        raise TypeError(
            "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
            "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

    if sparse_weights is None:
        weights_dtype = None
    else:
        weights_dtype = sparse_weights.dtype
    if weights_dtype and embedding_weights.value_dtype != weights_dtype:
        raise TypeError(
            "embedding_weights.value_dtype should be same with sparse_weights.dtype"
            ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

    current_scope = variable_scope.get_variable_scope()
    if current_scope.name:
        full_name = current_scope.name + "/" + name
    else:
        full_name = name

    with ops.name_scope(full_name + "/"):
        # Reshape higher-rank sparse ids and weights to linear segment ids.
        original_shape = sparse_ids.dense_shape
        original_rank_dim = tensor_shape.dimension_value(
            sparse_ids.dense_shape.get_shape()[0])
        if original_rank_dim is None:
            original_rank = array_ops.size(original_shape)
        else:
            original_rank = original_rank_dim

        # Collapse every leading dimension into one; keep the last as is.
        leading_dims = array_ops.slice(original_shape, [0],
                                       [original_rank - 1])
        flat_rows = math_ops.reduce_prod(leading_dims)
        last_dim = array_ops.gather(original_shape, original_rank - 1)
        sparse_ids = de.math.sparse_reshape(sparse_ids, [flat_rows, last_dim])
        if sparse_weights is not None:
            sparse_weights = sparse_tensor.SparseTensor(
                sparse_ids.indices, sparse_weights.values,
                sparse_ids.dense_shape)

        # Prune invalid weights.
        if combiner != "sum":
            sparse_ids, sparse_weights = _prune_invalid_weights(
                sparse_ids, sparse_weights)

        # Fill in dummy values for empty features, if necessary.
        sparse_ids, is_row_empty = de.math.sparse_fill_empty_rows(
            sparse_ids, default_id or 0)
        if sparse_weights is not None:
            sparse_weights, _ = de.math.sparse_fill_empty_rows(
                sparse_weights, 1.0)

        result, trainable_ = embedding_lookup_sparse(
            embedding_weights,
            sparse_ids,
            sparse_weights,
            combiner=combiner,
            partition_strategy=partition_strategy,
            name=name + "/embedding_lookup_sparse",
            max_norm=max_norm,
            return_trainable=True,
        )

        if default_id is None:
            # Broadcast is_row_empty to the same shape as the lookup result
            # so empty rows can be zeroed with a select.
            empty_column = array_ops.reshape(is_row_empty, [-1, 1])
            tile_multiples = array_ops.stack([1, array_ops.shape(result)[1]])
            is_row_empty = array_ops.tile(empty_column, tile_multiples)
            result = array_ops.where(is_row_empty,
                                     array_ops.zeros_like(result),
                                     result,
                                     name="where")

        # Reshape back from linear ids into the higher-dimensional result.
        restored_leading = array_ops.slice(
            math_ops.cast(original_shape, dtypes.int32),
            [0],
            [original_rank - 1],
        )
        embedding_dims = array_ops.slice(array_ops.shape(result), [1], [-1])
        final_result = array_ops.reshape(
            result, array_ops.concat([restored_leading, embedding_dims], 0))

        # Static shape: unknown leading dims, known embedding dims.
        static_leading = tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value)
        final_result.set_shape(
            static_leading.concatenate(result.get_shape()[1:]))

    if return_trainable:
        return (final_result, trainable_)
    return final_result
def variable(name,
             shape=None,
             dtype=None,
             initializer=None,
             regularizer=None,
             trainable=True,
             collections=None,
             caching_device=None,
             device=None,
             partitioner=None,
             custom_getter=None,
             use_resource=None,
             synchronization=variables.VariableSynchronization.AUTO,
             aggregation=variables.VariableAggregation.NONE):
    """Gets an existing variable with these parameters or creates a new one.

    Args:
      name: the name of the new or existing variable.
      shape: shape of the new or existing variable.
      dtype: type of the new or existing variable.
      initializer: initializer for the variable if one is created.
      regularizer: forwarded to the getter.
      trainable: forwarded to the getter.
      collections: A list of collection names to which the Variable will be
        added. If None it defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
        Duplicates are removed (via a set, so order is not preserved).
      caching_device: forwarded to the getter.
      device: Optional device on which to place the variable; the call runs
        under `ops.device(device or '')`.
      partitioner: forwarded to the getter.
      custom_getter: Optional callable used instead of
        `variable_scope.get_variable`. It is additionally passed
        `reuse=<current scope's reuse flag>`.
      use_resource: forwarded to the getter.
      synchronization: forwarded to the getter.
      aggregation: forwarded to the getter.

    Returns:
      Whatever the getter returns: the created or existing variable.
    """
    if collections is None:
        requested = [ops.GraphKeys.GLOBAL_VARIABLES]
    else:
        requested = list(collections)
    # Remove duplicates
    unique_collections = list(set(requested))

    if custom_getter is None:
        getter = variable_scope.get_variable
    else:
        current_reuse = variable_scope.get_variable_scope().reuse
        getter = functools.partial(custom_getter, reuse=current_reuse)

    getter_kwargs = dict(
        shape=shape,
        dtype=dtype,
        initializer=initializer,
        regularizer=regularizer,
        trainable=trainable,
        collections=unique_collections,
        caching_device=caching_device,
        partitioner=partitioner,
        use_resource=use_resource,
        synchronization=synchronization,
        aggregation=aggregation)
    with ops.device(device or ''):
        return getter(name, **getter_kwargs)
def call(self, inputs, state):
    """Run one step of this LSTM variant with an additional label layer.

    Args:
      inputs: An indexable container. `inputs[0]` is the input tensor `x`
        (must have static rank 2 with a known second dimension). When
        `len(inputs) == 2`, `inputs[1]` is used as `y` in place of
        `softmax(output)`.
      state: A `(c_prev, h_prev)` pair when `self._state_is_tuple` is true;
        otherwise a single tensor from which `c_prev` and `h_prev` are
        sliced column-wise.

    Returns:
      A pair `(output, new_state)`. `output` is the label-layer result
      shifted by its maximum; `new_state` is an `LSTMStateTuple(c, m)` or
      the column-wise concatenation of `c` and `m`.

    Raises:
      ValueError: If the input size cannot be inferred from the static
        shape, or if peepholes or cell clipping are enabled (neither is
        supported).
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
        (c_prev, h_prev) = state
    else:
        # Packed state: first num_units columns are c, the next num_proj
        # columns are h.
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        h_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    x = inputs[0]
    dtype = x.dtype
    input_size = x.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from get_shape()")  # [-1]?

    scope = vs.get_variable_scope()
    with vs.variable_scope(scope, reuse=tf.AUTO_REUSE,
                           initializer=self._initializer) as unit_scope:
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        lstm_matrix = _linear([x, h_prev], 4 * self._num_units, bias=True)
        i, j, f, o = array_ops.split(value=lstm_matrix,
                                     num_or_size_splits=4,
                                     axis=1)

        with vs.variable_scope("label_layer",
                               reuse=tf.AUTO_REUSE) as label_scope:
            """ compute the output: probabilities """
            # Multiplying the identity by wy yields wy itself.
            yy = tf.eye(self._num_labs)
            wy = vs.get_variable("wy", [self._num_labs, self._num_units])
            li = math_ops.matmul(yy, wy)
            # NOTE(review): `logits` is only assigned for the gates "input",
            # "output", "forget" and "combine"; any other value (including
            # "average", handled further down) reaches the next use of
            # `logits` unassigned - confirm the intended set of gates.
            if self._gate == "input":
                logits = i + li
            elif self._gate == "output":
                logits = o + li
            elif self._gate == "forget":
                logits = f + li
            elif self._gate == "combine":
                # li is repeated four times to match lstm_matrix's width.
                logits = lstm_matrix + array_ops.tile(li, [1, 4])
            yb = vs.get_variable("yb", [1, self._num_labs],
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            # log(1 + exp(.)) summed over axis 1, then the bias is added.
            logits = tf.reduce_sum(tf.log(1 + tf.exp(logits)), axis=1)
            logits += yb
            # Check check check probs = probs + tf.log(yb)
            output = logits - tf.reduce_max(logits)

        """ compute next state """
        # NOTE(review): `y` is left unassigned when len(inputs) is neither 1
        # nor 2 - confirm callers never pass anything else.
        if len(inputs) == 1:
            y = softmax(output)
        elif len(inputs) == 2:
            y = inputs[1]
        i_ = math_ops.matmul(y, wy)

        """ what should be added """
        if self._gate == "input":
            i += i_
        elif self._gate == "output":
            o += i_
        elif self._gate == "forget":
            f += i_
        elif self._gate == "combine":
            i += i_
            j += i_
            o += i_
            f += i_
        elif self._gate == "average":
            print("TODO")

        if self._use_peepholes:
            raise ValueError("Not supported yet")
        c = (sigmoid(f + self._forget_bias) * c_prev +
             sigmoid(i) * self._activation(j))

        if self._cell_clip is not None:
            raise ValueError("Not supported yet")
        m = sigmoid(o) * self._activation(c)

    new_state = (LSTMStateTuple(c, m)
                 if self._state_is_tuple else array_ops.concat([c, m], 1))
    return output, new_state
def __init__(self,
             args,
             output_size,
             build_bias,
             bias_initializer=None,
             kernel_initializer=None):
    """Create the weight matrix (and optional bias) for a linear map.

    Args:
      args: A 2D tensor or a non-empty sequence of 2D tensors whose second
        dimensions are statically known.
      output_size: int, second dimension of the weight matrix.
      build_bias: bool, whether to also create a bias of size
        `output_size`.
      bias_initializer: Initializer for the bias; a zero constant
        initializer is used when None.
      kernel_initializer: Initializer passed to the weight variable.

    Raises:
      ValueError: If `args` is None or an empty sequence, if any argument
        is not 2D, or if any argument's second dimension is unknown.
    """
    self._build_bias = build_bias

    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if nest.is_sequence(args):
        self._is_sequence = True
    else:
        self._is_sequence = False
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    shapes = [a.get_shape() for a in args]
    total_arg_size = 0
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        width = shape[1].value
        if width is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))
        total_arg_size += width

    # All arguments' dtypes are read; the first one is used.
    arg_dtypes = [a.dtype for a in args]
    dtype = arg_dtypes[0]

    with vs.variable_scope(vs.get_variable_scope()) as outer_scope:
        self._weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME,
                                        [total_arg_size, output_size],
                                        dtype=dtype,
                                        initializer=kernel_initializer)
        if not build_bias:
            return
        with vs.variable_scope(outer_scope) as inner_scope:
            # The bias is created with partitioning turned off.
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(
                    0.0, dtype=dtype)
            self._biases = vs.get_variable(_BIAS_VARIABLE_NAME,
                                           [output_size],
                                           dtype=dtype,
                                           initializer=bias_initializer)
def call(self, inputs, state):
    """Run one step of G-LSTM.

    Args:
      inputs: input Tensor, 2D, [batch x input_size].
      state: this must be a tuple of state Tensors, both `2-D`, with column
        sizes `c_state` and `m_state`.

    Returns:
      A tuple containing:

      - A `2-D` Tensor representing the output of the G-LSTM after reading
        `inputs` when previous state was `state`; it is passed through the
        projection layer when `num_proj` was set.
      - LSTMStateTuple representing the new state of the G-LSTM cell after
        reading `inputs` when the previous state was `state`.
    """
    c_prev, m_prev = state

    # Prefer static sizes; fall back to dynamic shape tensors.
    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
    input_size = inputs.shape[-1].value or array_ops.shape(inputs)[-1]
    dtype = inputs.dtype

    def _zero_bias(bias_name):
        # One zero-initialised bias vector of length num_units.
        return vs.get_variable(name=bias_name,
                               shape=[self._num_units],
                               dtype=dtype,
                               initializer=init_ops.constant_initializer(
                                   0.0, dtype=dtype))

    with vs.variable_scope(vs.get_variable_scope(),
                           initializer=self._initializer):
        # Per-group slices for the i, j, f and o gates, in that order.
        gate_parts = ([], [], [], [])

        for group_id in range(self._number_of_groups):
            with vs.variable_scope("group%d" % group_id):
                # The slice width comes from the actual input size rather
                # than self._group_shape[0], which is only correct if the
                # inputs dim equals num_units.
                input_slice = self._get_input_for_group(
                    inputs, group_id,
                    int(input_size / self._number_of_groups))
                recurrent_slice = self._get_input_for_group(
                    m_prev, group_id,
                    int(self._output_size / self._number_of_groups))
                x_g_id = array_ops.concat([input_slice, recurrent_slice],
                                          axis=1)

                # The per-group linear layer is built on first use.
                if self._linear1[group_id] is None:
                    self._linear1[group_id] = _Linear(
                        x_g_id, 4 * self._group_shape[1], False)
                R_k = self._linear1[group_id](x_g_id)  # pylint: disable=invalid-name
                i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)

            for parts, piece in zip(gate_parts, (i_k, j_k, f_k, o_k)):
                parts.append(piece)

        # All four bias variables are created before any gate is assembled.
        gate_biases = [
            _zero_bias(bias_name)
            for bias_name in ("bias_i", "bias_j", "bias_f", "bias_o")
        ]
        i, j, f, o = [
            nn_ops.bias_add(array_ops.concat(parts, axis=1), bias)
            for parts, bias in zip(gate_parts, gate_biases)
        ]

        forget_gate = math_ops.sigmoid(f + self._forget_bias)
        c = forget_gate * c_prev + math_ops.sigmoid(i) * math_ops.tanh(j)
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                # The projection layer is built on first use.
                if self._linear2 is None:
                    self._linear2 = _Linear(m, self._num_proj, False)
                m = self._linear2(m)

    new_state = rnn_cell_impl.LSTMStateTuple(c, m)
    return m, new_state
def beam_attention_decoder(decoder_inputs, initial_state, attention_states,
                           cell, embedding, output_size=None, num_heads=1,
                           loop_function=None, dtype=None, scope=None,
                           initial_state_attention=False,
                           output_projection=None, beam_size=10):
    """Attention decoder that decodes `beam_size` hypotheses in parallel.

    The decoder runs with a batch dimension of `beam_size`: every layer of
    `initial_state` is repeated `beam_size` times along axis 0 and the first
    input is the embedding of symbol id 1 repeated `beam_size` times.

    Args:
      decoder_inputs: non-empty list of 2D Tensors. Only its length is used
        for step 0, whose input is replaced by the embedded start symbol.
      initial_state: iterable with one element per RNN layer, each exposing
        `.c` and `.h` (e.g. an LSTMStateTuple).
      attention_states: 3D Tensor; dimensions 1 and 2 must be statically known.
      cell: callable `cell(x, state) -> (output, state)` with `output_size`.
      embedding: embedding matrix used to look up the start symbol.
      output_size: size of the output vectors; defaults to `cell.output_size`.
      num_heads: number of attention heads, at least 1.
      loop_function: if not None, called as
        `loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)` to
        produce the input of step i from the previous output.
        NOTE(review): presumably it appends to the three lists; the returned
        beam tensors are built from `beam_path` and `beam_symbols`, so they
        must be non-empty by the end of decoding -- confirm against caller.
      dtype: dtype for the variable scope.
      scope: variable scope name; defaults to "attention_decoder".
      initial_state_attention: if True, the initial attention reads are
        computed from the (tiled) initial state instead of being zeros.
      output_projection: optional `(W, b)` pair applied to each output before
        the argmax. If None, the argmax is taken over the raw output.
      beam_size: number of hypotheses decoded in parallel.

    Returns:
      A tuple `(outputs, state, beam_path, beam_symbols)`: `outputs` is the
      list of per-step argmax Tensors, `state` the final cell state, and the
      last two are `beam_path` / `beam_symbols` concatenated along axis 0 and
      reshaped to `[-1, beam_size]`.

    Raises:
      ValueError: if `decoder_inputs` is empty, `num_heads < 1`, dimension 1
        or 2 of `attention_states` is unknown, or an input size cannot be
        inferred.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    # Slice [1:3] covers both dimension 1 and dimension 2, as the message says.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder",
                                       dtype=dtype) as scope:
        dtype = scope.dtype

        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for head in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_%d" % head, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable(
                "AttnV_%d" % head, [attention_vec_size]))

        # Expand the encoder's final state to beam_size rows, because the
        # decoder's batch size is beam_size. initial_state has one element per
        # RNN layer, each holding the c and h states; both are repeated and
        # re-wrapped in an LSTMStateTuple.
        state = tuple(
            rnn_cell_impl.LSTMStateTuple(
                tf.concat([layer.c] * beam_size, 0),
                tf.concat([layer.h] * beam_size, 0))
            for layer in initial_state)

        def attention(query):
            """Return one attention read per head for `query`."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for head in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % head):
                    y = Linear(query, attention_vec_size, True)(query)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[head] * math_ops.tanh(hidden_features[head] + y),
                        [2, 3])
                    weights = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(weights, [-1, attn_length, 1, 1])
                        * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        # The attention reads also get beam_size rows.
        batch_attn_size = array_ops.stack([beam_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                 for _ in xrange(num_heads)]
        for attn in attns:  # Ensure the second shape of attention vectors is set.
            attn.set_shape([None, attn_size])

        if initial_state_attention:
            # Query with the tiled state so the reads have beam_size rows like
            # every other decoder tensor.
            attns = attention(state)

        log_beam_probs, beam_path, beam_symbols = [], [], []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            if i == 0:
                # At step 0 the input is a batch of beam_size identical
                # entries, all the start (GO) symbol, hard-coded as id 1.
                inp = tf.nn.embedding_lookup(
                    embedding,
                    tf.constant(1, dtype=tf.int32, shape=[beam_size]))

            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path,
                                        beam_symbols)

            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)
            inputs = [inp] + attns
            x = Linear(inputs, input_size, True)(inputs)

            # Run the RNN.
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                inputs = [cell_output] + attns
                output = Linear(inputs, output_size, True)(inputs)
            if loop_function is not None:
                prev = output

            if output_projection is not None:
                logits = nn_ops.xw_plus_b(
                    output, output_projection[0], output_projection[1])
            else:
                # No projection supplied: rank the raw outputs directly.
                logits = output
            outputs.append(tf.argmax(logits, axis=1))

    return (outputs, state,
            tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]),
            tf.reshape(tf.concat(beam_symbols, 0), [-1, beam_size]))
def dynamic_distraction_m2_decoder(decoder_inputs, initial_state,
                                   distract_initial_state, attention_states,
                                   attention_states_query, cell1, cell2,
                                   distraction_cell, output_size=None,
                                   num_heads=1, loop_function=None, dtype=None,
                                   scope=None, initial_state_attention=False):
    """RNN decoder with query attention and history-penalised state attention.

    Each step runs `cell2` on the input, attends over `attention_states_query`
    with its output, then attends over `attention_states` using the `cell2`
    output together with the query read. The state attention subtracts
    `prev_states[pos] * b_a` from each position's features before scoring,
    where `prev_states` accumulates the attention weights of earlier steps.
    The resulting context is combined with an accumulated context
    (`tanh(W * ctx - U * acc_ctx)`) and fed to `cell1`.

    This implementation is based on the attention decoder of
    http://arxiv.org/abs/1412.7449.

    Args:
      decoder_inputs: A non-empty list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor; projected to `cell1.output_size` columns to
        initialise the state of both cells.
      distract_initial_state: unused by this implementation.
      attention_states: 3D Tensor [batch_size x attn_length x attn_size];
        dimensions 1 and 2 must be statically known.
      attention_states_query: 3D Tensor attended over by the query attention.
      cell1: cell run on the `cell2` output joined with the new context; its
        `output_size` is the default output size.
      cell2: cell run on the previous `cell1` state joined with the input.
      distraction_cell: unused by this implementation.
      output_size: Size of the output vectors; if None, we use
        cell1.output_size.
      num_heads: Number of attention heads that read from attention_states.
        Only the first head's read is used to build the context.
      loop_function: If not None, this function will be applied to i-th output
        in order to generate i+1-th input, and decoder_inputs will be ignored,
        except for the first element ("GO" symbol).
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number,
          * next is a 2D Tensor of shape [batch_size x input_size].
      dtype: The dtype to use for the variable scope.
      scope: VariableScope for the created subgraph; default:
        "dynamic_distraction_m2_decoder".
      initial_state_attention: If False (default), initial attentions are
        zero. If True, initialize the attentions from the initial state and
        attention states.

    Returns:
      A tuple of the form (outputs, state, ctx_vec), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          with `output_size` columns.
        state: The state of `cell1` at the final time-step.
        ctx_vec: A list with the context vector computed at each step.

    Raises:
      ValueError: when num_heads is not positive, there are no inputs, shapes
        of attention_states are not set, or input size cannot be inferred
        from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    # The attention history keeps one Python-level entry per position, so the
    # length has to be a static integer.
    if attention_states.get_shape()[1].value is None:
        raise ValueError("Shape[1] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell1.output_size

    with variable_scope.variable_scope(
            scope or "dynamic_distraction_m2_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length_state = attention_states.get_shape()[1].value
        attn_length_query = attention_states_query.get_shape()[1].value

        dim_1 = initial_state.get_shape()[1].value
        dim_2 = cell1.output_size
        project_initial_state_W = variable_scope.get_variable(
            "Initial_State_W", [dim_1, dim_2])
        project_initial_state_B = variable_scope.get_variable(
            "Initial_State_Bias", [dim_2])

        if attn_length_query is None:
            attn_length_query = array_ops.shape(attention_states_query)[1]

        attn_size_state = attention_states.get_shape()[2].value
        attn_size_query = attention_states_query.get_shape()[2].value

        b_a = variable_scope.get_variable("b_a", [1, attn_size_state])

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden_states = array_ops.reshape(
            attention_states, [-1, attn_length_state, 1, attn_size_state])
        hidden_states_query = array_ops.reshape(
            attention_states_query,
            [-1, attn_length_query, 1, attn_size_query])

        hidden_features_states = []
        hidden_features_query = []

        v_state = []
        attention_vec_size_state = attn_size_state  # Size of query vectors.
        for head in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_State_%d" % head,
                [1, 1, attn_size_state, attention_vec_size_state])
            hidden_features_states.append(
                nn_ops.conv2d(hidden_states, k, [1, 1, 1, 1], "SAME"))
            v_state.append(variable_scope.get_variable(
                "AttnV_State_%d" % head, [attention_vec_size_state]))

        v_query = []
        attention_vec_size_query = attn_size_query  # Size of query vectors.
        for head in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_Query_%d" % head,
                [1, 1, attn_size_query, attention_vec_size_query])
            hidden_features_query.append(
                nn_ops.conv2d(hidden_states_query, k, [1, 1, 1, 1], "SAME"))
            v_query.append(variable_scope.get_variable(
                "AttnV_Query_%d" % head, [attention_vec_size_query]))

        state_1 = (math_ops.matmul(initial_state, project_initial_state_W)
                   + project_initial_state_B)
        state_2 = state_1

        # One accumulated attention weight per attention position.
        prev_states = [array_ops.zeros([batch_size])
                       for _ in range(attn_length_state)]

        def _flatten_query(query):
            """Concatenate a (possibly nested) query into one 2D Tensor."""
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            return query

        def attention(query, prev_states, b_a):
            """Attend over hidden_states, penalising past attention.

            Returns the per-head reads and the updated per-position history.
            """
            ds = []  # Results of attention reads will be stored here.
            query = _flatten_query(query)
            for head in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % head):
                    y = linear(query, attention_vec_size_state, True)
                    y = array_ops.reshape(
                        y, [-1, 1, 1, attention_vec_size_state])
                    features = hidden_features_states[head] + y
                    new_states = array_ops.squeeze(features, [2])
                    new_states_list = array_ops.unpack(new_states, axis=1)
                    distract_states_list = []
                    for pos, new_state in enumerate(new_states_list):
                        # Scale b_a by how much this position was attended
                        # to so far and subtract it from its features.
                        history = array_ops.reshape(prev_states[pos], [-1, 1])
                        penalty = math_ops.matmul(history, b_a)
                        distract_states_list.append(new_state - penalty)
                    distract_states = array_ops.pack(
                        distract_states_list, axis=1)
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v_state[head] * math_ops.tanh(distract_states), [2])
                    weights = nn_ops.softmax(s)
                    # Keep the history as a per-position list so that later
                    # heads can index it the same way as the first one.
                    prev_states = array_ops.unpack(
                        array_ops.pack(prev_states, axis=1) + weights, axis=1)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(
                            weights, [-1, attn_length_state, 1, 1])
                        * hidden_states,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size_state]))
            return ds, prev_states

        def attention_query(query):
            """Attend over hidden_states_query; return the first head's read."""
            ds = []  # Results of attention reads will be stored here.
            query = _flatten_query(query)
            for head in xrange(num_heads):
                with variable_scope.variable_scope(
                        "Attention_Query_%d" % head):
                    y = linear(query, attention_vec_size_query, True)
                    y = array_ops.reshape(
                        y, [-1, 1, 1, attention_vec_size_query])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v_query[head]
                        * math_ops.tanh(hidden_features_query[head] + y),
                        [2, 3])
                    weights = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(
                            weights, [-1, attn_length_query, 1, 1])
                        * hidden_states_query,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size_query]))
            return ds[0]

        outputs = []
        ctx_vec = []
        prev = None
        batch_attn_size_state = array_ops.pack([batch_size, attn_size_state])
        batch_attn_size_query = array_ops.pack([batch_size, attn_size_query])
        attns_state = [array_ops.zeros(batch_attn_size_state, dtype=dtype)
                       for _ in xrange(num_heads)]
        attns_query = [array_ops.zeros(batch_attn_size_query, dtype=dtype)
                       for _ in xrange(num_heads)]
        for attn in attns_state:  # Ensure the second shape is set.
            attn.set_shape([None, attn_size_state])
        for attn in attns_query:  # Ensure the second shape is set.
            attn.set_shape([None, attn_size_query])

        acc_ctx = array_ops.zeros([batch_size, attn_size_state])

        if initial_state_attention:
            attns_query = attention_query(initial_state)
            list_of_queries = [initial_state, attns_query]
            attns_state, prev_states = attention(
                list_of_queries, prev_states, b_a)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)

            with variable_scope.variable_scope("Cell2"):
                input_2 = linear([state_1] + [inp], input_size, True)
                output_2, state_2 = cell2(input_2, state_2)

            # Run the attention mechanism. On the first step the attention
            # variables already exist when initial_state_attention is set,
            # so they must be reused.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns_query = attention_query(output_2)
                    list_of_queries = [output_2, attns_query]
                    attns_state, prev_states = attention(
                        list_of_queries, prev_states, b_a)
            else:
                attns_query = attention_query(output_2)
                list_of_queries = [output_2, attns_query]
                attns_state, prev_states = attention(
                    list_of_queries, prev_states, b_a)

            # NOTE(review): the nesting of "Cell1" and of the output
            # projection inside "AttnOutputProjection" was reconstructed and
            # determines variable names -- confirm against saved checkpoints.
            with variable_scope.variable_scope("AttnOutputProjection"):
                W = variable_scope.get_variable("W", [1, attn_size_state])
                U = variable_scope.get_variable("U", [1, attn_size_state])
                new_ctx = (math_ops.mul(W, attns_state[0])
                           - math_ops.mul(U, acc_ctx))
                new_ctx = math_ops.tanh(new_ctx)
                acc_ctx = acc_ctx + new_ctx

                with variable_scope.variable_scope("Cell1"):
                    input_1 = linear([output_2] + [new_ctx], input_size, True)
                    output_1, state_1 = cell1(input_1, state_1)

                output = math_ops.tanh(
                    linear([inp] + [output_1] + [new_ctx], output_size, True))

            if loop_function is not None:
                prev = output
            outputs.append(output)
            ctx_vec.append(new_ctx)

    return outputs, state_1, ctx_vec
def attention_decoder(decoder_inputs, sequence_length, initial_state,
                      attention_matrix, cell, output_size=None,
                      loop_function=None, dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
    """RNN decoder with two-level attention over `attention_matrix`.

    `attention_matrix` is unpacked along axis 1; the same attention
    ("Attention_outer", variables shared across slices) is applied to each
    slice, and a second attention ("Attention_inner") is applied over the
    stacked per-slice reads.

    Args:
      decoder_inputs: non-empty list of Tensors, one per decoding step.
      sequence_length: optional per-example lengths. When given, each step is
        run through `_rnn_step` with a zero output for finished examples.
      initial_state: initial cell state, or None to use
        `cell.zero_state(batch_size, dtype)`.
      attention_matrix: Tensor whose dimensions after the first are all
        statically known; the last one is the attention size.
      cell: RNN cell providing `output_size`, `state_size` and `zero_state`.
      output_size: size of the output vectors; defaults to `cell.output_size`.
      loop_function: if not None, `loop_function(prev, i)` produces the input
        of step i from the previous output.
      dtype: dtype of the zero state and of the initial attention read.
      scope: variable scope name; defaults to "attention_decoder".
      initial_state_attention: if True, the initial attention read is
        computed from the starting state instead of being zeros.

    Returns:
      A tuple `(outputs, state)` with the list of per-step outputs and the
      final cell state.

    Raises:
      ValueError: if `decoder_inputs` is empty, the shape of
        `attention_matrix` is not fully known past the batch dimension, or an
        input size cannot be inferred.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_matrix.get_shape()[1:].is_fully_defined():
        raise ValueError("Shape of attention matrix must be known: %s"
                         % attention_matrix.get_shape())

    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        # Temporarily avoid EmbeddingWrapper and seq2seq badness
        # TODO(lukaszkaiser): remove EmbeddingWrapper
        if decoder_inputs[0].get_shape().ndims != 1:
            (fixed_batch_size, input_size) = (
                decoder_inputs[0].get_shape().with_rank(2))
            if input_size.value is None:
                raise ValueError(
                    "Input size (second dimension of inputs[0]) must be accessible via "
                    "shape inference, but saw value None.")
        else:
            fixed_batch_size = (
                decoder_inputs[0].get_shape().with_rank_at_least(1)[0])

        # Prefer the static batch size; fall back to the dynamic one.
        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(decoder_inputs[0])[0]

        if sequence_length is not None:
            sequence_length = math_ops.to_int32(sequence_length)
            # Output emitted by _rnn_step for examples that are past their
            # sequence length.
            zero_output = array_ops.zeros(
                array_ops.pack([batch_size, cell.output_size]),
                decoder_inputs[0].dtype)
            zero_output.set_shape(
                tensor_shape.TensorShape(
                    [fixed_batch_size.value, cell.output_size]))
            min_sequence_length = math_ops.reduce_min(sequence_length)
            max_sequence_length = math_ops.reduce_max(sequence_length)

        # ATTENTION COMPUTATION

        attn_size = attention_matrix.get_shape()[-1].value
        batch_attn_size = array_ops.pack([batch_size, attn_size])

        def _attention(query, states):
            """Put attention masks on hidden using hidden_features and query."""
            v = variable_scope.get_variable("AttnV", [attn_size])
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attn_size])

            # attn is v^T * tanh(W1*h_t + U*q)
            # To calculate W1 * h_t we use a 1-by-1 convolution, need to
            # reshape before.
            attn_length = states.get_shape()[1].value
            hidden = array_ops.reshape(
                states, [-1, attn_length, 1, attn_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            y = rnn_cell._linear(query, attn_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attn_size])

            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v * math_ops.tanh(hidden_features + y), [2, 3])
            weights = nn_ops.softmax(s)

            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            return array_ops.reshape(d, [-1, attn_size])

        def attention(query):
            """Attend within each slice, then across the per-slice reads."""
            outer_states = tf.unpack(attention_matrix, axis=1)

            inner_states = []
            for idx, states in enumerate(outer_states):
                # Every slice shares the variables created for the first one.
                with variable_scope.variable_scope(
                        "Attention_outer", reuse=idx > 0):
                    inner_states.append(_attention(query, states))

            with variable_scope.variable_scope("Attention_inner"):
                return _attention(query, tf.pack(inner_states, 1))

        if initial_state is None:
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        outputs = []
        prev = None

        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])

        if initial_state_attention:
            # Use the resolved state so that a missing initial_state falls
            # back to the zero state instead of passing None.
            attns = attention(state)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)

            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)
            x = rnn_cell._linear([inp] + [attns], input_size, True)

            if sequence_length is not None:
                call_cell = lambda: cell(x, state)
                cell_output, state = _rnn_step(
                    i, sequence_length, min_sequence_length,
                    max_sequence_length, zero_output, state, call_cell,
                    cell.state_size)
            else:
                cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = rnn_cell._linear(
                    [cell_output] + [attns], output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
def _linear(args, output_size, bias,
            weight_name=_WEIGHTS_VARIABLE_NAME,
            bias_name=_BIAS_VARIABLE_NAME,
            bias_initializer=None, kernel_initializer=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, second dimension of W[i].
      bias: boolean, whether to add a bias term or not.
      weight_name: name of the weight variable.
      bias_name: name of the bias variable.
      bias_initializer: starting value to initialize the bias
        (default is all zeros).
      kernel_initializer: starting value to initialize the weight.

    Returns:
      A 2D Tensor with shape [batch x output_size] equal to
      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Add up the widths (dimension 1) of all arguments, validating each one.
    arg_shapes = [tensor.get_shape() for tensor in args]
    input_width = 0
    for arg_shape in arg_shapes:
        if arg_shape.ndims != 2:
            raise ValueError(
                "linear is expecting 2D arguments: %s" % arg_shapes)
        width = arg_shape[1]
        if width.value is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (arg_shape, width))
        input_width += width.value

    dtype = args[0].dtype

    # Now the computation.
    with vs.variable_scope(vs.get_variable_scope()) as outer_scope:
        kernel = vs.get_variable(
            weight_name, [input_width, output_size],
            dtype=dtype,
            initializer=kernel_initializer)

        # A single tensor is multiplied directly; several tensors are joined
        # along axis 1 first.
        stacked = args[0] if len(args) == 1 else array_ops.concat(args, 1)
        projected = math_ops.matmul(stacked, kernel)
        if not bias:
            return projected

        with vs.variable_scope(outer_scope) as inner_scope:
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(
                    0.0, dtype=dtype)
            offset = vs.get_variable(
                bias_name, [output_size],
                dtype=dtype,
                initializer=bias_initializer)
        return nn_ops.bias_add(projected, offset)
def __init__(self, dist, coord, replica_id, devices, variable_creator_fn, fn,
             caching_scope, args, kwargs):
    """Store the replica function and snapshot the creating thread's context.

    Args:
      dist: distribution object; stored as `self.distribution` and asked for
        the replica's id within its sync group.
      coord: coordinator object, stored as `self.coord`.
      replica_id: integer id of this replica.
      devices: devices for this replica, stored as `self.devices`.
      variable_creator_fn: stored as `self.variable_creator_fn`.
      fn: function to run, with `args` and `kwargs`.
      caching_scope: object whose `new_cache_scope_count` and
        `cache_scope_exited_count` attributes are recorded when present.
      args: positional arguments for `fn`.
      kwargs: keyword arguments for `fn`.
    """
    super(_MirroredReplicaThread, self).__init__()
    self.coord = coord
    self.distribution = dist
    self.devices = devices
    self.replica_id = replica_id
    self.replica_id_in_sync_group = (
        dist.extended._get_replica_id_in_sync_group(replica_id))  # pylint: disable=protected-access

    self.variable_creator_fn = variable_creator_fn

    # State needed to run and return the results of `fn`.
    self.main_fn = fn
    self.main_args = args
    self.main_kwargs = kwargs
    self.main_result = None
    self.done = False

    # State needed to run the next merge_call() (if any) requested via
    # ReplicaContext.
    self.merge_fn = None
    self.merge_args = None
    self.merge_kwargs = None
    self.merge_result = None
    self.captured_name_scope = None
    self.captured_var_scope = None

    # Record both cache-scope counters, or neither if the caching scope
    # does not expose them.
    try:
        entered_count = caching_scope.new_cache_scope_count
        exited_count = caching_scope.cache_scope_exited_count
    except AttributeError:
        entered_count = None
        exited_count = None
    self.caching_scope_entered = entered_count
    self.caching_scope_exited = exited_count

    # We use a thread.Event for the main thread to signal when this
    # thread should start running (`should_run`), and another for
    # this thread to transfer control back to the main thread
    # (`has_paused`, either when it gets to a
    # `get_replica_context().merge_call` or when `fn` returns). In
    # either case the event starts cleared, is signaled by calling
    # set(). The receiving thread waits for the signal by calling
    # wait() and then immediately clearing the event using clear().
    self.should_run = threading.Event()
    self.has_paused = threading.Event()

    # These fields have to do with inheriting various contexts from the
    # parent thread:
    context.ensure_initialized()
    ctx = context.context()
    self.in_eager = ctx.executing_eagerly()
    self.record_thread_local_summary_state()
    self.record_thread_local_eager_context_state()
    self.context_device_policy = (
        pywrap_tfe.TFE_ContextGetDevicePlacementPolicy(
            ctx._context_handle))  # pylint: disable=protected-access

    self.graph = ops.get_default_graph()
    with ops.init_scope():
        self._init_in_eager = context.executing_eagerly()
        self._init_graph = ops.get_default_graph()
    self._variable_creator_stack = self.graph._variable_creator_stack[:]  # pylint: disable=protected-access

    self._var_scope = variable_scope.get_variable_scope()

    # Adding a "/" at end lets us re-enter this scope later.
    name_scope = self.graph.get_name_scope()
    if name_scope:
        name_scope += "/"
    if self.replica_id > 0:
        # Replicas other than the first get their own "replica_N/" sub-scope.
        name_scope = (name_scope or "") + "replica_%d/" % self.replica_id
    self._name_scope = name_scope
def func_graph_from_py_func(name, python_func, args, kwargs, signature=None,
                            func_graph=None, autograph=False,
                            add_control_dependencies=True, arg_names=None,
                            op_return_value=None):
    """Returns a `FuncGraph` generated from `python_func`.

    Args:
      name: an identifier for the function.
      python_func: the Python function to trace.
      args: the positional args with which the Python function should be
        called; ignored if a signature is provided.
      kwargs: the keyword args with which the Python function should be
        called; ignored if a signature is provided.
      signature: a possibly nested sequence of `TensorSpecs` specifying the
        shapes and dtypes of the arguments. When a signature is provided,
        `args` and `kwargs` are ignored, and `python_func` is traced with
        Tensors conforming to `signature`. If `None`, the shapes and dtypes
        are inferred from the inputs.
      func_graph: Optional. An instance of FuncGraph. If provided, we will use
        this graph else a new one is built and returned.
      autograph: whether to use autograph to compile `python_func`.
        See https://www.tensorflow.org/guide/autograph for more information.
      add_control_dependencies: If True, automatically adds control
        dependencies to ensure program order matches execution order and
        stateful ops always execute.
      arg_names: Optional list of argument names, used to give input
        placeholders recognizable names.
      op_return_value: Optional. A Tensor. If set and `python_func` returns
        Operations, those return values will be replaced with this value. If
        not set, returning an Operation triggers an error.

    Returns:
      A FuncGraph.

    Raises:
      TypeError: If any of `python_func`'s return values is neither `None` nor
        a `Tensor`.
    """
    if op_return_value is not None:
        assert isinstance(op_return_value, ops.Tensor), op_return_value
    if func_graph is None:
        func_graph = FuncGraph(name)
    assert isinstance(func_graph, FuncGraph)

    control_manager = (AutomaticControlDependencies
                       if add_control_dependencies
                       else ops.NullContextmanager)

    with func_graph.as_default(), control_manager() as deps_ctx:
        # Force resource variables while tracing; the previous setting is
        # restored in the `finally` below.
        current_scope = variable_scope.get_variable_scope()
        saved_use_resource = current_scope.use_resource
        current_scope.set_use_resource(True)

        if signature is not None:
            args = signature
            kwargs = {}

        # Creates and names placeholders for all arguments.
        func_args = _get_defun_inputs_from_args(args, arg_names)
        func_kwargs = _get_defun_inputs_from_kwargs(kwargs)

        # Note: `nest.flatten` sorts by keys, as does
        # `_deterministic_dict_values`.
        # Variables to help check whether mutation happens in calling the
        # function. Copy the recursive list, tuple and map structure, but not
        # base objects.
        func_args_before = nest.pack_sequence_as(
            func_args, nest.flatten(func_args))
        func_kwargs_before = nest.pack_sequence_as(
            func_kwargs, nest.flatten(func_kwargs))

        def convert(x):
            """Converts a function output to a Tensor."""
            if x is None:
                return None
            if op_return_value is not None and isinstance(x, ops.Operation):
                # TODO(b/79881896): we currently can't capture external
                # control deps, so this won't work if x needs to be captured
                # (i.e. if python_func returns captured Operations).
                with ops.control_dependencies([x]):
                    x = array_ops.identity(op_return_value)
            elif not isinstance(x, tensor_array_ops.TensorArray):
                try:
                    x = ops.convert_to_tensor_or_indexed_slices(x)
                except (ValueError, TypeError):
                    raise TypeError(
                        "To be compatible with tf.contrib.eager.defun, Python functions "
                        "must return zero or more Tensors; in compilation of %s, found "
                        "return value of type %s, which is not a Tensor." %
                        (str(python_func), type(x)))
            if add_control_dependencies:
                x = deps_ctx.mark_as_return(x)
            return x

        trace_tape = tape.push_new_tape()
        try:
            if autograph:
                # From here on `autograph` names the module, not the flag.
                from tensorflow.python import autograph  # pylint: disable=g-import-not-at-top
                _, original_func = tf_decorator.unwrap(python_func)

                def wrapper(*args, **kwargs):
                    conversion_options = autograph.ConversionOptions(
                        verbose=autograph.Verbosity.BRIEF,
                        recursive=True,
                        strip_decorators=(def_function.function,),
                        optional_features=(),
                    )
                    return autograph.converted_call(
                        original_func, None, conversion_options,
                        *args, **kwargs)

                # Wrapping around a decorator allows checks like
                # tf_inspect.getargspec to be accurate.
                converted_func = tf_decorator.make_decorator(
                    original_func, wrapper)
                tf_decorator.rewrap(python_func, original_func, converted_func)

            func_outputs = python_func(*func_args, **func_kwargs)

            # invariant: `func_outputs` contains only Tensors, IndexedSlices,
            # SparseTensors, TensorArrays and `None`s.
            func_outputs = nest.map_structure(convert, func_outputs)

            check_mutation(func_args_before, func_args)
            check_mutation(func_kwargs_before, func_kwargs)
        finally:
            tape.pop_tape(trace_tape)
            current_scope.set_use_resource(saved_use_resource)

        # Variables in `func_args`, `func_kwargs` should be explicit inputs
        # to the function, not captured inputs.
        tape_variables = trace_tape.watched_variables()
        arg_variables = set()
        inputs = []
        flat_args = nest.flatten(func_args) + nest.flatten(func_kwargs)
        for arg in flat_args:
            if isinstance(arg, resource_variable_ops.ResourceVariable):
                # Even if an argument variable was not used in the function,
                # we've already manually captured the resource Tensor when
                # creating argument placeholders.
                resource_placeholder = func_graph.captures.pop(arg.handle)
                arg_variables.add(arg)
                inputs.append(resource_placeholder)
            elif isinstance(arg, ops.Tensor):
                inputs.append(arg)
        variables = [
            var for var in tape_variables if var not in arg_variables
        ]
        func_graph.inputs = inputs + list(func_graph.captures.values())

        func_graph.structured_outputs = func_outputs
        # Returning a closed-over tensor does not trigger convert_to_tensor.
        func_graph.outputs.extend(
            func_graph.capture(x)
            for x in flatten(func_graph.structured_outputs)
            if x is not None)

        func_graph.variables = variables

    # Register any other functions defined in the graph.
    with ops.init_scope():
        if context.executing_eagerly():
            for f in func_graph._functions.values():  # pylint: disable=protected-access
                # TODO(ashankar): What about the gradient registry?
                context.add_function(f._c_func.func)  # pylint: disable=protected-access

    return func_graph
def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
          swap_memory=False, name=None):
    """foldl on the list of tensors unpacked from `elems` on dimension 0.

    This foldl operator repeatedly applies the callable `fn` to a sequence
    of elements from first to last. The elements are made of the tensors
    unpacked from `elems` on dimension 0. The callable `fn` takes two
    arguments: the accumulated value computed from the preceding invocation
    of `fn`, and the current slice(s) of `elems`. If `initializer` is None,
    `elems` must contain at least one element, and its first element is
    used as the initializer.

    Suppose that `elems` is unpacked into `values`, a list of tensors. The
    shape of the result tensor is `fn(initializer, values[0]).shape`.

    This method also allows multi-arity `elems` and output of `fn`. If
    `elems` is a (possibly nested) list or tuple of tensors, then each of
    these tensors must have a matching first (unpack) dimension. The second
    argument passed to `fn` has the same nested structure as `elems`.

    Args:
      fn: The callable to be performed, called as `fn(accumulator, slices)`.
      elems: A tensor or (possibly nested) sequence of tensors, each of
        which will be unpacked along their first dimension. The nested
        sequence of the resulting slices will be the second argument to
        `fn`.
      initializer: (optional) A tensor or (possibly nested) sequence of
        tensors, as the initial value for the accumulator.
      parallel_iterations: (optional) The number of iterations allowed to
        run in parallel.
      back_prop: (optional) True enables support for back propagation.
      swap_memory: (optional) True enables GPU-CPU memory swapping.
      name: (optional) Name prefix for the returned tensors.

    Returns:
      A tensor or (possibly nested) sequence of tensors, resulting from
      applying `fn` consecutively to the list of tensors unpacked from
      `elems`, from first to last.

    Raises:
      TypeError: if `fn` is not callable.

    Example:
      ```python
      elems = tf.constant([1, 2, 3, 4, 5, 6])
      sum = foldl(lambda a, x: a + x, elems)
      # sum == 21
      ```
    """
    if not callable(fn):
        raise TypeError("fn must be callable.")

    def create_ta(elem):
        # `n` is bound later in the enclosing scope, before this is called.
        return tensor_array_ops.TensorArray(
            dtype=elem.dtype, size=n, dynamic_size=False,
            infer_shape=True).unstack(elem)

    in_graph_mode = not context.executing_eagerly()
    with ops.name_scope(name, "foldl", [elems]):
        # TODO(akshayka): Remove the in_graph_mode check once caching devices
        # are supported in Eager
        varscope = None
        reset_caching_device = False
        if in_graph_mode:
            # Any get_variable calls in fn will cache the first call locally
            # and not issue repeated network I/O requests for each iteration.
            varscope = vs.get_variable_scope()
            if varscope.caching_device is None:
                # TODO(ebrevdo): Change to using colocate_with here and in
                # other methods.
                varscope.set_caching_device(lambda op: op.device)
                reset_caching_device = True

        try:
            # Convert elems to tensor array. n may be known statically.
            elems_flat = [
                ops.convert_to_tensor(elem, name="elem")
                for elem in nest.flatten(elems)
            ]
            n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
                 or array_ops.shape(elems_flat[0])[0])

            elems_ta = nest.map_structure(create_ta, elems)

            if initializer is None:
                # Seed the accumulator with element 0 and start folding at 1.
                a = nest.map_structure(lambda elem: elem.read(0), elems_ta)
                i = constant_op.constant(1)
            else:
                a = initializer
                i = constant_op.constant(0)

            def compute(i, a):
                elem_i = nest.map_structure(
                    lambda elem: elem.read(i), elems_ta)
                a = fn(a, elem_i)
                return [i + 1, a]

            _, r_a = control_flow_ops.while_loop(
                lambda i, a: i < n,
                compute, [i, a],
                parallel_iterations=parallel_iterations,
                back_prop=back_prop,
                swap_memory=swap_memory,
                maximum_iterations=n)
            return r_a
        finally:
            # Undo the temporary caching-device override even when `fn` or
            # the loop construction raises, so the enclosing variable scope
            # is never left in a modified state.
            if reset_caching_device:
                varscope.set_caching_device(None)
def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights,
                       buckets, seq2seq, softmax_loss_function=None,
                       per_example_loss=False, name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    The seq2seq argument is a function that defines a sequence-to-sequence
    model, e.g.,
      seq2seq = lambda x, y: basic_rnn_seq2seq(
          x, y, core_rnn_cell.GRUCell(24))

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq
        input.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq
        input.
      targets: A list of 1D batch-sized int32 Tensors (desired output
        sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A list of pairs of (input size, output size) for each bucket.
        Only the last bucket is used for the length validation.
      seq2seq: A sequence-to-sequence model function; it takes 2 input that
        agree with encoder_inputs and decoder_inputs, and returns a pair
        consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
      softmax_loss_function: Function (inputs-batch, labels-batch) ->
        loss-batch to be used instead of the standard softmax (the default
        if this is None).
      per_example_loss: Boolean. If set, the returned loss will be a
        batch-sized tensor of losses for each sequence in the batch. If
        unset, it will be a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to
        "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses), where:
        outputs: The outputs for each bucket. Its j'th element consists of a
          list of 2D Tensors. The shape of output tensors can be either
          [batch_size x output_size] or [batch_size x num_decoder_symbols]
          depending on the seq2seq model used.
        losses: List of scalar Tensors, representing losses for each bucket,
          or, if per_example_loss is set, a list of 1D batch-sized float
          Tensors.

    Raises:
      ValueError: If length of encoder_inputs, targets, or weights is
        smaller than the largest (last) bucket.
    """
    last_input_size, last_output_size = buckets[-1][0], buckets[-1][1]
    if len(encoder_inputs) < last_input_size:
        raise ValueError(
            "Length of encoder_inputs (%d) must be at least that of last "
            "bucket (%d)." % (len(encoder_inputs), last_input_size))
    if len(targets) < last_output_size:
        raise ValueError(
            "Length of targets (%d) must be at least that of last "
            "bucket (%d)." % (len(targets), last_output_size))
    if len(weights) < last_output_size:
        raise ValueError(
            "Length of weights (%d) must be at least that of last "
            "bucket (%d)." % (len(weights), last_output_size))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    with ops.name_scope(name, "model_with_buckets", all_inputs):
        for j, bucket in enumerate(buckets):
            # Every bucket after the first reuses the variables created by
            # the first one, so all buckets share a single set of weights.
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]],
                                            decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                if per_example_loss:
                    losses.append(
                        sequence_loss_by_example(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))
                else:
                    losses.append(
                        sequence_loss(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))

    return outputs, losses
def actrgn_attention_decoder(decoder_inputs, initial_state, attention_states,
                             cell, output_size=None, num_heads=1,
                             loop_function=None, dtype=None, scope=None,
                             initial_state_attention=False):
    """Attention-based RNN decoder for sequence-to-sequence models.

    At every step the decoder feeds the cell a linear combination of the
    current input and the previous attention reads, then recomputes the
    attention reads over `attention_states` from the new cell state, and
    finally projects the cell output together with the attention reads to
    produce the step output. The approach follows
    http://arxiv.org/abs/1412.7449.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: core_rnn_cell.RNNCell defining the cell function and size.
      output_size: Size of the output vectors; if None, cell.output_size is
        used.
      num_heads: Number of attention heads that read from attention_states.
      loop_function: If not None, it is applied to the i-th output to
        generate the i+1-th input, and decoder_inputs are ignored except for
        the first element ("GO" symbol). Useful for decoding, and for
        training to emulate http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number,
          * next is a 2D Tensor of shape [batch_size x input_size].
      dtype: The dtype passed to the variable scope and used for the initial
        zero attention vectors.
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".
      initial_state_attention: If False (default), initial attentions are
        zero. If True, the attentions are initialized from the initial state
        and attention states.

    Returns:
      A tuple (outputs, state):
        outputs: A list with one 2D Tensor per decoder input, each computed
          as linear([cell_output] + attention_reads, output_size).
        state: The cell state after the final step.

    Raises:
      ValueError: when there are no inputs, num_heads is not positive,
        dimension 2 of attention_states is unknown, or the input size cannot
        be inferred from an input.
    """
    if not decoder_inputs:
        raise ValueError(
            "Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s" %
                         attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        # Dynamic batch size, needed for building the zero attentions.
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # W1 * h_t is computed with a 1-by-1 convolution, which needs a 4D
        # view of the attention states.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        attention_vec_size = attn_size  # Size of query vectors.
        hidden_features = []
        head_vectors = []
        for head in xrange(num_heads):
            kernel = variable_scope.get_variable(
                "AttnW_%d" % head, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, kernel, [1, 1, 1, 1], "SAME"))
            head_vectors.append(
                variable_scope.get_variable("AttnV_%d" % head,
                                            [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Return one attention read over `hidden` per head."""
            reads = []
            if nest.is_sequence(query):
                # A tuple query is flattened and concatenated.
                query_parts = nest.flatten(query)
                for part in query_parts:
                    # Check that ndims == 2 if specified.
                    ndims = part.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_parts, 1)
            for head in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % head):
                    projected = linear(query, attention_vec_size, True)
                    projected = array_ops.reshape(
                        projected, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    scores = math_ops.reduce_sum(
                        head_vectors[head] * math_ops.tanh(
                            hidden_features[head] + projected), [2, 3])
                    mask = nn_ops.softmax(scores)
                    # Attention-weighted sum of the hidden states.
                    weighted = math_ops.reduce_sum(
                        array_ops.reshape(mask, [-1, attn_length, 1, 1]) *
                        hidden, [1, 2])
                    reads.append(
                        array_ops.reshape(weighted, [-1, attn_size]))
            return reads

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = []
        for _ in xrange(num_heads):
            zero_attn = array_ops.zeros(batch_attn_size, dtype=dtype)
            # Ensure the second dimension of the attention vector is set.
            zero_attn.set_shape([None, attn_size])
            attns.append(zero_attn)
        if initial_state_attention:
            attns = attention(initial_state)

        for step, inp in enumerate(decoder_inputs):
            if step > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # With a loop_function, the previous output replaces the given
            # decoder input from the second step on.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, step)
            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)
            cell_input = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(cell_input, state)
            # Run the attention mechanism. On the first step the attention
            # variables already exist if they were used for the initial
            # state, so they must be reused.
            if step == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
def rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None,
        scope=None):
    """Build an unrolled recurrent network by applying `cell` to each input.

    Without `sequence_length` the generated network is simply:

      state = cell.zero_state(...)   # or the provided initial_state
      outputs = []
      for input_ in inputs:
        output, state = cell(input_, state)
        outputs.append(output)
      return (outputs, state)

    When `sequence_length` is given, each step is delegated to `_rnn_step`
    together with the minimum and maximum sequence length of the batch and
    a zero output tensor, so that dynamic calculation can be performed.

    Args:
      cell: An instance of the cell base class checked below.
      inputs: A length T list of inputs, each a tensor of shape
        [batch_size, cell.input_size].
      initial_state: (optional) An initial state for the RNN.
      dtype: (optional) The data type for the initial state. Required if
        initial_state is not provided.
      sequence_length: (optional) The length of each sequence in inputs; it
        is cast to int32 before use.
      scope: VariableScope for the created subgraph; defaults to "RNN".

    Returns:
      A pair (outputs, state) where:
        outputs is a length T list of outputs (one for each input)
        state is the final state

    Raises:
      TypeError: If `cell` fails the instance check or `inputs` is not a
        list.
      ValueError: If `inputs` is empty, if the input depth cannot be
        inferred via shape inference, or if neither `initial_state` nor
        `dtype` is provided.
    """
    if not isinstance(cell, BaseCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    outputs = []
    # Create a new scope in which the caching device is either determined by
    # the parent scope, or is set to place the cached Variable using the
    # same placement as for the rest of the RNN.
    with vs.variable_scope(scope or "RNN") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        first_input = inputs[0]
        # Temporarily avoid EmbeddingWrapper and seq2seq badness
        # TODO(lukaszkaiser): remove EmbeddingWrapper
        if first_input.get_shape().ndims == 1:
            fixed_batch_size = (
                first_input.get_shape().with_rank_at_least(1)[0])
        else:
            (fixed_batch_size,
             input_size) = first_input.get_shape().with_rank(2)
            if input_size.value is None:
                raise ValueError(
                    "Input size (second dimension of inputs[0]) must be accessible via "
                    "shape inference, but saw value None.")

        # Prefer the static batch size; fall back to the dynamic one.
        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(first_input)[0]

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        use_sequence_length = sequence_length is not None
        if use_sequence_length:
            sequence_length = math_ops.to_int32(sequence_length)
            # Tensors shared by every dynamic step.
            zero_output = array_ops.zeros(
                array_ops.pack([batch_size, cell.output_size]),
                first_input.dtype)
            zero_output.set_shape(
                tensor_shape.TensorShape(
                    [fixed_batch_size.value, cell.output_size]))
            min_sequence_length = math_ops.reduce_min(sequence_length)
            max_sequence_length = math_ops.reduce_max(sequence_length)

        for time, input_ in enumerate(inputs):
            if time > 0:
                vs.get_variable_scope().reuse_variables()

            # pylint: disable=cell-var-from-loop
            def call_cell():
                return cell(input_, state)
            # pylint: enable=cell-var-from-loop

            if use_sequence_length:
                (output, state) = _rnn_step(
                    time, sequence_length, min_sequence_length,
                    max_sequence_length, zero_output, state, call_cell)
            else:
                (output, state) = call_cell()

            outputs.append(output)

        return (outputs, state)
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=10):
    """Run an RNN decoder whose loop function maintains beam-search state.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      loop_function: If not None, from the second step on it is called as
        loop_function(i, prev, state, log_beam_probs, beam_path,
                      beam_symbols, path_lengthes, is_finished_beam)
        and must return (next_input, state, path_lengthes,
        is_finished_beam). `prev` is the previous cell output and `i` the
        step number; the three lists are created empty here and are only
        handed to loop_function, so presumably it appends to them -- verify
        against the loop function in use.
      scope: VariableScope for the created subgraph; defaults to
        "rnn_decoder".
      output_projection: Not used by this function.
      beam_size: Not used by this function.

    Returns:
      A tuple (beam_path, beam_symbols, state):
        beam_path: the entries of the beam_path list stacked along axis 1.
        beam_symbols: the entries of the beam_symbols list stacked along
          axis 1.
        state: the final cell state reshaped to
          [-1, last static dim of beam_path, last static dim of state].
    """
    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        prev = None
        log_beam_probs = []
        beam_path = []
        beam_symbols = []
        path_lengthes = None
        is_finished_beam = None

        use_loop = loop_function is not None
        for step, inp in enumerate(decoder_inputs):
            if use_loop and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    (inp, state, path_lengthes,
                     is_finished_beam) = loop_function(
                         step, prev, state, log_beam_probs, beam_path,
                         beam_symbols, path_lengthes, is_finished_beam)
            if step > 0:
                variable_scope.get_variable_scope().reuse_variables()
            output, state = cell(inp, state)
            if use_loop:
                prev = output

        # from time-major to batch_major
        beam_path = tf.stack(beam_path, axis=1)
        beam_symbols = tf.stack(beam_symbols, axis=1)
        # [batch*beam, state] -> [batch, beam, state]
        beam_dim = beam_path.get_shape().as_list()[-1]
        state_dim = state.get_shape().as_list()[-1]
        state = tf.reshape(state, [-1, beam_dim, state_dim])

    return beam_path, beam_symbols, state
def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         swap_memory=False, infer_shape=True, reverse=False, name=None):
    """scan on the list of tensors unpacked from `elems` on dimension 0.

    The simplest version of `scan` repeatedly applies the callable `fn` to a
    sequence of elements from first to last. The elements are made of the
    tensors unpacked from `elems` on dimension 0. The callable fn takes two
    tensors as arguments. The first argument is the accumulated value
    computed from the preceding invocation of fn. If `initializer` is None,
    `elems` must contain at least one element, and its first element is
    used as the initializer (its last element when `reverse=True`).

    Suppose that `elems` is unpacked into `values`, a list of tensors. The
    shape of the result tensor is
    `[len(values)] + fn(initializer, values[0]).shape`.
    If reverse=True, it's fn(initializer, values[-1]).shape.

    This method also allows multi-arity `elems` and accumulator. If `elems`
    is a (possibly nested) list or tuple of tensors, then each of these
    tensors must have a matching first (unpack) dimension. The second
    argument of `fn` must match the structure of `elems`.

    If no `initializer` is provided, the output structure and dtypes of `fn`
    are assumed to be the same as its input; and in this case, the first
    argument of `fn` must match the structure of `elems`.

    If an `initializer` is provided, then the output of `fn` must have the
    same structure as `initializer`; and the first argument of `fn` must
    match this structure.

    For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
    `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
    `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a
    list, `[acc_n1, acc_n2]`. An alternative correct signature for `fn`,
    and the one that works in `python3`, is:
    `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.

    Args:
      fn: The callable to be performed. It accepts two arguments. The first
        will have the same structure as `initializer` if one is provided,
        otherwise it will have the same structure as `elems`. The second
        will have the same (possibly nested) structure as `elems`. Its
        output must have the same structure as `initializer` if one is
        provided, otherwise it must have the same structure as `elems`.
      elems: A tensor or (possibly nested) sequence of tensors, each of
        which will be unpacked along their first dimension. The nested
        sequence of the resulting slices will be the second argument to
        `fn`.
      initializer: (optional) A tensor or (possibly nested) sequence of
        tensors, initial value for the accumulator, and the expected output
        type of `fn`.
      parallel_iterations: (optional) The number of iterations allowed to
        run in parallel.
      back_prop: (optional) True enables support for back propagation.
      swap_memory: (optional) True enables GPU-CPU memory swapping.
      infer_shape: (optional) False disables tests for consistent output
        shapes.
      reverse: (optional) True scans the tensor last to first (instead of
        first to last).
      name: (optional) Name prefix for the returned tensors.

    Returns:
      A tensor or (possibly nested) sequence of tensors. Each tensor packs
      the results of applying `fn` to tensors unpacked from `elems` along
      the first dimension, and the previous accumulator value(s), from first
      to last (or last to first, if `reverse=True`).

    Raises:
      TypeError: if `fn` is not callable or the structure of the output of
        `fn` and `initializer` do not match.
      ValueError: if the lengths of the output of `fn` and `initializer`
        do not match.

    Examples:
      ```python
      elems = np.array([1, 2, 3, 4, 5, 6])
      sum = scan(lambda a, x: a + x, elems)
      # sum == [1, 3, 6, 10, 15, 21]
      sum = scan(lambda a, x: a + x, elems, reverse=True)
      # sum == [21, 20, 18, 15, 11, 6]
      ```

      ```python
      elems = np.array([1, 2, 3, 4, 5, 6])
      initializer = np.array(0)
      sum_one = scan(
          lambda a, x: x[0] - x[1] + a, (elems + 1, elems), initializer)
      # sum_one == [1, 2, 3, 4, 5, 6]
      ```

      ```python
      elems = np.array([1, 0, 0, 0, 0, 0])
      initializer = (np.array(0), np.array(1))
      fibonaccis = scan(lambda a, _: (a[1], a[0] + a[1]), elems, initializer)
      # fibonaccis == ([1, 1, 2, 3, 5, 8], [1, 2, 3, 5, 8, 13])
      ```
    """
    if not callable(fn):
        raise TypeError("fn must be callable.")

    input_is_sequence = nest.is_sequence(elems)

    def input_flatten(x):
        return nest.flatten(x) if input_is_sequence else [x]

    def input_pack(x):
        return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]

    if initializer is None:
        # Without an initializer the accumulator mirrors the input structure.
        output_is_sequence = input_is_sequence
        output_flatten = input_flatten
        output_pack = input_pack
    else:
        output_is_sequence = nest.is_sequence(initializer)

        def output_flatten(x):
            return nest.flatten(x) if output_is_sequence else [x]

        def output_pack(x):
            return (nest.pack_sequence_as(initializer, x)
                    if output_is_sequence else x[0])

    elems_flat = input_flatten(elems)

    in_graph_mode = not context.executing_eagerly()
    with ops.name_scope(name, "scan", elems_flat):
        # TODO(akshayka): Remove the in_graph_mode check once caching devices
        # are supported in Eager
        varscope = None
        reset_caching_device = False
        if in_graph_mode:
            # Any get_variable calls in fn will cache the first call locally
            # and not issue repeated network I/O requests for each iteration.
            varscope = vs.get_variable_scope()
            if varscope.caching_device is None:
                # TODO(ebrevdo): Change to using colocate_with here and in
                # other methods.
                varscope.set_caching_device(lambda op: op.device)
                reset_caching_device = True

        try:
            # Convert elems to tensor array.
            elems_flat = [
                ops.convert_to_tensor(elem, name="elem")
                for elem in elems_flat]

            # Convert elems to tensor array. n may be known statically.
            n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
                 or array_ops.shape(elems_flat[0])[0])

            # TensorArrays are always flat
            elems_ta = [
                tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
                                             dynamic_size=False,
                                             infer_shape=True)
                for elem in elems_flat]
            # Unpack elements
            elems_ta = [
                elem_ta.unstack(elem)
                for elem_ta, elem in zip(elems_ta, elems_flat)]

            if initializer is None:
                # Seed with the first element in scan order and skip it in
                # the loop (the counter starts at 1).
                a_flat = [elem.read(n - 1 if reverse else 0)
                          for elem in elems_ta]
                i = constant_op.constant(1)
            else:
                initializer_flat = output_flatten(initializer)
                a_flat = [ops.convert_to_tensor(init)
                          for init in initializer_flat]
                i = constant_op.constant(0)

            # Create a tensor array to store the intermediate values.
            accs_ta = [
                tensor_array_ops.TensorArray(
                    dtype=init.dtype, size=n,
                    element_shape=init.shape if infer_shape else None,
                    dynamic_size=False,
                    infer_shape=infer_shape)
                for init in a_flat]

            if initializer is None:
                # The seed element is itself the first output.
                accs_ta = [acc_ta.write(n - 1 if reverse else 0, a)
                           for (acc_ta, a) in zip(accs_ta, a_flat)]

            def compute(i, a_flat, tas):
                """The loop body of scan.

                Args:
                  i: the loop counter.
                  a_flat: the accumulator value(s), flattened.
                  tas: the output accumulator TensorArray(s), flattened.

                Returns:
                  [next_i, a_flat, tas]: the updated counter + new
                  accumulator values + updated TensorArrays

                Raises:
                  TypeError: if initializer and fn() output structure do
                    not match
                  ValueError: if initializer and fn() output lengths do not
                    match
                """
                packed_elems = input_pack(
                    [elem_ta.read(i) for elem_ta in elems_ta])
                packed_a = output_pack(a_flat)
                a_out = fn(packed_a, packed_elems)
                nest.assert_same_structure(
                    elems if initializer is None else initializer, a_out)
                flat_a_out = output_flatten(a_out)
                tas = [ta.write(i, value)
                       for (ta, value) in zip(tas, flat_a_out)]
                if reverse:
                    next_i = i - 1
                else:
                    next_i = i + 1
                return (next_i, flat_a_out, tas)

            if reverse:
                initial_i = n - 1 - i
                condition = lambda i, _1, _2: i >= 0
            else:
                initial_i = i
                condition = lambda i, _1, _2: i < n
            _, _, r_a = control_flow_ops.while_loop(
                condition, compute, (initial_i, a_flat, accs_ta),
                parallel_iterations=parallel_iterations,
                back_prop=back_prop, swap_memory=swap_memory,
                maximum_iterations=n)

            results_flat = [r.stack() for r in r_a]

            n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
                elems_flat[0].get_shape().with_rank_at_least(1)[0]))
            for elem in elems_flat[1:]:
                # merge_with returns the merged dimension; keep it so that a
                # length known only from a later element is not lost.
                n_static = n_static.merge_with(
                    tensor_shape.Dimension(tensor_shape.dimension_value(
                        elem.get_shape().with_rank_at_least(1)[0])))
            for r in results_flat:
                r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
                    r.get_shape()[1:]))

            return output_pack(results_flat)
        finally:
            # Undo the temporary caching-device override even when `fn` or
            # the loop construction raises, so the enclosing variable scope
            # is never left in a modified state.
            if reset_caching_device:
                varscope.set_caching_device(None)
def build(self, input_shape):
    """Create the opaque parameter variable of the Cudnn RNN.

    May be invoked manually before `__call__()` or implicitly through it.
    Once the layer is marked as built, further calls return immediately
    without creating anything.

    Args:
      input_shape: network input tensor shape, a python list or a
        TensorShape object with 3 dimensions.

    Raises:
      ValueError: if input_shape does not have 3 dimensions or its last
        dimension is unknown.
    """
    if self.built:
        return

    input_shape = tensor_shape.TensorShape(input_shape)
    if input_shape.ndims != 3:
        raise ValueError("Expecting input_shape with 3 dims, got %d" %
                         input_shape.ndims)
    if input_shape[-1].value is None:
        raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                         "should be defined. Found `None`.")
    self._input_size = input_shape[-1].value
    self.input_spec = base_layer.InputSpec(
        ndim=3, axes={-1: self._input_size})

    self._set_scope(None)

    # Not using base class `add_variable()` since it calls
    # `tf.get_variable()` with a callable initializer, whereas here the
    # initializer is a tensor. The difference is mandated to support
    # forward-compatibility with Cudnn.
    with vs.variable_scope(
            self._scope,
            reuse=self.built,
            custom_getter=self._update_trainable_weights):
        # Fall back to default initializers when none were supplied.
        if self._kernel_initializer is None:
            self._kernel_initializer = init_ops.glorot_uniform_initializer(
                seed=self._seed, dtype=self._plain_dtype)
        if self._bias_initializer is None:
            self._bias_initializer = init_ops.constant_initializer(
                0.0, dtype=self._plain_dtype)

        # Build the canonical weights and biases, then convert them into
        # the single opaque parameter tensor.
        weights = []
        for shape in self.canonical_weight_shapes:
            weights.append(
                self._kernel_initializer(shape, dtype=self._plain_dtype))
        biases = []
        for shape in self.canonical_bias_shapes:
            biases.append(
                self._bias_initializer(shape, dtype=self._plain_dtype))
        opaque_params_t = self._canonical_to_opaque(weights, biases)

        if vs.get_variable_scope().partitioner is not None:
            logging.warn(
                "Partitioner is not supported for Cudnn RNN layer variables, using "
                "it will create forward-compatibility issues with future "
                "CUDA/CuDNN generations.")
        # Initialize opaque params with a tensor.
        self.kernel = vs.get_variable(
            "opaque_kernel",
            dtype=self._plain_dtype,
            initializer=opaque_params_t,
            validate_shape=False)

    # Create saveable in the outer scope of the cudnn subgraph, such that
    # alternative subgraph with platform-independent rnn cells can load the
    # checkpoints directly.
    if not self.built and vs.get_variable_scope().reuse is not True:
        self._create_saveable()
    self.built = True
def attention_isf_decoder(decoder_inputs,
                          initial_state,
                          attention_states,
                          isf_scores,
                          idf_scores,
                          locisf_scores,
                          cell,
                          output_size=None,
                          num_heads=1,
                          loop_function=None,
                          dtype=None,
                          scope=None,
                          initial_state_attention=False):
    """RNN attention decoder that also feeds ISF / IDF / local-ISF features.

    At every step the decoder input, the previous attention reads and the
    enabled score features are merged by a linear layer, the cell is run, a
    new attention read over `attention_states` is computed from the new
    state, and the output is a linear projection of the cell output, the
    attention reads and the same score features.

    Args:
      decoder_inputs: non-empty list of decoder input tensors, one per step.
        Each must have rank 2 with a known second dimension.
      initial_state: initial state passed to `cell`.
      attention_states: tensor attended over; dimensions 1 (attention length)
        and 2 (attention size) must be statically known.
      isf_scores: np array with ISF scores (not a tensor) (normalized or
        not); indexed by decoding step, used when `FLAGS.use_isf` is set.
      idf_scores: per-step scores, indexed like `isf_scores`; used when
        `FLAGS.use_idf` is set.
      locisf_scores: per-step scores, indexed like `isf_scores`; used when
        `FLAGS.use_locisf` is set.
      cell: callable `cell(inputs, state) -> (output, state)`.
      output_size: size of the projected outputs; defaults to
        `cell.output_size`.
      num_heads: number of attention heads (at least 1).
      loop_function: if not None, called as `loop_function(prev, i)` to
        produce the input of step `i` from the previous output instead of
        using `decoder_inputs[i]` (step 0 always uses `decoder_inputs[0]`).
      dtype: dtype of the zero-initialized attention reads. NOTE(review):
        the default None is passed straight to `array_ops.zeros` -- confirm
        callers always supply a dtype.
      scope: variable scope name; defaults to "attention_ifsscore_decoder".
      initial_state_attention: if True, the initial attention reads are
        computed from `initial_state` instead of being zeros.

    Returns:
      A tuple `(outputs, state)`: the list of per-step projected outputs and
      the final cell state.

    Raises:
      ValueError: if `decoder_inputs` is empty, `num_heads` is below 1,
        dimension 1 or 2 of `attention_states` is unknown, or the size of a
        decoder input cannot be inferred.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    # Dimensions 1 AND 2 are both required below (reshape and variable
    # shapes), so the slice has to cover both of them.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_ifsscore_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        # Both values are statically known thanks to the guard above.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for head in range(num_heads):
            k = variable_scope.get_variable(
                "AttnW_%d" % head, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % head,
                                            [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            for head in range(num_heads):
                with variable_scope.variable_scope("Attention_%d" % head):
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[head] * math_ops.tanh(hidden_features[head] + y),
                        [2, 3])
                    attn_weights = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(attn_weights,
                                          [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [
            array_ops.zeros(batch_attn_size, dtype=dtype)
            for _ in range(num_heads)
        ]
        for attn in attns:
            # Ensure the second shape of attention vectors is set.
            attn.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)

            # Optional per-step score features, selected by command-line
            # flags.
            extra_feats = []
            if FLAGS.use_locisf:
                extra_feats.append(locisf_scores[i])
            if FLAGS.use_isf:
                extra_feats.append(isf_scores[i])
            if FLAGS.use_idf:
                extra_feats.append(idf_scores[i])

            x = linear([inp] + attns + extra_feats, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                # The attention variables were already created by the
                # initial-state read above, so they must be reused here.
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns + extra_feats,
                                output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
def model_with_buckets(encoder_inputs,
                       encoder_mask,
                       decoder_inputs,
                       targets,
                       weights,
                       buckets,
                       seq2seq,
                       softmax_loss_function=None,
                       per_example_loss=False,
                       name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq
        input.
      encoder_mask: the mask of encoder inputs that labels where the PADs are.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq
        input.
      targets: A list of 1D batch-sized int32 Tensors (desired output
        sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A list of pairs of (input size, output size) for each bucket.
        The last bucket is treated as the largest one.
      seq2seq: A sequence-to-sequence model function; it is called as
        `seq2seq(encoder_inputs, encoder_mask, decoder_inputs)` and must
        return a 3-tuple whose first element is the outputs and whose third
        element is the decoded symbols.
      softmax_loss_function: Function (inputs-batch, labels-batch) ->
        loss-batch to be used instead of the standard softmax (the default if
        this is None).
      per_example_loss: Boolean. If set, the returned loss will be a
        batch-sized tensor of losses for each sequence in the batch. If unset,
        it will be a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to
        "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses, symbols), where:
        outputs: The outputs for each bucket. Its j'th element consists of a
          list of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth
          outputs).
        losses: List of scalar Tensors, representing losses for each bucket,
          or, if per_example_loss is set, a list of 1D batch-sized float
          Tensors.
        symbols: The final translation result obtained from beam search, one
          entry per bucket.

    Raises:
      ValueError: If length of encoder_inputs, targets, or weights is smaller
        than the largest (last) bucket.
    """
    max_encoder_len, max_decoder_len = buckets[-1][0], buckets[-1][1]
    if len(encoder_inputs) < max_encoder_len:
        raise ValueError(
            "Length of encoder_inputs (%d) must be at least that of last "
            "bucket (%d)." % (len(encoder_inputs), max_encoder_len))
    if len(targets) < max_decoder_len:
        raise ValueError(
            "Length of targets (%d) must be at least that of last "
            "bucket (%d)." % (len(targets), max_decoder_len))
    if len(weights) < max_decoder_len:
        raise ValueError(
            "Length of weights (%d) must be at least that of last "
            "bucket (%d)." % (len(weights), max_decoder_len))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    symbols = []  # to save the output of beam search
    loss_fn = sequence_loss_by_example if per_example_loss else sequence_loss

    with ops.name_scope(name, "model_with_buckets", all_inputs):
        for j, bucket in enumerate(buckets):
            encoder_size, decoder_size = bucket[0], bucket[1]
            # Every bucket after the first shares the variables created by
            # the first one.
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _, bucket_symbols = seq2seq(
                    encoder_inputs[:encoder_size], encoder_mask,
                    decoder_inputs[:decoder_size])
                outputs.append(bucket_outputs)
                symbols.append(bucket_symbols)
                losses.append(
                    loss_fn(outputs[-1],
                            targets[:decoder_size],
                            weights[:decoder_size],
                            softmax_loss_function=softmax_loss_function))

    return outputs, losses, symbols
def time_aware_multihead_attention(
        self,
        queries,
        keys,
        key_length,
        query_length,
        t_querys,
        t_keys,
        t_querys_length,
        t_keys_length,
        num_units=None,
        num_heads=8,
        dropout_rate=0,
        is_training=True,
        scope="multihead_attention",
        reuse=None,
):
    '''Applies multihead attention modulated by a learned time-decay gate.

    The query-key scores of every head are multiplied by the sigmoid of a
    gate built from (a) the log of the absolute time difference between each
    query and key and (b) a bilinear query/key interaction, before scaling,
    key masking and the softmax.

    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k]. It is also multiplied
        directly against the `num_units`-sized projected queries, so
        presumably C_k must equal `num_units` -- TODO confirm.
      key_length: A 1d tensor with shape of [N]; number of valid keys.
      query_length: A 1d tensor with shape of [N]; number of valid queries.
      t_querys: time values of the queries; replicated `t_keys_length` times
        along a new axis 2 (presumably [N, T_q] -- TODO confirm).
      t_keys: time values of the keys; replicated `t_querys_length` times
        along a new axis 1 (presumably [N, T_k] -- TODO confirm).
      t_querys_length: python int used as a static size (variable shapes and
        list replication); presumably equal to T_q.
      t_keys_length: python int used as a static size; presumably equal to
        T_k.
      num_units: A scalar. Attention size. Defaults to the last dimension of
        `queries`.
      num_heads: An int. Number of heads.
      dropout_rate: A floating point number. Currently unused: every dropout
        call in this function is disabled.
      is_training: Boolean. Currently unused, see `dropout_rate`.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name. Note that the dense projections are created before
        this scope is entered.

    Returns
      A tuple `(outputs, att_vec)`: `outputs` is a 3d tensor with shape of
      (N, T_q, C) and `att_vec` holds the masked attention weights with
      shape (h*N, T_q, T_k).
    '''
    # Resolve the fall back for num_units BEFORE it is consumed by the dense
    # projections below; otherwise the documented default can never work.
    if num_units is None:
        num_units = queries.get_shape().as_list()[-1]

    # Linear projections, C = # dim or column, T_x = # vectors or actions
    Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
    K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
    V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)

    with tf.variable_scope(scope, reuse=reuse):
        # Time decay gate parameters. The current scope is re-entered (twice)
        # exactly as in the original code before the variables are created.
        current_scope = variable_scope.get_variable_scope()
        with variable_scope.variable_scope(current_scope,
                                           reuse=None) as unit_scope:
            with variable_scope.variable_scope(unit_scope):
                time_input_w = variable_scope.get_variable(
                    "_time_input_w",
                    shape=[num_units, num_units],
                    dtype=queries.dtype)
                time_input_w1 = variable_scope.get_variable(
                    "_time_input_w1",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)
                time_input_b1 = variable_scope.get_variable(
                    "_time_input_b1",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)
                time_output_w1 = variable_scope.get_variable(
                    "time_output_w1",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)
                time_output_w2 = variable_scope.get_variable(
                    "time_output_w2",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)
                # Not used in the gate below, but still created so that the
                # set of model variables (and checkpoints) stays unchanged.
                time_output_w3 = variable_scope.get_variable(
                    "time_output_w3",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)
                time_output_b = variable_scope.get_variable(
                    "time_output_b",
                    shape=[t_querys_length, t_keys_length],
                    dtype=queries.dtype)

        # Bilinear query/key interaction: tanh((Q . W) . keys^T).
        time_query_key = math_ops.tensordot(Q, time_input_w, [[2], [0]])
        time_query_key = tf.matmul(time_query_key,
                                   keys,
                                   transpose_b=True,
                                   name='2')
        time_query_key = tf.nn.tanh(time_query_key)

        # Replicate the timestamps so that entry [n, i, j] pairs query i
        # with key j.
        t_querys = tf.stack([t_querys] * t_keys_length, axis=2)
        t_keys = tf.stack([t_keys] * t_querys_length, axis=1)

        # decay = tanh(w1 * log(|t_q - t_k| + 1) + b1)
        decay = tf.log(tf.add(tf.abs(tf.subtract(t_querys, t_keys)), 1))
        decay = tf.nn.tanh(decay * time_input_w1 + time_input_b1)

        decay_gate = (time_output_w1 * decay +
                      time_output_w2 * time_query_key + time_output_b)
        # Experiment notes kept from the original author (the source does not
        # label the metrics):
        #   sigmoid -> exp decay          0.145  0.067
        #   relu, sigmoid                 0.150  0.729
        #   relu -> exp decay             0.1423 0.0676
        #   relu -> sigmoid +             0.156
        #   relu, sigmoid, w1*decay + w2*time_query_key + b   0.50 0.68

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        # The same gate is shared by every head.
        decay_gate_ = tf.concat([decay_gate] * num_heads, axis=0)

        # Multiplication
        # query-key score matrix
        # each big score matrix is then split into h score matrix with same
        # size w.r.t. different part of the feature
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]),
                            name='3')  # (h*N, T_q, T_k)
        outputs *= tf.nn.sigmoid(decay_gate_)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

        # Key Masking
        key_masks = tf.sequence_mask(key_length, tf.shape(keys)[1])  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        # Masked-out keys get a very negative score so that the softmax
        # gives them (almost) zero weight.
        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(key_masks, outputs, paddings)  # (h*N, T_q, T_k)

        # Causality = Future blinding: No use, removed

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sequence_mask(query_length,
                                       tf.shape(queries)[1],
                                       dtype=tf.float32)  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # (h*N, T_q, T_k)

        # Debug output of the static shapes, kept from the original code.
        print(outputs.shape.as_list())
        print(query_masks.shape.as_list())

        # Attention vector
        att_vec = outputs

        # Weighted sum
        outputs = tf.matmul(outputs, V_, name='4')  # ( h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0),
                            axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        outputs = self.normalize(outputs)  # (N, T_q, C)

    return outputs, att_vec
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look
    up information in the additional tensor attention_states, and it does this
    by focusing on a few entries from the tensor. This model has proven to
    yield especially good results in a number of sequence-to-sequence tasks.
    This implementation is based on http://arxiv.org/abs/1409.0473.

    Args:
      encoder_mask: the mask of encoder inputs [batch_size x attn_length].
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        Dimensions 1 and 2 must be statically known.
      cell: rnn_cell.RNNCell defining the cell function and size.
      beam_size: the beam size of beam search.
      output_size: Size of the output vectors before the maxout reduction; if
        None, we use cell.output_size. The maxout halves this size.
      num_layers: number of equal pieces the state is split into along
        axis 1; the last piece is the query used for attention.
      loop_function: When decoding, this function will be applied to i-th
        output in order to generate i+1-th input. The generation is by beam
        search. It is called as
        `loop_function(prev, prev_probs, beam_size, i)` and must return
        `(inp, prev_probs, index, prev_symbol)`.
      dtype: The dtype to use for the initial attention vector
        (default: tf.float32).
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".
      initial_state_attention: If False (default), initial attentions are
        zero. If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors.
          These represent the generated outputs. Output i is computed from
          input i (which is either the i-th element of decoder_inputs or
          comes from loop_function(output {i-1}, ...)).
        state: The state of each decoder cell at the final time-step.
        symbols: When training, it is []; when decoding, it is the best
          translation generated by beam search.
      When loop_function is set, every returned tensor is reduced to the
      first (best) beam entry.

    Raises:
      ValueError: when shapes of attention_states are not set, or
        decoder_inputs is empty.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Dimensions 1 AND 2 are both needed below (attn_size // 2, reshape), so
    # the slice has to cover both of them.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # compute the initial hidden state of decoder
        initial_state = math_ops.tanh(
            linear(initial_state,
                   state_size,
                   False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            attn_w = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(0,
                                                               0.001,
                                                               seed=SEED))
            hidden_features = nn_ops.conv2d(hidden, attn_w, [1, 1, 1, 1],
                                            "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(query):
                    # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)
                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query,
                        attention_vec_size,
                        False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # the additive attention is computed by v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                # Softmax with mask: subtract the row maximum for numerical
                # stability, exponentiate, zero the padded positions and
                # renormalize each row.
                s = array_ops.transpose(
                    array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                s = math_ops.exp(s)
                s = math_ops.to_float(encoder_mask) * s
                a = array_ops.transpose(
                    array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                # Attention-weighted sum of the encoder states.
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                    [1, 2])
                ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        output = None
        state = initial_state
        out_state = array_ops.split(state, num_layers, 1)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for attn in attns:
            # Ensure the second shape of attention vectors is set.
            attn.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    # Re-order everything kept from earlier steps so that it
                    # follows the beam entries selected by `index`.
                    out_state = array_ops.gather(out_state, index)
                    state = array_ops.gather(state, index)
                    attns = [array_ops.gather(attn, index) for attn in attns]
                    for j, prev_output in enumerate(outputs):
                        outputs[j] = array_ops.gather(prev_output, index)
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(symbol, index)
                    symbols.append(prev_symbol)

            # Run the attention mechanism (at step 0 only when requested).
            if i > 0 or initial_state_attention:
                attns = attention(out_state, scope="attention")

            # Run the RNN.
            # concatenate next input and the context vector
            cinp = array_ops.concat([inp, attns[0]], 1)
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                # maxout: keep the larger value of each adjacent pair.
                output = array_ops.reshape(output,
                                           [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # handle the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)
            state = array_ops.gather(state, index)
            for j, prev_output in enumerate(outputs):
                outputs[j] = array_ops.gather(prev_output, index)
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, index)
            symbols.append(prev_symbol)

            # output the best result of beam search
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(
                array_ops.gather(out_state, 0), 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, prev_output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(
                    array_ops.gather(prev_output, 0), 0)

    return outputs, state, symbols
def embedding_lookup_sparse(
    params,
    sp_ids,
    sp_weights,
    partition_strategy=None,  # not used
    name="embedding_lookup_sparse",
    combiner="mean",
    max_norm=None,
    return_trainable=False,
):
    """Dynamic-embedding counterpart of tf.nn.embedding_lookup_sparse.

    This op assumes that there is at least one id for each row in the dense
    tensor represented by sp_ids (i.e. there are no rows with empty
    features), and that all the indices of sp_ids are in canonical row-major
    order.

    It also assumes that all id values lie in the range [0, p0), where p0
    is the sum of the size of params along dimension 0.

    Args:
      params: A single `dynamic_embedding.Variable` instance representing
        the complete embedding tensor.
      sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch
        size and M is arbitrary.
      sp_weights: either a `SparseTensor` of float / double weights, or
        `None` to indicate all weights should be taken to be 1. If specified,
        `sp_weights` must have exactly the same shape and indices as
        `sp_ids`.
      partition_strategy: Not used; it is only forwarded to
        `embedding_lookup`.
      name: a name for the operation. Name is optional in graph mode and
        required in eager mode.
      combiner: A string specifying the reduction op. Currently "mean",
        "sqrtn" and "sum" are supported. "sum" computes the weighted sum of
        the embedding results for each row. "mean" is the weighted sum
        divided by the total weight. "sqrtn" is the weighted sum divided by
        the square root of the sum of the squares of the weights.
      max_norm: If not `None`, each embedding is clipped if its l2-norm is
        larger than this value, before combining.
      return_trainable: optional, If True, also return the TrainableWrapper
        created by `dynamic_embedding.embedding_lookup`.

    Returns:
      combined_embeddings: A dense tensor representing the combined
        embeddings for the sparse ids. For each row in the dense tensor
        represented by `sp_ids`, the op looks up the embeddings for all ids
        in that row, multiplies them by the corresponding weight, and
        combines these embeddings as specified.

        In other words, if

          `shape(combined params) = [+infinity, dim]`

        and

          `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`

        then

          `shape(output) = [d0, dim]`.

        For instance, if params dim=20, and sp_ids / sp_weights are

          ```python
          [0, 0]: id 1, weight 2.0
          [0, 1]: id 3, weight 0.5
          [1, 0]: id 0, weight 1.0
          [2, 3]: id 1, weight 3.0
          ```

        with `combiner`="mean", then the output will be a 3x20 matrix where

          ```python
          output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
          output[1, :] = (params[0, :] * 1.0) / 1.0
          output[2, :] = (params[1, :] * 3.0) / 3.0
          ```
      trainable_wrap: A TrainableWrapper object used to fill the Optimizers
        `var_list`. Only provided if `return_trainable` is True.

    Raises:
      TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
        neither `None` nor `SparseTensor`.
      ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
    """
    if combiner not in ("mean", "sqrtn", "sum"):
        raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
    if not isinstance(sp_ids, sparse_tensor.SparseTensor):
        raise TypeError("sp_ids must be SparseTensor")
    has_weights = sp_weights is not None
    if has_weights and not isinstance(sp_weights, sparse_tensor.SparseTensor):
        raise TypeError("sp_weights must be either None or SparseTensor")

    # Prefix the op name scope with the enclosing variable scope, if any.
    enclosing_scope_name = variable_scope.get_variable_scope().name
    if enclosing_scope_name:
        op_scope = enclosing_scope_name + "/" + name
    else:
        op_scope = name

    with ops.name_scope(op_scope + "/"):
        # The row index of every sparse entry selects its output segment.
        row_ids = sp_ids.indices[:, 0]
        if row_ids.dtype != dtypes.int32:
            row_ids = math_ops.cast(row_ids, dtypes.int32)

        # Each distinct id is looked up once; `positions` maps every
        # original entry back to its slot among the unique ids.
        unique_ids, positions = array_ops.unique(sp_ids.values)

        embeddings, trainable_ = embedding_lookup(
            params,
            unique_ids,
            name=name + "/embedding_lookup",
            partition_strategy=partition_strategy,
            max_norm=max_norm,
            return_trainable=True,
        )
        if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
            embeddings = math_ops.cast(embeddings, dtypes.float32)

        if not has_weights:
            assert positions is not None
            if combiner == "sum":
                embeddings = de.math.sparse_segment_sum(embeddings,
                                                        positions,
                                                        row_ids,
                                                        name=name)
            elif combiner == "mean":
                embeddings = math_ops.sparse_segment_mean(embeddings,
                                                          positions,
                                                          row_ids,
                                                          name=name)
            elif combiner == "sqrtn":
                embeddings = math_ops.sparse_segment_sqrt_n(embeddings,
                                                            positions,
                                                            row_ids,
                                                            name=name)
            else:
                assert False, "Unrecognized combiner"
        else:
            weights = sp_weights.values
            if weights.dtype != embeddings.dtype:
                weights = math_ops.cast(weights, embeddings.dtype)

            # Expand the unique embeddings back to one row per sparse entry.
            embeddings = array_ops.gather(embeddings, positions)

            # Reshape weights to allow broadcast: append one size-1 axis per
            # trailing embedding axis.
            trailing_ones = array_ops.fill(
                array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
            broadcast_shape = array_ops.concat(
                [array_ops.shape(weights), trailing_ones], 0)

            static_weights_shape = weights.get_shape()
            weights = array_ops.reshape(weights, broadcast_shape)

            # The dynamic reshape loses the static shape, so restore it
            # whenever the rank of the embeddings is known.
            embedding_rank = embeddings.get_shape().ndims
            if embedding_rank is not None:
                weights.set_shape(
                    static_weights_shape.concatenate(
                        [1] * (embeddings.get_shape().ndims - 1)))

            embeddings *= weights

            if combiner == "sum":
                embeddings = math_ops.segment_sum(embeddings,
                                                  row_ids,
                                                  name=name)
            elif combiner == "mean":
                embeddings = math_ops.segment_sum(embeddings, row_ids)
                total_weight = math_ops.segment_sum(weights, row_ids)
                embeddings = math_ops.div(embeddings, total_weight, name=name)
            elif combiner == "sqrtn":
                embeddings = math_ops.segment_sum(embeddings, row_ids)
                squared_weights = math_ops.pow(weights, 2)
                total_squared = math_ops.segment_sum(squared_weights, row_ids)
                norm = math_ops.sqrt(total_squared)
                embeddings = math_ops.div(embeddings, norm, name=name)
            else:
                assert False, "Unrecognized combiner"

        if return_trainable:
            return (embeddings, trainable_)
        return embeddings
def pointer_decoder(decoder_inputs,
                    initial_state,
                    attention_states,
                    ori_encoder_inputs,
                    cell,
                    feed_prev=False,
                    dtype=dtypes.float32,
                    scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.

    At each step the input is merged with a (constant, all-zero) attention
    vector by a linear layer, the cell is run, and the un-normalized pointer
    scores over the attention positions are emitted:
      cell_output, new_state = cell(linear(input, attn), prev_state)
      output = V^T * tanh(W * attention_states + U * new_state)

    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        Dimensions 1 and 2 must be statically known.
      ori_encoder_inputs: list of the original encoder input tensors. Only
        used when `feed_prev` is True: it is stacked, transposed to batch
        major and reshaped to [-1, attn_length, input_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      feed_prev: if True, every input after the first is replaced by the
        average of the encoder inputs weighted by the softmax of the
        previous step's pointer scores (gradients are stopped).
      dtype: The dtype to use for the zero attention vector
        (default: tf.float32).
      scope: VariableScope for the created subgraph; default:
        "point_decoder".

    Returns:
      A tuple (outputs, states, inps):
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          holding the pointer scores (before any softmax) of each step.
        states: The state of each decoder cell in each time-step, preceded
          by `initial_state`, so its length is len(decoder_inputs) + 1.
        inps: the inputs that were generated from previous outputs; empty
          unless `feed_prev` is True.

    Raises:
      ValueError: if `decoder_inputs` is empty or dimension 1 or 2 of
        `attention_states` is unknown.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Dimensions 1 AND 2 are both needed below (reshape, variable shapes), so
    # the slice has to cover both of them.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Pointer scores are v^T * tanh(...); the softmax is left to
                # the caller.
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        # This attention vector is never updated inside the loop; it stays
        # all zeros for every step.
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])

        inps = []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if feed_prev and i > 0:
                # Build the next input as the encoder inputs averaged with
                # the softmax of the previous pointer scores (`output` from
                # the previous iteration).
                inp = tf.stack(ori_encoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(
                    inp * tf.reshape(tf.nn.softmax(output),
                                     [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally.

            # Merge input and previous attentions into one vector of the
            # right size.
            x = core_rnn_cell_impl._linear([inp, attns], cell.output_size,
                                           True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)
            outputs.append(output)

    return outputs, states, inps
def _GetBatchNormParams(graph, context, has_scaling):
  """Extracts relevant tensors for folding batch norms.

  Each tensor is located with `_FindMatchingTensor` by the op-name suffix
  that unfused batch norm gives it, restricted to the given context.

  Args:
    graph: Graph to inspect.
    context: The scope under which we look for batch norm params
    has_scaling: Bool that specifies if scaling is done as part of batch norm.
      When False, gamma is replaced by a tensor of ones shaped like the
      moving mean.

  Returns:
    _BatchNormMatch containing all required batch norm parameters. Fields
    that are not looked up here (layer_op, bn_op, output_tensor,
    input_tensor, weight_tensor, beta_tensor, batch_to_space_op) are None.

  Raises:
    ValueError: if neither the batch mean nor the moving mean tensor can be
      found for the context.
  """
  # TODO(raghuramank) This code relies on string matching and needs to be
  # updated if unfused batch norm continues to be widely used
  # Matching variable names is brittle and relies on scoping
  # conventions. Fused batch norm folding is more robust. Support for unfused
  # batch norms will be deprecated as we move forward. Fused batch norms allow
  # for faster training and should be used whenever possible.
  # context contains part of the names of the tensors we are interested in:
  # For MobilenetV1, the context has repetitions:
  # MobilenetV1/MobilenetV1/Conv2d_3_depthwise
  # when the moving_mean tensor has the name:
  # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read
  # To pick the correct variable name, it is necessary to ignore the repeating
  # header.

  # For MobilenetV2, this problem does not exist:
  # The context is: MobilenetV2/expanded_conv_3/depthwise
  # and the names of the tensors start with a single MobilenetV2
  # The moving mean for example, has the name:
  # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
  # We identify the best match for an op by checking for
  # 1. The suffix of the op is exactly matched
  # 2. Maximum number of matches with the context. The matching
  # score is given by the number of parts of context (split by /) that
  # are present in the parts of the tensor name (again split by /).
  # For example: scope= MobilenetV2/MobilenetV2/expanded_conv_3 and
  # op.name = MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
  # will have 2 matches, scope with a different conv layer will have one match.

  op_suffix_mean = 'BatchNorm/moments/Squeeze'
  op_suffix_variance = 'BatchNorm/moments/Squeeze_1'
  op_suffix_epsilon = 'BatchNorm/batchnorm_1/add/y'
  op_suffix_bn_decay_mean = 'BatchNorm/AssignMovingAvg/decay'
  op_suffix_bn_decay_var = 'BatchNorm/AssignMovingAvg_1/decay'

  # Variable reads are named differently under resource variables.
  if variable_scope.get_variable_scope().use_resource:
    op_suffix_gamma = 'BatchNorm/gamma/Read/ReadVariableOp'
    op_suffix_moving_variance = (
        'BatchNorm/moving_variance/Read/ReadVariableOp')
    op_suffix_moving_mean = ('BatchNorm/moving_mean/Read/ReadVariableOp')
  else:
    op_suffix_gamma = 'BatchNorm/gamma'
    op_suffix_moving_variance = 'BatchNorm/moving_variance/read'
    op_suffix_moving_mean = 'BatchNorm/moving_mean/read'

  # Parse through list of ops to find relevant ops
  batch_mean_tensor = _FindMatchingTensor(graph, op_suffix_mean, context)
  batch_variance_tensor = _FindMatchingTensor(graph, op_suffix_variance,
                                              context)
  moving_mean_tensor = _FindMatchingTensor(graph, op_suffix_moving_mean,
                                           context)
  moving_variance_tensor = _FindMatchingTensor(graph,
                                               op_suffix_moving_variance,
                                               context)
  batch_epsilon = _FindMatchingTensor(graph, op_suffix_epsilon, context)
  bn_decay_mean_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_mean,
                                             context)
  bn_decay_var_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_var,
                                            context)

  # Without either mean there is nothing to fold; fail loudly instead of
  # returning a match full of None tensors.
  if batch_mean_tensor is None and moving_mean_tensor is None:
    raise ValueError('Error folding unfused batch norms')

  if has_scaling:
    gamma_tensor = _FindMatchingTensor(graph, op_suffix_gamma, context)
  else:
    # No learned scale: use a gamma of ones shaped like the moving mean.
    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)

  return _BatchNormMatch(
      layer_op=None,
      bn_op=None,
      output_tensor=None,
      input_tensor=None,
      weight_tensor=None,
      gamma_tensor=gamma_tensor,
      beta_tensor=None,
      mean_tensor=batch_mean_tensor,
      variance_tensor=batch_variance_tensor,
      moving_mean_tensor=moving_mean_tensor,
      moving_variance_tensor=moving_variance_tensor,
      bn_decay_mean_tensor=bn_decay_mean_tensor,
      bn_decay_var_tensor=bn_decay_var_tensor,
      batch_epsilon=batch_epsilon,
      batch_to_space_op=None)
def actrgn_rnn_decoder(decoder_inputs,
                       initial_state,
                       initial_attn_output,
                       cell,
                       attn_dim,
                       lstm_dim,
                       loop_function=None,
                       scope=None):
  """RNN decoder that attends over the regions of every step input.

  At each step the previous cell output (initially `initial_attn_output`) and
  every region of the step input are projected to `attn_dim`, combined through
  a tanh, and scored. The scores are normalized with a softmax over the
  regions, and the cell is run on the attention-weighted sum of the regions.

  Args:
    decoder_inputs: A list of 3D Tensors [batch_size x num_regions x attn_dim].
      The batch size must be statically known because it is used to tile the
      projection weights.
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    initial_attn_output: 2D Tensor [batch_size x lstm_dim] used in place of
      the previous cell output at the first step.
    cell: core_rnn_cell.RNNCell defining the cell function and size. Its
      output is multiplied by a [lstm_dim x attn_dim] matrix, so its output
      size must be lstm_dim.
    attn_dim: Size of the region features and of the attention space.
    lstm_dim: Size of the cell output.
    loop_function: If not None, this function will be applied to the i-th
      output in order to generate the i+1-st input, and decoder_inputs will be
      ignored, except for the first element ("GO" symbol). This can be used
      for decoding, but also for training to emulate
      http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next replaces the step input, so it presumably has to have the same
          3D layout as the elements of decoder_inputs.
    scope: VariableScope for the created subgraph; defaults to
      "actrgn_rnn_decoder".

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing generated outputs.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
        (Note that in some cases, like basic RNN cell or GRU cell, outputs and
         states can be the same. They are different for LSTM cells though.)
  """
  with variable_scope.variable_scope(scope or "actrgn_rnn_decoder"):
    state = initial_state
    output = initial_attn_output
    outputs = []
    prev = None

    # Projection of the previous cell output into the attention space.
    w_l = variable_scope.get_variable(
        name='lstm_to_attn_w', shape=[lstm_dim, attn_dim], dtype=tf.float32)
    b_l = variable_scope.get_variable(
        name='lstm_to_attn_b', shape=[attn_dim], dtype=tf.float32)
    # Projection of every input region into the attention space.
    w_i = variable_scope.get_variable(
        name='ip_to_attn_w', shape=[attn_dim, attn_dim], dtype=tf.float32)
    b_i = variable_scope.get_variable(
        name='ip_to_attn_b', shape=[attn_dim], dtype=tf.float32)
    # Scalar score of every region.
    w_f = variable_scope.get_variable(
        name='attn_to_prob_w', shape=[attn_dim, 1], dtype=tf.float32)
    b_f = variable_scope.get_variable(
        name='attn_to_prob_b', shape=[1], dtype=tf.float32)

    for i, inp in enumerate(decoder_inputs):
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i)
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()

      attn_state = tf.matmul(output, w_l) + b_l
      # Batched matmul: the weights are tiled once per batch element.
      context_state = tf.matmul(
          inp,
          tf.tile(tf.expand_dims(w_i, 0), [int(inp.shape[0]), 1, 1])) + b_i
      context_state = context_state + tf.expand_dims(attn_state, 1)
      context_state = tf.tanh(context_state)

      # Region scores have shape [batch_size x num_regions x 1].
      region_scores = tf.matmul(
          context_state,
          tf.tile(tf.expand_dims(w_f, 0),
                  [int(context_state.shape[0]), 1, 1])) + b_f
      # Drop the singleton axis BEFORE the softmax so that the normalization
      # runs over the regions. A softmax over the size-1 axis would yield
      # weights that are all exactly 1. Squeezing only axis 2 also keeps the
      # batch/region axes when either of them has size 1.
      attn_prob = tf.nn.softmax(tf.squeeze(region_scores, axis=[2]))

      inp_rnn = tf.reduce_sum(
          tf.multiply(inp, tf.expand_dims(attn_prob, 2)), 1)
      output, state = cell(inp_rnn, state)
      outputs.append(output)
      if loop_function is not None:
        prev = output

  return outputs, state