def testAggregateGradients(self):
    """Compare an eagerly computed aggregated gradient with graph mode.

    The differentiated function reads the same input through three row
    lookups and one full reduction, so the gradient with respect to the
    input has to combine several contributions.
    """

    def fn(x):
        index_tensors = [
            tensor.Tensor(np.array(rows)) for rows in ([0, 1], [2, 3], [1, 3])
        ]
        # A mixture of IndexedSlices and dense tensor to aggregate.
        first, second, third = [
            embedding_ops.embedding_lookup(x, index) for index in index_tensors
        ]
        dense_term = math_ops.reduce_sum(x * tensor.Tensor(2.0))
        return first * second * third * dense_term

    var_np = np.random.rand(4, 2).astype(np.float32)
    grad_fn = backprop.gradients_function(fn, [0])
    eager_grad = grad_fn(tensor.Tensor(var_np))[0]

    # Rebuild the same computation as a graph and densify its gradient so
    # the two results can be compared element-wise.
    with context.graph_mode(), self.test_session():
        graph_var = array_ops.constant(var_np, dtypes.float32)
        graph_indices = [
            array_ops.constant(rows) for rows in ([0, 1], [2, 3], [1, 3])
        ]
        graph_lookups = [
            embedding_ops.embedding_lookup(graph_var, index)
            for index in graph_indices
        ]
        graph_sum = math_ops.reduce_sum(
            graph_var * 2.0, reduction_indices=(0, 1))
        graph_y = (
            graph_lookups[0] * graph_lookups[1] * graph_lookups[2] * graph_sum)
        sparse_grad = gradients.gradients(graph_y, [graph_var])[0]
        dense_grad = math_ops.unsorted_segment_sum(
            sparse_grad.values, sparse_grad.indices,
            sparse_grad.dense_shape[0])
        self.assertAllClose(eager_grad.numpy(), dense_grad.eval())
def testAggregateGradients(self):
    """Check `gradients_function` against `gradients.gradients`.

    The reference comparison is only built when not executing eagerly.
    """

    def fn(x):
        index_tensors = [
            constant_op.constant(np.array(rows))
            for rows in ([0, 1], [2, 3], [1, 3])
        ]
        # A mixture of IndexedSlices and dense tensor to aggregate.
        first, second, third = [
            embedding_ops.embedding_lookup(x, index) for index in index_tensors
        ]
        dense_term = math_ops.reduce_sum(x * constant_op.constant(2.0))
        return first * second * third * dense_term

    var_np = np.random.rand(4, 2).astype(np.float32)
    grad_fn = backprop.gradients_function(fn, [0])
    computed = grad_fn(constant_op.constant(var_np))[0]
    computed = self.evaluate(ops.convert_to_tensor(computed))

    if context.executing_eagerly():
        # Nothing to compare against in eager mode.
        return

    graph_var = array_ops.constant(var_np, dtypes.float32)
    graph_indices = [
        array_ops.constant(rows) for rows in ([0, 1], [2, 3], [1, 3])
    ]
    graph_lookups = [
        embedding_ops.embedding_lookup(graph_var, index)
        for index in graph_indices
    ]
    graph_sum = math_ops.reduce_sum(graph_var * 2.0, axis=(0, 1))
    graph_y = (
        graph_lookups[0] * graph_lookups[1] * graph_lookups[2] * graph_sum)
    sparse_grad = gradients.gradients(graph_y, [graph_var])[0]
    # Sum the slices that hit the same row so the result is dense.
    dense_grad = math_ops.unsorted_segment_sum(
        sparse_grad.values, sparse_grad.indices, sparse_grad.dense_shape[0])
    self.assertAllClose(computed, self.evaluate(dense_grad))
def testConstructionNonSharded(self):
    """Build a lookup over a one-element params list; it must not raise."""
    with ops.Graph().as_default():
        initial_value = array_ops.zeros(shape=[100, 100], dtype=dtypes.float32)
        table = variables.Variable(initial_value)
        lookup_ids = constant_op.constant([0, 1, 1, 7], dtype=dtypes.int32)
        # Only graph construction is exercised; nothing is evaluated.
        embedding_ops.embedding_lookup([table], lookup_ids)
def testHigherRankMaxNorm(self):
    """Lookups with `max_norm=1.0` match a gather on pre-normalized params.

    Also checks that sharding the params does not change the result.
    """
    np.random.seed(8)
    with self.cached_session():
        # Test embedding rank 0, 1, 2.
        # Note: the first dimension must be a common multiple of procs below.
        for params_shape in [(12,), (6, 3), (6, 2, 3)]:
            params = 2 * np.ones(params_shape)
            embedding_axes = tuple(range(params.ndim)[1:])
            squared_norm = np.sum(
                params * params, embedding_axes, keepdims=True)
            params_norm = params / np.sqrt(squared_norm)
            for ids_shape in [(), (3), (4, 3), (2, 3, 4)]:
                num_ids = np.prod(ids_shape, dtype=np.int64)
                ids = np.random.randint(
                    params.shape[0], size=num_ids).reshape(ids_shape)
                # Compare nonsharded to gather
                simple = embedding_ops.embedding_lookup(
                    params, ids, max_norm=1.0).eval()
                expected = array_ops.gather(params_norm, ids).eval()
                # assertAllClose is used here as different implementations of
                # sqrt may be used to compute each of the values being
                # compared. For example, on AVX512 builds the embedding
                # operation makes use of Eigen's fast vectorized square root
                # algorithm for doubles. These implementations are not
                # guaranteed to produce exactly the same results, so an exact
                # comparison cannot be made.
                self.assertAllClose(simple, expected)
                # Run a few different sharded versions.
                for procs in (1, 2, 3):
                    stride = procs * math_ops.range(params.shape[0] // procs)
                    split_params = [
                        array_ops.gather(params, stride + offset)
                        for offset in xrange(procs)
                    ]
                    sharded = embedding_ops.embedding_lookup(
                        split_params, ids, max_norm=1.0).eval()
                    self.assertAllEqual(simple, sharded)
def testHigherRankMaxNorm(self):
    """Lookups with `max_norm=1.0` match a gather on pre-normalized params.

    Also checks that sharding the params does not change the result.
    """
    np.random.seed(8)
    with self.test_session():
        for params_shape in (12,), (6, 3), (6, 2, 3):
            # Test embedding rank 0, 1, 2.
            # Note: the first dimension must be a common multiple of procs
            # below.
            params = 2 * np.ones(params_shape)
            params_norm = params / np.sqrt(
                np.sum(
                    params * params,
                    tuple(range(params.ndim)[1:]),
                    keepdims=True))
            for ids_shape in (), (3), (4, 3), (2, 3, 4):
                ids = np.random.randint(
                    params.shape[0],
                    size=np.prod(ids_shape, dtype=np.int64)).reshape(ids_shape)
                # Compare nonsharded to gather
                simple = embedding_ops.embedding_lookup(
                    params, ids, max_norm=1.0).eval()
                # The lookup normalizes inside the op while the reference is
                # normalized with NumPy. The two may use different square-root
                # implementations, so the floating-point results are compared
                # with a tolerance rather than for exact equality.
                self.assertAllClose(
                    simple, array_ops.gather(params_norm, ids).eval())
                # Run a few different sharded versions.
                for procs in 1, 2, 3:
                    stride = procs * math_ops.range(params.shape[0] // procs)
                    split_params = [
                        array_ops.gather(params, stride + p)
                        for p in xrange(procs)
                    ]
                    sharded = embedding_ops.embedding_lookup(
                        split_params, ids, max_norm=1.0).eval()
                    # Sharded and unsharded lookups go through the same op, so
                    # they are still required to agree exactly.
                    self.assertAllEqual(simple, sharded)
def body(it, cost):
    # Loop body: looks up row 0 of `embedding_matrix` (defined outside this
    # excerpt) and either squares the running cost (when `it == 3`) or adds
    # the sum of that row to it. Returns the incremented counter and new cost.
    embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
    cost = control_flow_ops.cond(
        math_ops.equal(it, 3), lambda: math_ops.square(cost),
        (lambda: cost + math_ops.reduce_sum(embedding)))
    return it + 1, cost

# NOTE(review): `cond`, `embedding_matrix` and `self` come from the enclosing
# scope, which is not visible in this excerpt.
_, cost = control_flow_ops.while_loop(
    cond, body, [constant_op.constant(0), constant_op.constant(0.0)])

# Gradient of the loop result w.r.t. the matrix; `segment_sum` merges slices
# that share an index so the two gradients can be compared row by row.
dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                     dynamic_grads.indices)

# Loop-free expression: square of three row sums, plus one more row sum.
# Presumably the hand-unrolled equivalent of the loop above; this depends on
# `cond`, which is not shown here. TODO confirm.
embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
static = math_ops.square(
    math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
    math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
static_grads = math_ops.segment_sum(static_grads.values, static_grads.indices)

with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    # Both gradients are fetched in one call and must be identical.
    self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
def fn(x):
    """Return the product of three row lookups of `x` and `sum(2 * x)`."""
    index_tensors = [
        constant_op.constant(np.array(rows))
        for rows in ([0, 1], [2, 3], [1, 3])
    ]
    # A mixture of IndexedSlices and dense tensor to aggregate.
    first, second, third = [
        embedding_ops.embedding_lookup(x, index) for index in index_tensors
    ]
    dense_term = math_ops.reduce_sum(x * constant_op.constant(2.0))
    return first * second * third * dense_term
def testConstructionSharded(self):
    """Build a lookup over two variable shards; it must not raise."""
    with ops.Graph().as_default():
        shards = [
            variables.Variable(
                array_ops.zeros(shape=[100, 100], dtype=dtypes.float32))
            for _ in range(2)
        ]
        lookup_ids = constant_op.constant([0, 1, 1, 17], dtype=dtypes.int32)
        # Only graph construction is exercised; nothing is evaluated.
        embedding_ops.embedding_lookup(shards, lookup_ids)
def __call__(self, inputs, state, scope=None):
    """Embed each input with its own table, concatenate, and run the cell.

    One embedding variable is created per entry of
    `self._embedding_classes`. `inputs[i]` is flattened and looked up in the
    i-th table, and the results are concatenated along axis 1 before being
    passed to the wrapped cell.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper2"
        with ops.device("/cpu:0"):
            initializer = (
                self._initializer or vs.get_variable_scope().initializer)
            if not initializer:
                # Default initializer for embeddings should have variance=1.
                # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                sqrt3 = math.sqrt(3)
                initializer = init_ops.random_uniform_initializer(
                    -sqrt3, sqrt3)

            tables = [
                vs.get_variable(
                    "embedding" + str(i),
                    [num_classes, self._embedding_sizes[i]],
                    initializer=initializer)
                for i, num_classes in enumerate(self._embedding_classes)
            ]
            embedded = [
                embedding_ops.embedding_lookup(
                    table, array_ops.reshape(inputs[i], [-1]))
                for i, table in enumerate(tables)
            ]
            finalEmbedded = tf.concat(1, embedded)
    return self._cell(finalEmbedded, state)
def __init__(self, embedding, start_tokens, end_token):
    """Set up the embedding function and the start/end tokens.

    Args:
      embedding: Either a callable mapping a vector tensor of `ids` (argmax
        ids) to embeddings, or a value usable as the `params` argument of
        `embedding_lookup`. Its result is passed to the decoder input.
      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
      end_token: `int32` scalar, the token that marks end of decoding.

    Raises:
      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not
        a scalar.
    """
    if callable(embedding):
        embedding_fn = embedding
    else:

        def embedding_fn(ids):
            return embedding_ops.embedding_lookup(embedding, ids)

    self._embedding_fn = embedding_fn

    start = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    end = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    self._start_tokens = start
    self._end_token = end

    # Rank checks use the static shape of the converted tensors.
    if start.get_shape().ndims != 1:
        raise ValueError("start_tokens must be a vector")
    self._batch_size = array_ops.size(start_tokens)
    if end.get_shape().ndims != 0:
        raise ValueError("end_token must be a scalar")

    self._start_inputs = self._embedding_fn(self._start_tokens)
def embedding_encoder(encoder_inputs, cell, embedding, num_symbols,
                      embedding_size, bidirectional=False, dtype=None,
                      weight_initializer=None, scope=None):
    """Embed the encoder inputs and run them through an RNN encoder.

    Args:
      encoder_inputs: Iterable of id tensors, one per time step; each is
        looked up in the embedding.
      cell: RNN cell used for the encoder (for both directions when
        `bidirectional` is True).
      embedding: Existing embedding to look ids up in, or None to create a
        variable named "embedding" of shape `[num_symbols, embedding_size]`.
      num_symbols: Number of rows of the embedding created when `embedding`
        is None.
      embedding_size: Number of columns of that embedding.
      bidirectional: If True, run `rnn.bidirectional_rnn` and concatenate
        the forward and backward final states along axis 1.
      dtype: dtype for the variable scope and the RNN.
      weight_initializer: Optional zero-argument callable returning the
        initializer for a newly created embedding. When None, no initializer
        is passed explicitly.
      scope: Variable scope or name; defaults to "embedding_encoder".

    Returns:
      The final encoder state.
    """
    with variable_scope.variable_scope(
            scope or "embedding_encoder", dtype=dtype) as scope:
        dtype = scope.dtype
        # Encoder.
        # Test identity against None: truth-testing a tensor is not allowed
        # in graph mode, so `not embedding` fails for tensor arguments.
        if embedding is None:
            # Only call the factory when one was supplied; the default of
            # None must not be called.
            initializer = (
                weight_initializer() if weight_initializer is not None
                else None)
            embedding = variable_scope.get_variable(
                "embedding", [num_symbols, embedding_size],
                initializer=initializer)
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in encoder_inputs
        ]
        if bidirectional:
            _, output_state_fw, output_state_bw = rnn.bidirectional_rnn(
                cell, cell, emb_inp, dtype=dtype)
            encoder_state = tf.concat(1, [output_state_fw, output_state_bw])
        else:
            _, encoder_state = rnn.rnn(cell, emb_inp, dtype=dtype)
        return encoder_state
def _tf_dec_embedding_attention_decoder(self, enc_out, decoder_input,
                                        last_state, cell, num_symbols,
                                        embedding_size, num_heads=1,
                                        output_size=None,
                                        output_projection=None,
                                        dtype=dtypes.float32, scope=None,
                                        src_mask=None, maxout_layer=False,
                                        encoder="reverse", start=None,
                                        init_const=False, bow_mask=None):
    """Decode single step version of
    tensorflow.models.rnn.seq2seq.embedding_attention_decoder

    Embeds `decoder_input` with an "embedding" variable of shape
    `[num_symbols, embedding_size]` and forwards the result, with the
    remaining options, to `self._tf_dec_attention_decoder`.
    """
    if output_size is None:
        output_size = cell.output_size

    if output_projection is not None:
        # Shape validation only; the converted tensors are not used further.
        proj_weights = ops.convert_to_tensor(
            output_projection[0], dtype=dtype)
        proj_weights.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    decoder_options = dict(
        output_size=output_size,
        num_heads=num_heads,
        src_mask=src_mask,
        maxout_layer=maxout_layer,
        embedding_size=embedding_size,
        encoder=encoder,
        start=start,
        init_const=init_const,
        bow_mask=bow_mask)

    with variable_scope.variable_scope(
            scope or "embedding_attention_decoder"):
        with ops.device("/cpu:0"):
            embedding = variable_scope.get_variable(
                "embedding", [num_symbols, embedding_size])
        embedded_input = embedding_ops.embedding_lookup(
            embedding, decoder_input)
        return self._tf_dec_attention_decoder(
            enc_out, embedded_input, last_state, cell, **decoder_options)
def create_decoder(self):
    """Build the decoder sub-graph and print how long construction took.

    Embeds every element of `self.tokens` except the last, runs the
    embeddings through a GRU cell wrapped with an output projection of size
    `self.vocab_size`, and stores the RNN outputs on `self.decoder_states`
    and `self.logits`.
    """
    start_time = time.time()
    # The scope name is fixed. The former `"embedding" or scope` always
    # evaluated to "embedding", and `scope` is not a name this method defines.
    with vs.variable_scope("embedding"):
        tokens = self.tokens[:-1]
        embeddings = []
        with tf.device("/cpu:0"):
            sqrt3 = np.sqrt(3)
            embedding = vs.get_variable(
                "embedding", [self.vocab_size, self.embedding_size],
                initializer=tf.random_uniform_initializer(-sqrt3, sqrt3))
            for token in tokens:
                # Create the embedding layer.
                emb = embedding_ops.embedding_lookup(embedding, token)
                emb.set_shape([self.batch_size, self.embedding_size])
                embeddings.append(emb)

        cell = rnn_cell.GRUCell(self.decoder_cell_size)
        cell = rnn_cell.OutputProjectionWrapper(cell, self.vocab_size)
        # `rnn.rnn` returns several values; only the first is kept.
        self.decoder_states = rnn.rnn(
            cell, embeddings, dtype=tf.float32,
            sequence_length=self.tokens_len)[0]
        self.logits = self.decoder_states

    print('create_decoder graph time %f' % (time.time() - start_time))
def embedding_lookup_unique(params, ids, name=None):
    """Version of embedding_lookup that avoids duplicate lookups.

    This can save communication in the case of repeated ids.
    Same interface as embedding_lookup, except that it supports
    multi-dimensional `ids`, so callers do not have to reshape the input and
    output to fit gather.

    Args:
      params: A list of tensors with the same shape and type, or a
        `PartitionedVariable`. Shape `[index, d1, d2, ...]`.
      ids: A `Tensor` of any rank with type `int32` or `int64` containing
        the ids to be looked up in `params`. Shape `[ids1, ids2, ...]`.
      name: A name for this operation (optional).

    Returns:
      A `Tensor` with the same type as the tensors in `params` and dimension
      of `[ids1, ids2, d1, d2, ...]`.

    Raises:
      ValueError: If `params` is empty.
    """
    with ops.name_scope(name, "EmbeddingLookupUnique", [params, ids]):
        ids = ops.convert_to_tensor(ids)
        shape = array_ops.shape(ids)
        # Flatten to a vector. `-1` lets reshape infer the element count, so
        # no separate product-of-dimensions op is needed.
        ids_flat = array_ops.reshape(ids, [-1])
        # `idx` maps every flat position back to its entry in `unique_ids`.
        unique_ids, idx = array_ops.unique(ids_flat)
        unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids)
        embeds_flat = array_ops.gather(unique_embeddings, idx)
        # Restore the original ids shape in front of the embedding dims.
        embed_shape = array_ops.concat(
            [shape, array_ops.shape(unique_embeddings)[1:]], 0)
        embeds = array_ops.reshape(embeds_flat, embed_shape)
        # Re-attach the static shape that the dynamic reshape loses.
        embeds.set_shape(ids.get_shape().concatenate(
            unique_embeddings.get_shape()[1:]))
        return embeds
def extract_argmax_and_embed(prev, _):
    """Loop_function that embeds the argmax symbol of `prev`.

    When an output projection is set in the enclosing scope, `prev` is first
    passed through `xw_plus_b` with its weights and biases. No gradient
    flows through the argmax.
    """
    projected = prev
    if output_projection is not None:
        weights, biases = output_projection[0], output_projection[1]
        projected = nn_ops.xw_plus_b(projected, weights, biases)
    best_symbol = array_ops.stop_gradient(math_ops.argmax(projected, 1))
    return embedding_ops.embedding_lookup(embedding, best_symbol)
def calculate_loss_from_wals_model(self, wals_model, sp_inputs):
    """Compute the loss of `wals_model` on `sp_inputs`.

    Reads all row/column factors and weights from the model via "div"
    partitioned lookups and passes them to the shared test utility.
    """

    def lookup_all(table, count):
        # Look up every index in [0, count) of a possibly sharded table.
        return embedding_ops.embedding_lookup(
            table, math_ops.range(count), partition_strategy="div")

    current_rows = lookup_all(wals_model.row_factors, wals_model._input_rows)
    current_cols = lookup_all(wals_model.col_factors, wals_model._input_cols)
    row_wts = lookup_all(wals_model._row_weights, wals_model._input_rows)
    col_wts = lookup_all(wals_model._col_weights, wals_model._input_cols)

    return factorization_ops_test_utils.calculate_loss(
        sp_inputs, current_rows, current_cols, wals_model._regularization,
        wals_model._unobserved_weight, row_wts, col_wts)
def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    """Pick the top `beam_size` continuations and embed their symbols.

    Appends this step's chosen symbols, parent-hypothesis indices and log
    probabilities to `beam_symbols`, `beam_path` and `log_beam_probs`
    respectively, and returns the embeddings of the chosen symbols.
    """
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(
            prev, output_projection[0], output_projection[1])

    log_probs = tf.log(tf.nn.softmax(prev))
    if i > 1:
        # Add the scores of the hypotheses extended so far and flatten all
        # (hypothesis, symbol) pairs into a single axis.
        log_probs = tf.reshape(
            log_probs + log_beam_probs[-1], [-1, beam_size * num_symbols])

    top_scores, flat_indices = tf.nn.top_k(log_probs, beam_size)
    flat_indices = tf.stop_gradient(
        tf.squeeze(tf.reshape(flat_indices, [-1, 1])))
    top_scores = tf.stop_gradient(tf.reshape(top_scores, [-1, 1]))

    symbols = flat_indices % num_symbols  # Which word in vocabulary.
    beam_parent = flat_indices // num_symbols  # Which hypothesis it came from.

    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(top_scores)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = tf.reshape(
        embedding_ops.embedding_lookup(embedding, symbols),
        [beam_size, embedding_size])
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def testMinimizeSparseResourceVariable(self):
    """One SGD step through an embedding lookup updates both variables."""
    for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
        with self.cached_session():
            weights = resource_variable_ops.ResourceVariable(
                [[1.0, 2.0]], dtype=dtype)
            bias = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
            x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
            looked_up = embedding_ops.embedding_lookup([weights], [0])
            pred = math_ops.matmul(looked_up, x)
            pred += bias
            loss = pred * pred
            sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(
                loss)
            # TODO(apassos) calling initialize_resources on all resources
            # here doesn't work because the sessions and graph are reused
            # across unit tests and this would mean trying to reinitialize
            # variables. Figure out a long-term solution for this.
            variables.global_variables_initializer().run()
            # Fetch params to validate initial values
            self.assertAllCloseAccordingToType([[1.0, 2.0]], weights.eval())
            self.assertAllCloseAccordingToType([3.0], bias.eval())
            # Run 1 step of sgd
            sgd_op.run()
            # Validate updated params
            np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
            np_grad = 2 * np_pred
            expected_weights = [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]]
            expected_bias = [3.0 - np_grad]
            self.assertAllCloseAccordingToType(
                expected_weights, weights.eval())
            self.assertAllCloseAccordingToType(expected_bias, bias.eval())
def testAdamSparse(self):
    """Apply an IndexedSlices gradient with Adam to a CPU-resident variable."""
    with ops.device('/cpu:0'):
        # Create 2-D embedding for 3 objects on CPU because sparse/sliced
        # updates are not implemented on TPU.
        initial_value = array_ops.ones([3, 2])
        embedding_matrix = resource_variable_ops.ResourceVariable(
            initial_value)

    with self.test_scope():
        with backprop.GradientTape() as tape:
            row = embedding_ops.embedding_lookup(embedding_matrix, [1])
            y = math_ops.reduce_sum(row)
        dy_dx = tape.gradient(y, embedding_matrix)
        self.assertIsInstance(dy_dx, ops.IndexedSlices)

        # The gradient application operations will run on CPU because
        # optimizer updates are always collocated with the variable.
        adam.AdamOptimizer(0.1).apply_gradients([(dy_dx, embedding_matrix)])

        # This assign_add will run on CPU because when an input to an
        # operation is a resource, this operation is placed on the
        # resource's device by the eager runtime.
        embedding_matrix.assign_add(array_ops.ones([3, 2]))

    expected = [[2.0, 2.0], [1.9, 1.9], [2.0, 2.0]]
    self.assertAllClose(expected, embedding_matrix.numpy())
def testShardedDivPartitioningUnknownParamShape(self):
    """"div" partitioned lookup works without static parameter shapes."""
    with self.test_session():
        num_shards = 5
        vocab_size = 13
        # Embedding dimensions is 10. The vocab_size x 10 embedding
        # parameters are spread in num_shards matrices, so the first
        # 3 shards are 3 x 10 and the last 2 shards are 2 x 10.

        # We clear parameter shapes, to test when shape is not statically
        # known.
        shards, shard_values, feeds = _EmbeddingParams(
            num_shards, vocab_size, use_shapeless_placeholder=True)

        # Fetch num_vals embeddings for random word ids. Since
        # num_vals > vocab_size, this ought to have repetitions, so
        # will test that aspect.
        num_vals = 30
        id_vals = np.random.randint(vocab_size, size=num_vals)
        ids = constant_op.constant(list(id_vals), dtype=dtypes.int64)

        lookup = embedding_ops.embedding_lookup(
            shards, ids, partition_strategy="div")
        actual = lookup.eval(feed_dict=feeds)
        expected, _, _ = _EmbeddingResult(
            shard_values, id_vals, num_shards, vocab_size,
            partition_strategy="div")
        self.assertAllEqual(expected, actual)
def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs.

    Uses `self._embedding` when it is set. Otherwise it creates an
    "embedding" variable of shape
    `[self._embedding_classes, self._cell.input_size]`. `inputs` is
    flattened, looked up in the embedding, and the result is passed to the
    wrapped cell together with `state`.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
        with ops.device("/cpu:0"):
            if self._embedding:
                embedding = self._embedding
            else:
                if self._initializer:
                    initializer = self._initializer
                elif vs.get_variable_scope().initializer:
                    initializer = vs.get_variable_scope().initializer
                else:
                    # Default initializer for embeddings should have
                    # variance=1.
                    # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                    sqrt3 = math.sqrt(3)
                    initializer = init_ops.random_uniform_initializer(
                        -sqrt3, sqrt3)
                embedding = vs.get_variable(
                    "embedding",
                    [self._embedding_classes, self._cell.input_size],
                    initializer=initializer)
            embedded = embedding_ops.embedding_lookup(
                embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)
def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols,
                          embedding_size, output_projection=None,
                          feed_previous=False,
                          update_embedding_for_previous=True, scope=None):
    """RNN decoder with embedding and a pure-decoding option.

    Args:
      decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder
        inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: Integer, how many symbols come into the embedding.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_symbols] and B has shape
        [num_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and added B.
      feed_previous: Boolean; if True, only the first of decoder_inputs will
        be used (the "GO" symbol), and all other decoder inputs will be
        generated by:
          next = embedding_lookup(embedding, argmax(previous_output)),
        In effect, this implements a greedy decoder. It can also be used
        during training to emulate http://arxiv.org/abs/1506.03099.
        If False, decoder_inputs are used as given (the standard decoder
        case).
      update_embedding_for_previous: Boolean; if False and
        feed_previous=True, only the embedding for the first symbol of
        decoder_inputs (the "GO" symbol) will be updated by back
        propagation. Embeddings for the symbols generated from the decoder
        itself remain unchanged. This parameter has no effect if
        feed_previous=False.
      scope: VariableScope for the created subgraph; defaults to
        "embedding_rnn_decoder".

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          with shape [batch_size x output_size] containing the generated
          outputs.
        state: The state of each decoder cell in each time-step. This is a
          list with length len(decoder_inputs) -- one item for each
          time-step. It is a 2D Tensor of shape
          [batch_size x cell.state_size].

    Raises:
      ValueError: When output_projection has the wrong shape.
    """
    if output_projection is not None:
        # Shape validation only; the converted tensors are not used further.
        weights = ops.convert_to_tensor(
            output_projection[0], dtype=dtypes.float32)
        weights.get_shape().assert_is_compatible_with([None, num_symbols])
        biases = ops.convert_to_tensor(
            output_projection[1], dtype=dtypes.float32)
        biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(scope or "embedding_rnn_decoder"):
        with ops.device("/cpu:0"):
            embedding = variable_scope.get_variable(
                "embedding", [num_symbols, embedding_size])

        loop_function = None
        if feed_previous:
            loop_function = _extract_argmax_and_embed(
                embedding, output_projection, update_embedding_for_previous)

        # Kept as a generator: each lookup is created only when the decoder
        # consumes the corresponding input.
        embedded_inputs = (
            embedding_ops.embedding_lookup(embedding, token_ids)
            for token_ids in decoder_inputs)
        return rnn_decoder(embedded_inputs, initial_state, cell,
                           loop_function=loop_function)
def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    """Gradient of a cond-in-while over a lookup matches the unrolled form.

    The loop runs for steps 0..4: it squares the cost at step 3 and adds the
    sum of the looked-up row at every other step. The static expression
    spells out the same computation without control flow.
    """
    with ops.Graph().as_default():
        embedding_matrix = variable_scope.get_variable(
            "embedding_matrix", [5, 5],
            initializer=init_ops.random_normal_initializer(),
            use_resource=use_resource)

        def keep_going(step, _):
            return step < 5

        def accumulate(step, cost):
            row = embedding_ops.embedding_lookup(embedding_matrix, [0])
            next_cost = control_flow_ops.cond(
                math_ops.equal(step, 3),
                lambda: math_ops.square(cost),
                lambda: cost + math_ops.reduce_sum(row))
            return step + 1, next_cost

        loop_vars = [constant_op.constant(0), constant_op.constant(0.0)]
        _, loop_cost = control_flow_ops.while_loop(
            keep_going, accumulate, loop_vars)

        dynamic_grads = gradients_impl.gradients(
            loop_cost, [embedding_matrix])[0]
        dynamic_grads = math_ops.segment_sum(
            dynamic_grads.values, dynamic_grads.indices)

        row = embedding_ops.embedding_lookup(embedding_matrix, [0])
        # Each term is a separate reduce_sum op, as in the loop.
        first_three = (
            math_ops.reduce_sum(row) + math_ops.reduce_sum(row) +
            math_ops.reduce_sum(row))
        static = math_ops.square(first_three) + math_ops.reduce_sum(row)
        static_grads = gradients_impl.gradients(
            static, [embedding_matrix])[0]
        static_grads = math_ops.segment_sum(
            static_grads.values, static_grads.indices)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            expected, actual = sess.run([static_grads, dynamic_grads])
            self.assertAllEqual(expected, actual)
def testIndexedSlicesGradientInCondInWhileLoop(self):
    """Gradient of a cond-in-while over a lookup matches the unrolled form.

    The loop runs for steps 0..4: it squares the cost at step 3 and adds the
    sum of the looked-up row at every other step. The static expression
    spells out the same computation without control flow.
    """
    with ops.Graph().as_default():
        embedding_matrix = tf.get_variable(
            "embedding_matrix", [5, 5],
            initializer=tf.random_normal_initializer())

        def keep_going(step, _):
            return step < 5

        def accumulate(step, cost):
            row = embedding_ops.embedding_lookup(embedding_matrix, [0])
            next_cost = tf.cond(
                tf.equal(step, 3),
                lambda: tf.square(cost),
                lambda: cost + tf.reduce_sum(row))
            return step + 1, next_cost

        loop_vars = [tf.constant(0), tf.constant(0.0)]
        _, loop_cost = control_flow_ops.While(
            keep_going, accumulate, loop_vars)

        dynamic_grads = tf.gradients(loop_cost, [embedding_matrix])[0]
        dynamic_grads = tf.segment_sum(
            dynamic_grads.values, dynamic_grads.indices)

        row = embedding_ops.embedding_lookup(embedding_matrix, [0])
        # Each term is a separate reduce_sum op, as in the loop.
        first_three = (
            tf.reduce_sum(row) + tf.reduce_sum(row) + tf.reduce_sum(row))
        static = tf.square(first_three) + tf.reduce_sum(row)
        static_grads = tf.gradients(static, [embedding_matrix])[0]
        static_grads = tf.segment_sum(
            static_grads.values, static_grads.indices)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            expected, actual = sess.run([static_grads, dynamic_grads])
            self.assertAllEqual(expected, actual)
def attention_decoder_with_embedding(decoder_inputs, initial_state,
                                     attention_states, cell, embedding,
                                     num_heads=1, output_size=None,
                                     dtype=dtypes.float32, scope=None,
                                     initial_state_attention=False):
    """Embed the decoder inputs and run `attention_decoder` on them.

    We are not using output_projection because we are NOT using a sampled
    softmax.

    Parameters
    ----------
    decoder_inputs
        Iterable of id tensors; each one is looked up in `embedding`.
    initial_state
        Forwarded to `attention_decoder`.
    attention_states
        Forwarded to `attention_decoder`.
    cell
        Forwarded to `attention_decoder`; also supplies the default
        `output_size`.
    embedding
        Outside embedding passed in.
    num_heads
        Forwarded to `attention_decoder`.
    output_size
        Forwarded to `attention_decoder`; defaults to `cell.output_size`.
    dtype
        Not used by this function.
    scope
        Variable scope; defaults to "attention_decoder_with_embedding".
    initial_state_attention
        Forwarded to `attention_decoder`.

    Returns
    -------
    The result of `attention_decoder`, called with `loop_function=None`.
    """
    effective_output_size = (
        cell.output_size if output_size is None else output_size)

    with vs.variable_scope(scope or "attention_decoder_with_embedding"):
        embedded_inputs = [
            embedding_ops.embedding_lookup(embedding, token_ids)
            for token_ids in decoder_inputs
        ]
        return attention_decoder(
            embedded_inputs, initial_state, attention_states, cell,
            output_size=effective_output_size,
            num_heads=num_heads,
            loop_function=None,
            initial_state_attention=initial_state_attention)
def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs.

    Creates (or reuses) an "embedding" variable of shape
    `[self._embedding_classes, self._embedding_size]` whose dtype is taken
    from `state`. `inputs` is flattened, looked up in the embedding, and the
    result is passed to the wrapped cell together with `state`.
    """
    with _checked_scope(
            self, scope or "embedding_wrapper", reuse=self._reuse):
        with ops.device("/cpu:0"):
            if self._initializer:
                initializer = self._initializer
            elif vs.get_variable_scope().initializer:
                initializer = vs.get_variable_scope().initializer
            else:
                # Default initializer for embeddings should have variance=1.
                # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                sqrt3 = math.sqrt(3)
                initializer = init_ops.random_uniform_initializer(
                    -sqrt3, sqrt3)

            # Use isinstance so tuple subclasses (e.g. namedtuple states) are
            # treated as tuples too; their dtype is taken from the first
            # element.
            if isinstance(state, tuple):
                data_type = state[0].dtype
            else:
                data_type = state.dtype

            embedding = vs.get_variable(
                "embedding",
                [self._embedding_classes, self._embedding_size],
                initializer=initializer,
                dtype=data_type)
            embedded = embedding_ops.embedding_lookup(
                embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)
def _random(self):
    """Look up `self._inputs` at uniformly random int64 indices.

    The number of indices is given by `self._num_remaining` (flattened into
    a shape vector). Indices are drawn from `[0, self._num_data)` using
    `self._random_seed`.
    """
    sample_shape = array_ops.reshape(self._num_remaining, [-1])
    upper_bound = math_ops.cast(self._num_data, dtypes.int64)
    random_indices = random_ops.random_uniform(
        sample_shape,
        minval=0,
        maxval=upper_bound,
        seed=self._random_seed,
        dtype=dtypes.int64)
    return embedding_lookup(
        self._inputs, random_indices, partition_strategy='div')
def _one_hot_to_embedding(one_hot, embedding_size):
    """Get a dense embedding vector from a one-hot encoding.

    The argmax along axis 1 of `one_hot` is used as the id to look up in an
    'embedding' variable of shape `[one_hot.shape[1], embedding_size]`.
    """
    vocab_size = one_hot.shape[1]
    token_ids = math_ops.argmax(one_hot, axis=1)
    table = variable_scope.get_variable(
        'embedding', [vocab_size, embedding_size])
    return embedding_ops.embedding_lookup(
        table, token_ids, name='token_to_embedding')
def add_embedding_layer(self, emb_matrix):
    """
    Adds word embedding layer to the graph.

    Sets `self.context_embs` and `self.qn_embs` to the embeddings of
    `self.context_ids` and `self.qn_ids`.

    Inputs:
      emb_matrix: shape (400002, embedding_size).
        The GloVe vectors, plus vectors for PAD and UNK.
    """
    with vs.variable_scope("embeddings"):
        # Note: the embedding matrix is a tf.constant which means it's not a
        # trainable parameter.
        # shape (400002, embedding_size)
        frozen_table = tf.constant(
            emb_matrix, dtype=tf.float32, name="emb_matrix")

        # Get the word embeddings for the context and question,
        # using the placeholders self.context_ids and self.qn_ids.
        # shape (batch_size, context_len, embedding_size)
        self.context_embs = embedding_ops.embedding_lookup(
            frozen_table, self.context_ids)
        # shape (batch_size, question_len, embedding_size)
        self.qn_embs = embedding_ops.embedding_lookup(
            frozen_table, self.qn_ids)
def calculate_loss(self):
    """Calculates the loss of the current (trained) model."""

    def lookup_all(table, count):
        # Look up every index in [0, count) of a possibly sharded table.
        return embedding_ops.embedding_lookup(
            table, math_ops.range(count), partition_strategy='div')

    current_rows = lookup_all(self._model.get_row_factors(), self._num_rows)
    current_cols = lookup_all(self._model.get_col_factors(), self._num_cols)
    row_wts = lookup_all(self._row_weights, self._num_rows)
    col_wts = lookup_all(self._col_weights, self._num_cols)

    sp_inputs = self.np_array_to_sparse(self.INPUT_MATRIX)
    return factorization_ops_test_utils.calculate_loss(
        sp_inputs, current_rows, current_cols, self._regularization_coeff,
        self._unobserved_weight, row_wts, col_wts)
def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols,
                          embedding_size, output_projection=None,
                          feed_previous=False,
                          update_embedding_for_previous=True, scope=None,
                          beam_search=True, beam_size=10):
    """RNN decoder with embedding and a pure-decoding option.

    Args:
      decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder
        inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: Integer, how many symbols come into the embedding.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_symbols] and B has shape
        [num_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and added B.
      feed_previous: Boolean; if True, only the first of decoder_inputs will
        be used (the "GO" symbol), and all other decoder inputs will be
        generated by:
          next = embedding_lookup(embedding, argmax(previous_output)),
        In effect, this implements a greedy decoder. It can also be used
        during training to emulate http://arxiv.org/abs/1506.03099.
        If False, decoder_inputs are used as given (the standard decoder
        case). Only consulted when beam_search is False.
      update_embedding_for_previous: Boolean; if False and
        feed_previous=True, only the embedding for the first symbol of
        decoder_inputs (the "GO" symbol) will be updated by back
        propagation. Embeddings for the symbols generated from the decoder
        itself remain unchanged. This parameter has no effect if
        feed_previous=False.
      scope: VariableScope for the created subgraph; defaults to
        "embedding_rnn_decoder".
      beam_search: Boolean; if True (the default), decoding is delegated to
        beam_rnn_decoder and feed_previous is ignored.
      beam_size: Integer, forwarded to beam_rnn_decoder.

    Returns:
      The result of beam_rnn_decoder when beam_search is True. Otherwise the
      result of rnn_decoder, a tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          with shape [batch_size x output_size] containing the generated
          outputs.
        state: The state of each decoder cell in each time-step. This is a
          list with length len(decoder_inputs) -- one item for each
          time-step. It is a 2D Tensor of shape
          [batch_size x cell.state_size].

    Raises:
      ValueError: When output_projection has the wrong shape.
    """
    if output_projection is not None:
        # Shape validation only; the converted tensors are not used further.
        weights = ops.convert_to_tensor(
            output_projection[0], dtype=dtypes.float32)
        weights.get_shape().assert_is_compatible_with([None, num_symbols])
        biases = ops.convert_to_tensor(
            output_projection[1], dtype=dtypes.float32)
        biases.get_shape().assert_is_compatible_with([num_symbols])

    # TODO: Investigate could an EmbeddingWrapper work here? As well as an
    # OutputProjectionWrapper
    with variable_scope.variable_scope(scope or "embedding_rnn_decoder"):
        with ops.device("/cpu:0"):
            embedding = variable_scope.get_variable(
                "embedding", [num_symbols, embedding_size])

        embedded_inputs = [
            embedding_ops.embedding_lookup(embedding, token_ids)
            for token_ids in decoder_inputs
        ]

        if beam_search:
            return beam_rnn_decoder(
                embedded_inputs, initial_state, cell,
                output_projection=output_projection,
                embedding=embedding,
                beam_size=beam_size)

        loop_function = None
        if feed_previous:
            loop_function = _extract_argmax_and_embed(
                embedding, output_projection, update_embedding_for_previous)
        return rnn_decoder(
            embedded_inputs, initial_state, cell,
            loop_function=loop_function, scope=scope)
def loss():
    """Return the element-wise square of (row 0 of `[var0]`) @ [[4.0], [5.0]].

    `var0` and `dtype` are taken from the enclosing scope.
    """
    column = constant_op.constant([[4.0], [5.0]], dtype=dtype)
    selected_rows = embedding_ops.embedding_lookup([var0], [0])
    product = math_ops.matmul(selected_rows, column)
    return product * product
def decode(cell, init_state, vocab_size, embedding, decoder_inputs, out_proj,
           maxlen, more_args, mem_args, feed_prev=False, loop_function=None,
           copy_transform=None, dtype=tf.float32):
    """Unroll `cell` for up to `maxlen` steps inside "embedding_rnn_decoder".

    Two modes are supported:

    * `feed_prev=False`: every element of `decoder_inputs` is embedded and fed
      to the cell in order (stopping after `maxlen` steps).
    * `feed_prev=True`: only `decoder_inputs[0]` is embedded; every later input
      is the embedding returned by `loop_function(output, out_proj, embedding)`
      for the previous step. At least one step is always executed.

    Args:
      cell: Callable `cell(inputs, state) -> (output, state)`.
      init_state: Initial state passed to the first cell call.
      vocab_size: Unused by this function.
      embedding: Embedding parameters passed to `embedding_lookup`.
      decoder_inputs: Sequence of symbol-id tensors.
      out_proj: Output projection forwarded untouched to `loop_function`.
      maxlen: Maximum number of decoding steps.
      more_args: Unused by this function.
      mem_args: Unused by this function.
      feed_prev: Whether to feed back the decoder's own predictions.
      loop_function: Required when `feed_prev` is true. Must return either
        None or a pair `(next_embedded_input, predicted_symbol)`.
      copy_transform: Unused by this function.
      dtype: Unused by this function.

    Returns:
      `(outputs, hiddens, state)` when `feed_prev` is false; otherwise
      `(outputs, samples, hiddens, a1s, kdists, Ndists, Rdebugs)`, where the
      last four lists are always empty and are kept only so the tuple keeps
      its historical arity.

    Raises:
      ValueError: If `feed_prev` is true but no `loop_function` was supplied.
    """
    # Fail fast with a clear message instead of a "NoneType is not callable"
    # error raised after part of the graph has already been built.
    if feed_prev and loop_function is None:
        raise ValueError(
            "loop_function must be provided when feed_prev is True")

    with variable_scope.variable_scope("embedding_rnn_decoder"):
        outputs = []
        hiddens = []
        state = init_state

        if not feed_prev:
            # Lazy generator: lookups are created only as steps are consumed.
            emb_inputs = (embedding_ops.embedding_lookup(embedding, i)
                          for i in decoder_inputs)
            for i, emb_inp in enumerate(emb_inputs):
                if i >= maxlen:
                    break
                if i > 0:
                    # Share cell variables across time steps.
                    variable_scope.get_variable_scope().reuse_variables()
                output, state = cell(emb_inp, state)
                outputs.append(output)
                hiddens.append(state)
            return outputs, hiddens, state

        # Placeholders kept for backward-compatible return arity.
        a1s = []
        kdists = []
        Ndists = []
        Rdebugs = []
        samples = []
        step = 0
        emb_inp = embedding_ops.embedding_lookup(embedding, decoder_inputs[0])
        while True:
            if step > 0:
                variable_scope.get_variable_scope().reuse_variables()
            output, state = cell(emb_inp, state)
            outputs.append(output)
            hiddens.append(state)

            # Reset every step so a missing output can neither leave the name
            # unbound nor silently reuse the previous step's prediction.
            loop_return = None
            with tf.variable_scope('loop', reuse=True):
                if output is not None:
                    loop_return = loop_function(output, out_proj, embedding)
            if loop_return is not None:
                emb_inp, prev_symbol = loop_return
                samples.append(prev_symbol)

            step += 1
            if step >= maxlen:
                break
        return outputs, samples, hiddens, a1s, kdists, Ndists, Rdebugs
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
    """A helper function for rank_sampled_softmax_loss.

    For each class i in `sampled_values` this computes the score

        log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

    with j ranging over the rows of `inputs`, rearranged as

        log(sum_j exp(w_i * (x_j / resampling_temperature)))
            + b_i / resampling_temperature

    The first term is evaluated through the `transform_fn` hook of
    `embedding_ops._embedding_lookup_and_transform`; the bias term is added
    afterwards. The `num_resampled` candidates selected by `nn.top_k` on that
    score are kept.

    Args:
      weights: From `rank_sampled_softmax_loss`.
      biases: From `rank_sampled_softmax_loss`.
      inputs: From `rank_sampled_softmax_loss`.
      sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler`
        function.
      num_resampled: An `int`. How many values to select from
        `sampled_values`. The caller must ensure it is less than the size of
        `sampled_values`.
      resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
      partition_strategy: From `rank_sampled_softmax_loss`.

    Returns:
      A tuple of (`resampled_candidates`, `true_expected_count`,
      `resampled_expected_count`), similar to `sampled_values` but sampled
      down to `num_resampled` values.
    """
    # The code would also work with a Tensor `num_resampled`; it is documented
    # as an int because that is the only way it is currently called.
    candidates, true_expected_count, sampled_expected_count = sampled_values

    # None of the sampler outputs should receive gradients.
    candidates = math_ops.cast(
        array_ops.stop_gradient(candidates), dtypes.int64)
    true_expected_count = array_ops.stop_gradient(true_expected_count)
    sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

    scaled_inputs = inputs / resampling_temperature

    def _logsumexp_over_inputs(embeddings):
        scores = math_ops.matmul(embeddings, scaled_inputs, transpose_b=True)
        return math_ops.reduce_logsumexp(scores, axis=1, keep_dims=False)

    # The protected lookup lets the logsumexp be co-located with the
    # partitioned weights, which yields a large speedup in practice.
    weight_term = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
        weights, candidates, partition_strategy,
        transform_fn=_logsumexp_over_inputs)
    bias_rows = embedding_ops.embedding_lookup(
        biases, candidates, partition_strategy)
    candidate_biases = array_ops.reshape(bias_rows, [-1])
    candidate_scores = weight_term + candidate_biases / resampling_temperature

    _, kept_indices = nn.top_k(candidate_scores, k=num_resampled,
                               sorted=False)
    resampled = array_ops.gather(candidates, indices=kept_indices)
    resampled_expected_count = array_ops.gather(
        sampled_expected_count, indices=kept_indices)

    return resampled, true_expected_count, resampled_expected_count
def loss():
    """Sum all elements of the `var0` rows selected by the nested index [[1]].

    `var0` is taken from the enclosing scope.
    """
    selected_rows = embedding_ops.embedding_lookup(var0, [[1]])
    return math_ops.reduce_sum(selected_rows)
def loop_function(prev, out_proj, embedding):
    """Project the previous output, pick the argmax symbol, and embed it.

    Args:
      prev: Previous decoder output.
      out_proj: Indexable pair whose item 0 is the projection weights and
        item 1 the projection biases.
      embedding: Embedding parameters passed to `embedding_lookup`.

    Returns:
      A two-element list `[embedded_symbol, symbol]`, where `symbol` is the
      argmax along axis 1 of the projected output.
    """
    projection_weights = out_proj[0]
    projection_biases = out_proj[1]
    logits = nn_ops.xw_plus_b(prev, projection_weights, projection_biases)
    best_symbol = math_ops.argmax(logits, axis=1)
    embedded_symbol = embedding_ops.embedding_lookup(embedding, best_symbol)
    return [embedded_symbol, best_symbol]
def __init__(self, sess, config, api, log_dir, forward, scope=None):
    """Build the whole model graph: inputs, encoders, latent nets, decoder, loss.

    Graph construction happens in this order: "io" placeholders, word
    embedding plus sentence encoder ("wordEmbedding"), context RNN
    ("contextRNN"), recognition and prior networks, generation network
    (bag-of-words, topic and end-of-paragraph heads plus decoder initial
    state), the decoder itself, and - only when `forward` is false - the
    losses and the optimizer. A `tf.train.Saver` over all global variables is
    always created last.

    Args:
      sess: Session; stored on the instance and passed to `self.optimize`.
      config: Hyper-parameter object. Attributes read here: max_utt_len,
        cxt_cell_size, sent_cell_size, dec_cell_size, num_topics, init_lr,
        lr_decay, embed_size, sent_type, keep_prob, cell_type, num_layer,
        use_hcf, latent_size, dec_keep_prob, full_kl_step.
      api: Object exposing `vocab` and `rev_vocab`; `rev_vocab` is indexed
        with "<s>" and "</s>" to obtain the GO / EOS ids.
      log_dir: Passed to `self.optimize`. When it is None the KL weight is the
        constant 1.0 instead of being annealed with `global_t`.
      forward: If true, build the inference decoder and skip the loss and
        optimizer; if false, build the training decoder and the losses.
      scope: Stored on the instance; not otherwise used in this method.

    Raises:
      ValueError: If `config.sent_type` is not one of "bow", "rnn", "bi_rnn".
    """
    self.vocab = api.vocab
    self.rev_vocab = api.rev_vocab
    self.vocab_size = len(self.vocab)
    self.sess = sess
    self.scope = scope
    self.max_utt_len = config.max_utt_len
    self.go_id = self.rev_vocab["<s>"]
    self.eos_id = self.rev_vocab["</s>"]
    self.context_cell_size = config.cxt_cell_size
    self.sent_cell_size = config.sent_cell_size
    self.dec_cell_size = config.dec_cell_size
    self.num_topics = config.num_topics

    with tf.name_scope("io"):
        # all dialog context and known attributes
        self.input_contexts = tf.placeholder(
            dtype=tf.int32, shape=(None, None, self.max_utt_len),
            name="dialog_context")
        self.floors = tf.placeholder(
            dtype=tf.float32, shape=(None, None), name="floor")  # TODO float
        self.floor_labels = tf.placeholder(
            dtype=tf.float32, shape=(None, 1), name="floor_labels")
        self.context_lens = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="context_lens")
        self.paragraph_topics = tf.placeholder(
            dtype=tf.float32, shape=(None, self.num_topics),
            name="paragraph_topics")

        # target response given the dialog context
        self.output_tokens = tf.placeholder(
            dtype=tf.int32, shape=(None, None), name="output_token")
        self.output_lens = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="output_lens")
        self.output_das = tf.placeholder(
            dtype=tf.float32, shape=(None, self.num_topics),
            name="output_dialog_acts")

        # optimization related variables
        self.learning_rate = tf.Variable(
            float(config.init_lr), trainable=False, name="learning_rate")
        # Running this op multiplies the learning rate by config.lr_decay.
        self.learning_rate_decay_op = self.learning_rate.assign(
            tf.multiply(self.learning_rate, config.lr_decay))
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

    # Dynamic sizes, read from the placeholders at run time.
    max_dialog_len = array_ops.shape(self.input_contexts)[1]
    max_out_len = array_ops.shape(self.output_tokens)[1]
    batch_size = array_ops.shape(self.input_contexts)[0]

    with variable_scope.variable_scope("wordEmbedding"):
        self.embedding = tf.get_variable(
            "embedding", [self.vocab_size, config.embed_size],
            dtype=tf.float32)
        # Mask that zeroes the embedding row of token id 0 and keeps the rest.
        embedding_mask = tf.constant(
            [0 if i == 0 else 1 for i in range(self.vocab_size)],
            dtype=tf.float32, shape=[self.vocab_size, 1])
        embedding = self.embedding * embedding_mask

        # embed the input (all utterance tokens flattened into one vector)
        input_embedding = embedding_ops.embedding_lookup(
            embedding, tf.reshape(self.input_contexts, [-1]))
        # reshape embedding. -1 lets the first dimension be whatever is
        # needed to make the other two dimensions fit the data
        input_embedding = tf.reshape(
            input_embedding, [-1, self.max_utt_len, config.embed_size])
        # embed the output so you can feed it into the VAE
        output_embedding = embedding_ops.embedding_lookup(
            embedding, self.output_tokens)

        # Encode each utterance into a single vector of size `sent_size`.
        if config.sent_type == "bow":
            input_embedding, sent_size = get_bow(input_embedding)
            output_embedding, _ = get_bow(output_embedding)
        elif config.sent_type == "rnn":
            sent_cell = self.get_rnncell(
                "gru", self.sent_cell_size, config.keep_prob, 1)
            input_embedding, sent_size = get_rnn_encode(
                input_embedding, sent_cell, scope="sent_rnn")
            # Same scope with reuse=True: the response shares the encoder.
            output_embedding, _ = get_rnn_encode(
                output_embedding, sent_cell, self.output_lens,
                scope="sent_rnn", reuse=True)
        elif config.sent_type == "bi_rnn":
            fwd_sent_cell = self.get_rnncell(
                "gru", self.sent_cell_size, keep_prob=1.0, num_layer=1)
            bwd_sent_cell = self.get_rnncell(
                "gru", self.sent_cell_size, keep_prob=1.0, num_layer=1)
            input_embedding, sent_size = get_bi_rnn_encode(
                input_embedding, fwd_sent_cell, bwd_sent_cell,
                scope="sent_bi_rnn")
            output_embedding, _ = get_bi_rnn_encode(
                output_embedding, fwd_sent_cell, bwd_sent_cell,
                self.output_lens, scope="sent_bi_rnn", reuse=True)
        else:
            raise ValueError(
                "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

        # reshape input into dialogs
        input_embedding = tf.reshape(
            input_embedding, [-1, max_dialog_len, sent_size])
        if config.keep_prob < 1.0:
            input_embedding = tf.nn.dropout(input_embedding, config.keep_prob)

        # reshape floors so they can be concatenated as one extra feature
        floor = tf.reshape(self.floors, [-1, max_dialog_len, 1])
        joint_embedding = tf.concat(
            [input_embedding, floor], 2, "joint_embedding")

    with variable_scope.variable_scope("contextRNN"):
        enc_cell = self.get_rnncell(
            config.cell_type, self.context_cell_size, keep_prob=1.0,
            num_layer=config.num_layer)
        # and enc_last_state will be same as the true last state
        _, enc_last_state = tf.nn.dynamic_rnn(
            enc_cell, joint_embedding, dtype=tf.float32,
            sequence_length=self.context_lens)

        # Reduce the final state to one tensor: for LSTM keep only `.h`, and
        # for multi-layer cells concatenate the per-layer states.
        if config.num_layer > 1:
            if config.cell_type == 'lstm':
                enc_last_state = [temp.h for temp in enc_last_state]
            enc_last_state = tf.concat(enc_last_state, 1)
        else:
            if config.cell_type == 'lstm':
                enc_last_state = enc_last_state.h

    # combine with other attributes
    if config.use_hcf:
        # TODO is this reshape ok?
        attribute_embedding = tf.reshape(
            self.output_das, [-1, self.num_topics])  # da_embedding
        attribute_fc1 = layers.fully_connected(
            attribute_embedding, 30, activation_fn=tf.tanh,
            scope="attribute_fc1")

    # Conditioning vector: paragraph topics plus the context-RNN final state.
    cond_list = [self.paragraph_topics, enc_last_state]
    cond_embedding = tf.concat(cond_list, 1)  # float32

    with variable_scope.variable_scope("recognitionNetwork"):
        if config.use_hcf:
            recog_input = tf.concat(
                [cond_embedding, output_embedding, attribute_fc1], 1)
        else:
            recog_input = tf.concat([cond_embedding, output_embedding], 1)
        self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
            recog_input, config.latent_size * 2, activation_fn=None,
            scope="muvar")
        # mu and logvar are both vectors of size latent_size
        recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

    with variable_scope.variable_scope("priorNetwork"):
        # P(XYZ)=P(Z|X)P(X)P(Y|X,Z)
        prior_fc1 = layers.fully_connected(
            cond_embedding, np.maximum(config.latent_size * 2, 100),
            activation_fn=tf.tanh, scope="fc1")
        prior_mulogvar = layers.fully_connected(
            prior_fc1, config.latent_size * 2, activation_fn=None,
            scope="muvar")
        prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

        # Sample z from the prior when `use_prior` is fed as true, otherwise
        # from the recognition network.
        latent_sample = tf.cond(
            self.use_prior,
            lambda: sample_gaussian(prior_mu, prior_logvar),
            lambda: sample_gaussian(recog_mu, recog_logvar))

    with variable_scope.variable_scope("generationNetwork"):
        gen_inputs = tf.concat([cond_embedding, latent_sample], 1)  # float32

        # BOW loss
        bow_fc1 = layers.fully_connected(
            gen_inputs, 400, activation_fn=tf.tanh, scope="bow_fc1")
        if config.keep_prob < 1.0:
            bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
        self.bow_logits = layers.fully_connected(
            bow_fc1, self.vocab_size, activation_fn=None, scope="bow_project")

        # Predicting Y (topic)
        if config.use_hcf:
            meta_fc1 = layers.fully_connected(
                gen_inputs, 400, activation_fn=tf.tanh, scope="meta_fc1")
            if config.keep_prob < 1.0:
                meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob)
            self.da_logits = layers.fully_connected(
                meta_fc1, self.num_topics, scope="da_project")  # float32
            da_prob = tf.nn.softmax(self.da_logits)
            # TODO change the name of this to predicted sentence topic
            pred_attribute_embedding = da_prob
            # Inference uses the predicted topic distribution, training uses
            # the one that was fed in.
            if forward:
                selected_attribute_embedding = pred_attribute_embedding
            else:
                selected_attribute_embedding = attribute_embedding
            dec_inputs = tf.concat(
                [gen_inputs, selected_attribute_embedding], 1)
        else:
            # if use_hcf not on, the model won't predict the Y
            self.da_logits = tf.zeros((batch_size, self.num_topics))
            dec_inputs = gen_inputs
            selected_attribute_embedding = None

        # Predicting whether or not end of paragraph
        self.paragraph_end_logits = layers.fully_connected(
            gen_inputs, 1, activation_fn=tf.tanh,
            scope="paragraph_end_fc1")  # float32

        # Decoder initial state: one projection of dec_inputs per layer.
        if config.num_layer > 1:
            dec_init_state = []
            for i in range(config.num_layer):
                temp_init = layers.fully_connected(
                    dec_inputs, self.dec_cell_size, activation_fn=None,
                    scope="init_state-%d" % i)
                if config.cell_type == 'lstm':
                    # LSTM needs a (c, h) pair; the same tensor is used for
                    # both
                    temp_init = rnn_cell.LSTMStateTuple(temp_init, temp_init)
                dec_init_state.append(temp_init)
            dec_init_state = tuple(dec_init_state)
        else:
            dec_init_state = layers.fully_connected(
                dec_inputs, self.dec_cell_size, activation_fn=None,
                scope="init_state")
            if config.cell_type == 'lstm':
                dec_init_state = rnn_cell.LSTMStateTuple(
                    dec_init_state, dec_init_state)

    with variable_scope.variable_scope("decoder"):
        dec_cell = self.get_rnncell(
            config.cell_type, self.dec_cell_size, config.keep_prob,
            config.num_layer)
        # projects into thing of vocab size. TODO no softmax?
        dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

        if forward:
            loop_func = decoder_fn_lib.context_decoder_fn_inference(
                None, dec_init_state, embedding,
                start_of_sequence_id=self.go_id,
                end_of_sequence_id=self.eos_id,
                maximum_length=self.max_utt_len,
                num_decoder_symbols=self.vocab_size,
                context_vector=selected_attribute_embedding)
            dec_input_embedding = None
            dec_seq_lens = None
        else:
            loop_func = decoder_fn_lib.context_decoder_fn_train(
                dec_init_state, selected_attribute_embedding)
            dec_input_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)
            # Teacher forcing: the decoder input is every token except the
            # last one, so the sequence is one step shorter.
            dec_input_embedding = dec_input_embedding[:, 0:-1, :]
            dec_seq_lens = self.output_lens - 1

            if config.keep_prob < 1.0:
                dec_input_embedding = tf.nn.dropout(
                    dec_input_embedding, config.keep_prob)

            # apply word dropping. Set dropped word to 0
            if config.dec_keep_prob < 1.0:
                # build the keep/throw-away mask
                keep_mask = tf.less_equal(
                    tf.random_uniform(
                        (batch_size, max_out_len - 1), minval=0.0,
                        maxval=1.0),
                    config.dec_keep_prob)
                keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                dec_input_embedding = dec_input_embedding * keep_mask
                dec_input_embedding = tf.reshape(
                    dec_input_embedding,
                    [-1, max_out_len - 1, config.embed_size])

        dec_outs, _, final_context_state = dynamic_rnn_decoder(
            dec_cell, loop_func, inputs=dec_input_embedding,
            sequence_length=dec_seq_lens, name='output_node')

        if final_context_state is not None:
            # Trim the context state to the decoded length, then mask it with
            # the sign of the per-step maximum logit.
            # NOTE(review): presumably final_context_state holds the emitted
            # word ids in reverse order - confirm against decoder_fn_lib.
            final_context_state = final_context_state[
                :, 0:array_ops.shape(dec_outs)[1]]
            mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
            self.dec_out_words = tf.multiply(
                tf.reverse(final_context_state, axis=[1]), mask)
        else:
            self.dec_out_words = tf.argmax(dec_outs, 2)

    if not forward:
        with variable_scope.variable_scope("loss"):
            labels = self.output_tokens[:, 1:]  # correct word tokens
            # 0 where the label id is 0, so those positions add no loss.
            label_mask = tf.to_float(tf.sign(labels))

            # Loss between words
            rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=dec_outs, labels=labels)
            rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1)
            self.avg_rc_loss = tf.reduce_mean(rc_loss)
            # used only for perplexity calculation. Not used for optimization
            self.rc_ppl = tf.exp(
                tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))

            # BOW loss: the same bag-of-words logits are scored against
            # every target position.
            tile_bow_logits = tf.tile(
                tf.expand_dims(self.bow_logits, 1), [1, max_out_len - 1, 1])
            bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tile_bow_logits, labels=labels) * label_mask
            bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1)
            self.avg_bow_loss = tf.reduce_mean(bow_loss)

            # Predict 0/1 (1 = last sentence in paragraph)
            # NOTE(review): floor_labels has shape (None, 1) and the logits
            # come from a 1-unit layer; a softmax over a single class is
            # always 1, so this loss looks constant. Confirm whether sigmoid
            # cross-entropy was intended.
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=self.floor_labels, logits=self.paragraph_end_logits)
            self.avg_end_loss = tf.reduce_mean(end_loss)

            # Topic prediction loss
            if config.use_hcf:
                # NOTE(review): the labels are logits / output_das and the
                # cross-entropy is negated - verify this is the intended
                # objective.
                div_prob = tf.divide(self.da_logits, self.output_das)
                self.avg_da_loss = tf.reduce_mean(
                    -tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.da_logits, labels=div_prob))
            else:
                self.avg_da_loss = 0.0

            kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
            self.avg_kld = tf.reduce_mean(kld)
            # KL annealing: the weight grows linearly with global_t up to 1.0.
            if log_dir is not None:
                kl_weights = tf.minimum(
                    tf.to_float(self.global_t) / config.full_kl_step, 1.0)
            else:
                kl_weights = tf.constant(1.0)

            self.kl_w = kl_weights
            self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld
            # Objective handed to the optimizer: ELBO plus auxiliary losses.
            aug_elbo = (self.avg_bow_loss + self.avg_da_loss + self.elbo +
                        self.avg_end_loss)

            tf.summary.scalar("da_loss", self.avg_da_loss)
            tf.summary.scalar("rc_loss", self.avg_rc_loss)
            tf.summary.scalar("elbo", self.elbo)
            tf.summary.scalar("kld", self.avg_kld)
            tf.summary.scalar("bow_loss", self.avg_bow_loss)
            tf.summary.scalar("paragraph_end_loss", self.avg_end_loss)

            self.summary_op = tf.summary.merge_all()

            self.log_p_z = norm_log_liklihood(
                latent_sample, prior_mu, prior_logvar)
            self.log_q_z_xy = norm_log_liklihood(
                latent_sample, recog_mu, recog_logvar)
            self.est_marginal = tf.reduce_mean(
                rc_loss + bow_loss - self.log_p_z + self.log_q_z_xy)

        self.optimize(sess, config, aug_elbo, log_dir)

    self.saver = tf.train.Saver(
        tf.global_variables(), write_version=tf.train.SaverDef.V2)
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                               num_symbols, embedding_size,
                               output_projection=None, feed_previous=False,
                               dtype=dtypes.float32, scope=None):
    """Embedding RNN sequence-to-sequence model with tied (shared) parameters.

    A single embedding of shape [num_symbols x embedding_size] is created and
    used to embed both `encoder_inputs` and `decoder_inputs`; the embedded
    sequences are then handed to `tied_rnn_seq2seq`.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size. When no
        `output_projection` is given it is wrapped in an
        OutputProjectionWrapper producing `num_symbols` outputs.
      num_symbols: Integer; number of symbols for both encoder and decoder.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W must be compatible with [None, num_symbols] and B with
        [num_symbols].
      feed_previous: Python bool or scalar Boolean Tensor. If true, decoder
        inputs after the first are taken from previous outputs through
        `_extract_argmax_and_embed`; if false, `decoder_inputs` are used as
        given. A Tensor value builds both variants and selects with `cond`.
      dtype: The dtype used for the projection check and forwarded to
        `tied_rnn_seq2seq` (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_tied_rnn_seq2seq".

    Returns:
      A tuple of the form (outputs, state): for a Python-bool `feed_previous`
      this is exactly what `tied_rnn_seq2seq` returns; for a Tensor it is the
      `cond` result split into all-but-last items and the last item.

    Raises:
      ValueError: When output_projection has the wrong shape.
    """
    if output_projection is not None:
        # Validate weights first, then biases.
        expected_shapes = ((0, [None, num_symbols]), (1, [num_symbols]))
        for position, expected_shape in expected_shapes:
            projection_part = ops.convert_to_tensor(
                output_projection[position], dtype=dtype)
            projection_part.get_shape().assert_is_compatible_with(
                expected_shape)

    with variable_scope.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
        with ops.device("/cpu:0"):
            embedding = variable_scope.get_variable(
                "embedding", [num_symbols, embedding_size])

        emb_encoder_inputs = []
        for symbol_ids in encoder_inputs:
            emb_encoder_inputs.append(
                embedding_ops.embedding_lookup(embedding, symbol_ids))
        emb_decoder_inputs = []
        for symbol_ids in decoder_inputs:
            emb_decoder_inputs.append(
                embedding_ops.embedding_lookup(embedding, symbol_ids))

        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols)

        if isinstance(feed_previous, bool):
            loop_function = None
            if feed_previous:
                loop_function = _extract_argmax_and_embed(
                    embedding, output_projection, True)
            return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs,
                                    cell, loop_function=loop_function,
                                    dtype=dtype)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        def decoder(feed_previous_bool):
            if feed_previous_bool:
                loop_function = _extract_argmax_and_embed(
                    embedding, output_projection, False)
                reuse = None
            else:
                loop_function = None
                # The second graph reuses the variables of the first.
                reuse = True
            current_scope = variable_scope.get_variable_scope()
            with variable_scope.variable_scope(current_scope, reuse=reuse):
                outputs, state = tied_rnn_seq2seq(
                    emb_encoder_inputs, emb_decoder_inputs, cell,
                    loop_function=loop_function, dtype=dtype)
                return outputs + [state]

        def _feed_previous_branch():
            return decoder(True)

        def _given_inputs_branch():
            return decoder(False)

        outputs_and_state = control_flow_ops.cond(
            feed_previous, _feed_previous_branch, _given_inputs_branch)
        return outputs_and_state[:-1], outputs_and_state[-1]
def dynamic_distraction_m2_decoder_wrapper(decoder_inputs,
                                           initial_state,
                                           distract_initial_state,
                                           attention_states,
                                           attention_states_query,
                                           cell_encoder,
                                           distraction_cell,
                                           num_symbols,
                                           embedding_size,
                                           num_heads=1,
                                           output_size=None,
                                           output_projection=None,
                                           feed_previous=False,
                                           update_embedding_for_previous=True,
                                           embedding_scope=None,
                                           dtype=None,
                                           scope=None,
                                           initial_state_attention=False):
    """Embed decoder inputs and run `dynamic_distraction_m2_decoder` on them.

    Args:
      decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
      initial_state: Initial decoder state, forwarded to the decoder.
      distract_initial_state: Initial state of the distraction cell,
        forwarded to the decoder.
      attention_states: Attention states, forwarded to the decoder.
      attention_states_query: Query attention states, forwarded to the
        decoder.
      cell_encoder: RNN cell passed to the decoder as both `cell1` and
        `cell2`; its `output_size` is the default for `output_size`.
      distraction_cell: RNN cell forwarded to the decoder.
      num_symbols: Integer, number of rows of the embedding matrix.
      embedding_size: Integer, length of the embedding vector of each symbol.
      num_heads: Number of attention heads, forwarded to the decoder.
      output_size: Size of the output vectors; if None,
        `cell_encoder.output_size` is used.
      output_projection: None or a pair (W, B). Only B is validated here (it
        must be compatible with shape [num_symbols]); the pair is used by the
        feed-previous loop function.
      feed_previous: Boolean; if true, a loop function built by
        `_extract_argmax_and_embed` feeds each step's argmax symbol back in.
        If false, `decoder_inputs` are used as given.
      update_embedding_for_previous: Boolean forwarded to
        `_extract_argmax_and_embed`; no effect if `feed_previous` is false.
      embedding_scope: Variable scope holding the "embedding" variable;
        defaults to "dynamic_distraction_m2_decoder_wrapper". The scope is
        entered with `reuse=True`, so the variable is expected to exist
        already.
      dtype: The dtype used for the variable scopes and the bias check.
      scope: VariableScope for the decoder subgraph; defaults to
        "dynamic_distraction_m2_decoder_wrapper".
      initial_state_attention: Forwarded to the decoder.

    Returns:
      Whatever `dynamic_distraction_m2_decoder` returns.

    Raises:
      ValueError: When the output projection bias has the wrong shape.
    """
    if output_size is None:
        output_size = cell_encoder.output_size
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    # The embedding is shared with whoever created it first, hence reuse=True.
    # (A leftover debug print of the scope name used to live here.)
    with variable_scope.variable_scope(
            embedding_scope or "dynamic_distraction_m2_decoder_wrapper",
            dtype=dtype, reuse=True):
        embedding = variable_scope.get_variable(
            "embedding", [num_symbols, embedding_size])
        loop_function = _extract_argmax_and_embed(
            embedding, output_projection,
            update_embedding_for_previous) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]

    with variable_scope.variable_scope(
            scope or "dynamic_distraction_m2_decoder_wrapper", dtype=dtype):
        return dynamic_distraction_m2_decoder(
            emb_inp,
            initial_state=initial_state,
            attention_states_query=attention_states_query,
            attention_states=attention_states,
            cell1=cell_encoder,
            cell2=cell_encoder,
            distract_initial_state=distract_initial_state,
            distraction_cell=distraction_cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled,
                            num_classes, num_true=1, sampled_values=None,
                            subtract_log_q=True, remove_accidental_hits=False,
                            partition_strategy="mod", name=None):
    """Helper function for nce_loss and sampled_softmax_loss functions.

    Builds the logits and labels of a sampled classification problem: the
    true classes of every example come first, followed by the sampled
    (negative) classes. When num_true > 1 each true class receives the target
    probability 1 / num_true, so targets sum to 1 per example.

    Args:
      weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`. The (possibly-partitioned) class embeddings.
      biases: A `Tensor` of shape `[num_classes]`. The class biases.
      inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
        activations of the input network.
      labels: A `Tensor` of shape `[batch_size, num_true]`; cast to `int64`
        if it has another dtype. The target classes.
      num_sampled: An `int`. The number of classes to randomly sample per
        batch.
      num_classes: An `int`. The number of possible classes.
      num_true: An `int`. The number of target classes per training example.
      sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler`
        function. If None, `log_uniform_candidate_sampler` is used.
      subtract_log_q: A `bool`. Whether to subtract the log expected count
        from the logits. Default is True. Turn off for Negative Sampling.
      remove_accidental_hits: A `bool`. Whether to adjust the logits of
        sampled classes that equal one of the target classes, using the
        weights reported by `compute_accidental_hits`. Default is False.
      partition_strategy: A string specifying the partitioning strategy,
        relevant if `len(weights) > 1`. Currently `"div"` and `"mod"` are
        supported. Default is `"mod"`.
      name: A name for the operation (optional).

    Returns:
      out_logits, out_labels: `Tensor` objects each with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
    """
    weight_shards = weights if isinstance(weights, list) else [weights]
    scope_values = weight_shards + [biases, inputs, labels]

    with ops.op_scope(scope_values, name, "compute_sampled_logits"):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        flat_labels = array_ops.reshape(labels, [-1])

        # Sample the negative labels unless the caller supplied them.
        #   sampled: [num_sampled]
        #   true_expected_count: [batch_size, 1]
        #   sampled_expected_count: [num_sampled]
        if sampled_values is None:
            sampled_values = (
                candidate_sampling_ops.log_uniform_candidate_sampler(
                    true_classes=labels,
                    num_true=num_true,
                    num_sampled=num_sampled,
                    unique=True,
                    range_max=num_classes))
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = sampled_values
        # pylint: enable=unpacking-non-sequence

        # One lookup covers both groups: the first batch_size * num_true rows
        # belong to the true classes, the remaining rows to the sampled ones.
        gathered_ids = array_ops.concat(0, [flat_labels, sampled])
        gathered_w = embedding_ops.embedding_lookup(
            weight_shards, gathered_ids,
            partition_strategy=partition_strategy)
        gathered_b = embedding_ops.embedding_lookup(biases, gathered_ids)

        # true_w: [batch_size * num_true, dim]; true_b: [batch_size * num_true]
        true_w_size = array_ops.pack([array_ops.shape(flat_labels)[0], -1])
        true_w = array_ops.slice(gathered_w, [0, 0], true_w_size)
        true_b = array_ops.slice(gathered_b, [0], array_ops.shape(flat_labels))

        # Row-wise products of inputs and true weights:
        # [batch_size, num_true, dim]
        dim = array_ops.shape(true_w)[1:2]
        true_w_3d_shape = array_ops.concat(0, [[-1, num_true], dim])
        inputs_3d = array_ops.expand_dims(inputs, 1)
        true_w_3d = array_ops.reshape(true_w, true_w_3d_shape)
        row_wise_dots = math_ops.mul(inputs_3d, true_w_3d)

        # Sum over dim and add the biases -> [batch_size, num_true] logits.
        dots_2d_shape = array_ops.concat(0, [[-1], dim])
        dots_as_matrix = array_ops.reshape(row_wise_dots, dots_2d_shape)
        true_logits = array_ops.reshape(
            _sum_rows(dots_as_matrix), [-1, num_true])
        true_b = array_ops.reshape(true_b, [-1, num_true])
        true_logits = true_logits + true_b

        # sampled_w: [num_sampled, dim]; sampled_b: [num_sampled]
        sampled_w_begin = array_ops.pack(
            [array_ops.shape(flat_labels)[0], 0])
        sampled_w = array_ops.slice(gathered_w, sampled_w_begin, [-1, -1])
        sampled_b = array_ops.slice(
            gathered_b, array_ops.shape(flat_labels), [-1])

        # X * W' + B -> [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(
            inputs, sampled_w, transpose_b=True) + sampled_b

        if remove_accidental_hits:
            acc_indices, acc_ids, acc_weights = (
                candidate_sampling_ops.compute_accidental_hits(
                    labels, sampled, num_true=num_true))

            # This is how SparseToDense expects the indices.
            acc_rows = array_ops.reshape(acc_indices, [-1, 1])
            acc_cols = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat(
                1, [acc_rows, acc_cols], "sparse_indices")
            # sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(0, [
                array_ops.shape(labels)[:1],
                array_ops.expand_dims(num_sampled, 0)
            ])
            if sampled_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
            sampled_logits = sampled_logits + sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in
            # sampled.
            true_logits = true_logits - math_ops.log(true_expected_count)
            sampled_logits = sampled_logits - math_ops.log(
                sampled_expected_count)

        # Output columns: true classes first (starting at column 0), then the
        # sampled classes.
        out_logits = array_ops.concat(1, [true_logits, sampled_logits])
        # Dividing the ones by num_true makes the per-example labels sum to
        # 1.0, i.e. form a proper probability distribution.
        true_targets = array_ops.ones_like(true_logits) / num_true
        sampled_targets = array_ops.zeros_like(sampled_logits)
        out_labels = array_ops.concat(1, [true_targets, sampled_targets])

    return out_logits, out_labels
def Body(it, cost):
    """Add the sum of row 0 of `embedding_matrix` to `cost` and advance `it`.

    `embedding_matrix` is taken from the enclosing scope.

    Returns:
      The pair `(it + 1, updated_cost)`.
    """
    first_row = embedding_ops.embedding_lookup(embedding_matrix, [0])
    row_total = math_ops.reduce_sum(first_row)
    cost += row_total
    next_it = it + 1
    return next_it, cost
def call(self, inputs):
    """Looks up the rows of `self.embeddings` selected by `inputs`.

    Inputs whose dtype is neither 'int32' nor 'int64' are cast to 'int32'
    before the lookup.
    """
    if K.dtype(inputs) not in ('int32', 'int64'):
        inputs = math_ops.cast(inputs, 'int32')
    return embedding_ops.embedding_lookup(self.embeddings, inputs)
def __init__(self, cell, embedding, first_input, end_token, initial_state,
             beam_width, vocab_size=None, output_fn=None,
             length_penalty_weight=0.0):
    """Initialize BeamSearchDecoder.

    Args:
        cell: An `RNNCell` instance.
        embedding: A callable that takes a tensor of `ids`, or the `params`
            argument for `embedding_lookup`.
        first_input: Tensor used as the first decoder input. Its leading
            dimension is taken as the batch size and it is tiled `beam_width`
            times along a new axis 1.
        end_token: `int32` scalar, the token that marks end of decoding.
        initial_state: A (possibly nested tuple of...) tensors and
            TensorArrays.
        beam_width: Python integer, the number of beams.
        vocab_size: (Optional) When given, it is used as the output size, and
            if `output_fn` is None a linear fully connected projection to
            `vocab_size` units is installed as the output function.
        output_fn: (Optional) Callable stored as the output function.
        length_penalty_weight: Float weight to penalize length.

    Raises:
        TypeError: if `cell` is not an instance of `RNNCell`.
        ValueError: if `end_token` is not a scalar.
    """
    if not isinstance(cell, core_rnn_cell.RNNCell):
        raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
    self._cell = cell

    # A default projection is only built when the caller gave a vocabulary
    # size but no explicit output function.
    if output_fn is None and vocab_size is not None:
        def _project_to_vocab(cell_output):
            return tf.contrib.layers.fully_connected(
                inputs=cell_output, num_outputs=vocab_size, activation_fn=None)
        self._output_fn = _project_to_vocab
    else:
        self._output_fn = output_fn

    if callable(embedding):
        self._embedding_fn = embedding
    else:
        def _lookup(ids):
            return embedding_ops.embedding_lookup(embedding, ids)
        self._embedding_fn = _lookup

    end_token_tensor = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    self._end_token = end_token_tensor
    if end_token_tensor.get_shape().ndims != 0:
        raise ValueError("end_token must be a scalar")

    self._vocab_size = vocab_size
    if vocab_size is not None:
        self._output_size = vocab_size
    else:
        self._output_size = self._cell.output_size

    # Prefer the static batch size; fall back to the dynamic shape when the
    # static one is unknown.
    static_batch_size = first_input.shape[0].value
    if static_batch_size is None:
        self._batch_size = array_ops.shape(first_input)[0]
    else:
        self._batch_size = static_batch_size

    self._beam_width = beam_width
    self._length_penalty_weight = length_penalty_weight
    self._initial_cell_state = nest.map_structure(
        self._maybe_split_batch_beams, initial_state, self._cell.state_size)

    expanded_input = array_ops.expand_dims(first_input, 1)
    self._start_inputs = array_ops.tile(
        expanded_input, [1, self._beam_width, 1])
    self._finished = array_ops.zeros(
        [self._batch_size, self._beam_width], dtype=dtypes.bool)
def call(self, x):
    """Returns the rows of `self.embedding` selected by the ids in `x`."""
    looked_up = embedding_ops.embedding_lookup(self.embedding, x)
    return looked_up
def __init__(self, sess, config, api, log_dir, forward, scope=None):
    """Builds the graph of an attention-pooled sequence regressor.

    Token ids from the `sensor` placeholder are embedded (id 0 is masked to a
    zero vector), encoded, pooled with a learned attention query and projected
    to a single value that is trained against the `output` placeholder with an
    absolute-difference loss.

    Args:
        sess: Session, stored and handed to `self.optimize`.
        config: Object providing `sent_cell_size`, `max_length`, `init_lr`,
            `lr_decay`, `embed_size`, `sent_type`, `att_size` and `keep_prob`.
        api: Not used by this constructor.
        log_dir: Passed through to `self.optimize`.
        forward: Not used by this constructor.
        scope: Stored on the instance.

    Raises:
        NotImplementedError: if `config.sent_type` is "bow" or "rnn"; only the
            "bi_rnn" encoder produces the pooled feature this model needs.
        ValueError: if `config.sent_type` is not a known sentence encoder.
    """
    self.vocab_size = 32
    self.sess = sess
    self.scope = scope
    self.sent_cell_size = config.sent_cell_size
    self.max_length = config.max_length

    with tf.name_scope("io"):
        # input token ids
        self.sensor = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None),
                                     name="sensor")

        # regression target, one value per example
        self.output = tf.placeholder(dtype=tf.float32,
                                     shape=(None, ),
                                     name="output")

        # optimization related variables
        self.learning_rate = tf.Variable(float(config.init_lr),
                                         trainable=False,
                                         name="learning_rate")
        self.learning_rate_decay_op = self.learning_rate.assign(
            tf.multiply(self.learning_rate, config.lr_decay))
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")

    with variable_scope.variable_scope("wordEmbedding"):
        self.embedding = tf.get_variable(
            "embedding", [self.vocab_size, config.embed_size],
            dtype=tf.float32)
        # Zero out row 0 so that id 0 embeds to an all-zero vector.
        embedding_mask = tf.constant(
            [0 if i == 0 else 1 for i in range(self.vocab_size)],
            dtype=tf.float32,
            shape=[self.vocab_size, 1])
        embedding = self.embedding * embedding_mask

        input_embedding = embedding_ops.embedding_lookup(
            embedding, self.sensor)

        # Sequence length = number of positions with a non-zero embedding.
        length_mask = tf.reduce_sum(tf.sign(
            tf.reduce_max(tf.abs(input_embedding), reduction_indices=2)),
                                    reduction_indices=1)
        length_mask = tf.to_int32(length_mask)
        mask = tf.sequence_mask(length_mask, self.max_length, tf.float32)
        # Large negative bias on masked-out positions so that the softmax
        # below gives them (almost) no attention weight.
        one = tf.ones_like(mask)
        bias = one - mask
        bias = -100000 * bias

        if config.sent_type == "bow":
            # The bag-of-words encoder was never wired in here, so no
            # `feature` tensor would exist for the layers below. Fail with
            # a clear message instead of a NameError further down.
            raise NotImplementedError(
                "sent_type 'bow' is not implemented for this model; "
                "use 'bi_rnn'")
        elif config.sent_type == "rnn":
            # Same situation as "bow": the unidirectional encoder is not
            # wired in.
            raise NotImplementedError(
                "sent_type 'rnn' is not implemented for this model; "
                "use 'bi_rnn'")
        elif config.sent_type == "bi_rnn":
            fwd_sent_cell = self.get_rnncell("gru",
                                             self.sent_cell_size,
                                             keep_prob=1.0,
                                             num_layer=1)
            bwd_sent_cell = self.get_rnncell("gru",
                                             self.sent_cell_size,
                                             keep_prob=1.0,
                                             num_layer=1)
            input_embedding, sent_size, hidden = get_bi_rnn_encode(
                input_embedding,
                fwd_sent_cell,
                bwd_sent_cell,
                scope="sent_bi_rnn")
            input_embedding = tf.expand_dims(input_embedding, 1)

            # Attention: score each hidden state against a learned query.
            query = tf.get_variable("query", [config.att_size],
                                    dtype=tf.float32)
            hidden_project = layers.fully_connected(
                hidden,
                config.att_size,
                activation_fn=None,
                biases_initializer=None,
                scope="att")
            vector_attn = tf.reduce_sum(tf.multiply(hidden_project, query),
                                        axis=2,
                                        keep_dims=True)
            bias = tf.expand_dims(bias, -1)
            attention_weights = tf.nn.softmax(vector_attn + bias, dim=1)
            self.weights = attention_weights
            attention = hidden * attention_weights
            feature = tf.reduce_sum(attention, 1)
        else:
            raise ValueError(
                "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

        # dropout on the pooled feature
        if config.keep_prob < 1.0:
            feature = tf.nn.dropout(feature, config.keep_prob)

        # project the pooled feature to a single predicted value
        predict = layers.fully_connected(feature,
                                         1,
                                         activation_fn=None,
                                         scope="fc")
        self.predict = tf.squeeze(predict)
        self.loss = tf.losses.absolute_difference(self.output, self.predict)

        tf.summary.scalar("loss", self.loss)
        self.summary_op = tf.summary.merge_all()

        self.optimize(sess, config, self.loss, log_dir)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2)
def generate_embedding_RNN_output(encoder_inputs,
                                  cell,
                                  num_encoder_symbols,
                                  word_embedding_size,
                                  embedding,
                                  num_heads=1,
                                  dtype=dtypes.float32,
                                  scope=None,
                                  initial_state_attention=False,
                                  sequence_length=None,
                                  bidirectional_rnn=False):
    """Generate RNN state outputs with word embeddings as inputs.

    Note that this example code does not include output label dependency
    modeling. One may add a loop function as in the rnn_decoder function in
    the tf seq2seq.py example to feed emitted label embedding back to the RNN
    state.

    Args:
        encoder_inputs: Iterable of id tensors, one per time step.
        cell: RNN cell; in the bidirectional case the same cell object is
            used for both directions.
        num_encoder_symbols: Vocabulary size of the embedding variable created
            in the unidirectional case.
        word_embedding_size: Embedding width of that variable.
        embedding: Embedding params used for the lookup in the bidirectional
            case.
        num_heads: Unused.
        dtype: dtype handed to the RNN.
        scope: Optional variable scope name; defaults to
            "generate_embedding_RNN_output".
        initial_state_attention: Unused.
        sequence_length: Optional sequence lengths handed to the RNN.
        bidirectional_rnn: If True, run a static bidirectional RNN, otherwise
            a static unidirectional RNN.

    Returns:
        A tuple `(encoder_outputs, encoder_state, attention_states)`, where
        `attention_states` is the per-step outputs reshaped to
        `[-1, 1, output_size]` (twice the cell output size in the
        bidirectional case) and concatenated along axis 1.
    """
    with variable_scope.variable_scope(
            scope or "generate_embedding_RNN_output"):
        if bidirectional_rnn:
            encoder_cell_fw = cell
            encoder_cell_bw = cell
            encoder_embedded_inputs = [
                embedding_ops.embedding_lookup(embedding, encoder_input)
                for encoder_input in encoder_inputs
            ]
            encoder_outputs, encoder_state_fw, encoder_state_bw = (
                rnn.static_bidirectional_rnn(encoder_cell_fw,
                                             encoder_cell_bw,
                                             encoder_embedded_inputs,
                                             sequence_length=sequence_length,
                                             dtype=dtype))
            encoder_state = array_ops.concat([
                array_ops.concat(encoder_state_fw, 1),
                array_ops.concat(encoder_state_bw, 1)
            ], 1)
            # Forward and backward outputs are concatenated, hence 2x width.
            top_states = [
                array_ops.reshape(e, [-1, 1, cell.output_size * 2])
                for e in encoder_outputs
            ]
            attention_states = array_ops.concat(top_states, 1)
        else:
            encoder_cell = cell
            # NOTE(review): this branch creates its own "embedding" variable
            # and ignores the `embedding` argument, unlike the bidirectional
            # branch. Kept as-is to avoid changing the variables of existing
            # checkpoints; confirm whether this is intended.
            embedding = variable_scope.get_variable(
                "embedding", [num_encoder_symbols, word_embedding_size])
            encoder_embedded_inputs = [
                embedding_ops.embedding_lookup(embedding, encoder_input)
                for encoder_input in encoder_inputs
            ]
            # `rnn.rnn` was renamed to `static_rnn` in TensorFlow 1.0; the
            # rest of this function already relies on the 1.x API.
            encoder_outputs, encoder_state = rnn.static_rnn(
                encoder_cell,
                encoder_embedded_inputs,
                sequence_length=sequence_length,
                dtype=dtype)
            encoder_state = array_ops.concat(encoder_state, 1)
            top_states = [
                array_ops.reshape(e, [-1, 1, cell.output_size])
                for e in encoder_outputs
            ]
            attention_states = array_ops.concat(top_states, 1)
        return encoder_outputs, encoder_state, attention_states
def _process_input_helper(self,
                          update_row_factors,
                          sp_input=None,
                          transpose_input=False,
                          row_weights=None):
    """Creates the graph for processing a sparse slice of input.

    Args:
      update_row_factors: if True, update or project the row_factors, else
        update or project the column factors.
      sp_input: Please refer to comments for update_row_factors,
        update_col_factors, project_row_factors, and project_col_factors for
        restrictions.
      transpose_input: If True, the input is logically transposed and then the
        corresponding rows/columns of the transposed input are updated.
      row_weights: If not None, this is the row/column weights to be used for
        the update or projection. If None, use the corresponding weights from
        the model. Note that the feature (column/row) weights will be
        determined by the model. When not None, it can either be a scalar or
        a rank-1 tensor with the same number of elements as the number of rows
        or columns to be updated/projected.

    Returns:
      A tuple consisting of the following elements:
      new_values: New values for the row/column factors.
      update_op: An op that assigns the newly computed values to the row/column
        factors.
      unregularized_loss: A tensor (scalar) that contains the normalized
        minibatch loss corresponding to sp_input, without the regularization
        term. Add the regularization term below to yield the loss.
      regularization: A tensor (scalar) that contains the normalized
        regularization term for the minibatch loss corresponding to sp_input.
      sum_weights: The sum of the weights corresponding to sp_input. This
        can be used with unregularized loss to calculate the root weighted
        squared error.
    """
    assert isinstance(sp_input, sparse_tensor.SparseTensor)

    # Pick the factors being solved for ("left") and the cached factors,
    # weights and gramian of the other side. For a column update the roles of
    # rows and columns are swapped and the transpose flag is flipped.
    if update_row_factors:
        left = self._row_factors
        right_factors = self._col_factors_cache
        row_wt = self._row_wt_cache
        col_wt = self._col_wt_cache
        total_rows = self._input_rows
        total_cols = self._input_cols
        sharding_func = WALSModel._get_sharding_func(
            self._input_rows, self._num_row_shards)
        gramian = self._col_gramian_cache
    else:
        left = self._col_factors
        right_factors = self._row_factors_cache
        row_wt = self._col_wt_cache
        col_wt = self._row_wt_cache
        total_rows = self._input_cols
        total_cols = self._input_rows
        sharding_func = WALSModel._get_sharding_func(
            self._input_cols, self._num_col_shards)
        gramian = self._row_gramian_cache
        transpose_input = not transpose_input

    # Note that the row indices of sp_input are based on the original full input
    # Here we reindex the rows and give them contiguous ids starting at 0.
    # We use tf.unique to achieve this reindexing. Note that this is done so
    # that the downstream kernel can assume that the input is "dense" along the
    # row dimension.
    row_ids, col_ids = array_ops.split(value=sp_input.indices,
                                       num_or_size_splits=2,
                                       axis=1)
    update_row_indices, all_row_ids = array_ops.unique(row_ids[:, 0])
    update_col_indices, all_col_ids = array_ops.unique(col_ids[:, 0])
    col_ids = array_ops.expand_dims(
        math_ops.cast(all_col_ids, dtypes.int64), 1)
    row_ids = array_ops.expand_dims(
        math_ops.cast(all_row_ids, dtypes.int64), 1)

    # update_indices: ids of the factors being solved for.
    # gather_indices: ids of the fixed factors looked up on the other side.
    if transpose_input:
        update_indices = update_col_indices
        row_shape = [
            math_ops.cast(
                array_ops.shape(update_row_indices)[0], dtypes.int64)
        ]
        gather_indices = update_row_indices
    else:
        update_indices = update_row_indices
        row_shape = [
            math_ops.cast(
                array_ops.shape(update_col_indices)[0], dtypes.int64)
        ]
        gather_indices = update_col_indices

    num_rows = math_ops.cast(
        array_ops.shape(update_indices)[0], dtypes.int64)
    col_shape = [num_rows]
    right = embedding_ops.embedding_lookup(right_factors,
                                           gather_indices,
                                           partition_strategy="div")
    # Rebuild the sparse input over the compacted (reindexed) ids.
    new_sp_indices = array_ops.concat([row_ids, col_ids], 1)
    new_sp_shape = (array_ops.concat([row_shape, col_shape], 0)
                    if transpose_input else array_ops.concat(
                        [col_shape, row_shape], 0))
    new_sp_input = sparse_tensor.SparseTensor(indices=new_sp_indices,
                                              values=sp_input.values,
                                              dense_shape=new_sp_shape)

    # Compute lhs and rhs of the normal equations
    total_lhs = (self._unobserved_weight * gramian)
    if self._regularization_matrix is not None:
        total_lhs += self._regularization_matrix
    if self._row_weights is None:
        # Special case of ALS. Use a much simpler update rule.
        total_rhs = (self._unobserved_weight *
                     sparse_ops.sparse_tensor_dense_matmul(
                         new_sp_input, right, adjoint_a=transpose_input))
        # TODO(rmlarsen): handle transposing in tf.matrix_solve instead of
        # transposing explicitly.
        # TODO(rmlarsen): multi-thread tf.matrix_solve.
        new_left_values = array_ops.transpose(
            linalg_ops.matrix_solve(total_lhs,
                                    array_ops.transpose(total_rhs)))
    else:
        if row_weights is None:
            # TODO(yifanchen): Add special handling for single shard without using
            # embedding_lookup and perform benchmarks for those cases. Same for
            # col_weights lookup below.
            row_weights_slice = embedding_ops.embedding_lookup(
                row_wt, update_indices, partition_strategy="div")
        else:
            # Caller-supplied weights: a scalar is broadcast to one weight per
            # updated index, a rank-1 tensor is used as given (cast to
            # float32). Anything of higher rank fails the assertion.
            num_indices = array_ops.shape(update_indices)[0]
            with ops.control_dependencies([
                    check_ops.assert_less_equal(
                        array_ops.rank(row_weights), 1)
            ]):
                row_weights_slice = control_flow_ops.cond(
                    math_ops.equal(array_ops.rank(row_weights), 0),
                    lambda: (array_ops.ones([num_indices]) * row_weights),
                    lambda: math_ops.cast(row_weights, dtypes.float32))

        col_weights = embedding_ops.embedding_lookup(
            col_wt, gather_indices, partition_strategy="div")
        partial_lhs, total_rhs = (
            gen_factorization_ops.wals_compute_partial_lhs_and_rhs(
                right,
                col_weights,
                self._unobserved_weight,
                row_weights_slice,
                new_sp_input.indices,
                new_sp_input.values, [],
                num_rows,
                transpose_input,
                name="wals_compute_partial_lhs_rhs"))
        # One linear system per updated row: the shared lhs gets a leading
        # batch dimension before the per-row partial lhs is added.
        total_lhs = array_ops.expand_dims(total_lhs, 0) + partial_lhs
        total_rhs = array_ops.expand_dims(total_rhs, -1)
        new_left_values = array_ops.squeeze(
            linalg_ops.matrix_solve(total_lhs, total_rhs), [2])

    update_op_name = "row_update" if update_row_factors else "col_update"
    update_op = self.scatter_update(left,
                                    update_indices,
                                    new_left_values,
                                    sharding_func,
                                    name=update_op_name)

    # Create the loss subgraph
    loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                     if transpose_input else new_sp_input)
    # sp_approx is the low rank estimate of the input matrix, formed by
    # computing the product <\\(u_i, v_j\\)> for (i, j) in loss_sp_input.indices.
    sp_approx_vals = gen_factorization_ops.masked_matmul(
        new_left_values,
        right,
        loss_sp_input.indices,
        transpose_a=False,
        transpose_b=True)
    sp_approx = sparse_tensor.SparseTensor(loss_sp_input.indices,
                                           sp_approx_vals,
                                           loss_sp_input.dense_shape)

    sp_approx_sq = math_ops.square(sp_approx)
    sp_residual = sparse_ops.sparse_add(loss_sp_input, sp_approx * (-1))
    sp_residual_sq = math_ops.square(sp_residual)
    # A constant 0 stands in for the weights when the model has none, which
    # zeroes the weighted term of the loss below.
    # NOTE(review): `row_weights_slice` / `col_weights` are only bound when
    # self._row_weights is not None — presumably the model guarantees row and
    # column weights are both set or both None; confirm.
    row_wt_mat = (constant_op.constant(0.) if self._row_weights is None else
                  array_ops.expand_dims(row_weights_slice, 1))
    col_wt_mat = (constant_op.constant(0.) if self._col_weights is None else
                  array_ops.expand_dims(col_weights, 0))

    # We return the normalized loss
    partial_row_gramian = math_ops.matmul(new_left_values,
                                          new_left_values,
                                          transpose_a=True)
    # Scales minibatch quantities by (total rows) / (rows in this batch).
    normalization_factor = total_rows / math_ops.cast(
        num_rows, dtypes.float32)

    unregularized_loss = (
        self._unobserved_weight * (  # pyformat line break
            sparse_ops.sparse_reduce_sum(sp_residual_sq) -  # pyformat break
            sparse_ops.sparse_reduce_sum(sp_approx_sq) +  # pyformat break
            math_ops.trace(math_ops.matmul(partial_row_gramian, gramian))) +
        sparse_ops.sparse_reduce_sum(
            row_wt_mat * (sp_residual_sq * col_wt_mat))) * normalization_factor

    if self._regularization is not None:
        regularization = self._regularization * (
            math_ops.trace(partial_row_gramian) * normalization_factor +
            math_ops.trace(gramian))
    else:
        regularization = constant_op.constant(0.)

    sum_weights = self._unobserved_weight * math_ops.cast(
        total_rows * total_cols, dtypes.float32)
    if self._row_weights is not None and self._col_weights is not None:
        # Add the observed-entry weights (row weight * column weight) for the
        # entries present in this slice.
        ones = sparse_tensor.SparseTensor(
            indices=loss_sp_input.indices,
            values=array_ops.ones(array_ops.shape(loss_sp_input.values)),
            dense_shape=loss_sp_input.dense_shape)
        sum_weights += sparse_ops.sparse_reduce_sum(
            row_wt_mat * (ones * col_wt_mat)) * normalization_factor

    return (new_left_values, update_op, unregularized_loss, regularization,
            sum_weights)
def call(self, inputs):
    """Casts `inputs` to int32 and returns the matching rows of `self.embeddings`."""
    int_ids = math_ops.cast(inputs, 'int32')
    return embedding_ops.embedding_lookup(self.embeddings, int_ids)
def testConstructionNonSharded(self):
    """Builds an embedding lookup over a single-element params list."""
    graph = ops.Graph()
    with graph.as_default():
        initial_value = array_ops.zeros(shape=[100, 100],
                                        dtype=dtypes.float32)
        table = variables.Variable(initial_value)
        lookup_ids = constant_op.constant([0, 1, 1, 7], dtype=dtypes.int32)
        # Only graph construction is exercised; the result is not evaluated.
        embedding_ops.embedding_lookup([table], lookup_ids)
def _sampled_scattered_embedding_lookup(
        params, values, dimension=None, sampled_candidates=None, hash_key=None,
        name=None):
    """Looks up embeddings using parameter hashing for each value in `values`.

    Component i of the embedding of a value v is the weight whose index is a
    fingerprint of the pair (v, i) ("feature hashing", see
    http://arxiv.org/pdf/1504.04788.pdf). When `sampled_candidates` is given
    only the listed components are looked up, otherwise all `dimension`
    components are.

    If `params` is a list, it represents a partition of the embedding
    parameters. Each tensor in the list should have the same length, except
    for the first ones which may have an additional element. For instance 10
    parameters can be partitioned in 4 tensors with length `[3, 3, 2, 2]`.

    Args:
        params: A `Tensor`, `list` of `Tensors`, or `PartitionedVariable`.
            Each tensor must be of rank 1 with fully-defined shape.
        values: `Tensor` of values to be embedded with shape `[d0, ..., dn]`.
        dimension: Embedding dimension. The user must specify either
            `dimension` or `sampled_candidates`.
        sampled_candidates: An optional `Tensor` of slice indices to keep
            along the final dimension with shape `[d0, ..., dn, N]`. If given,
            `dimension` is ignored. If `None`, looks up all candidates.
        hash_key: Forwarded to `sparse_feature_cross` as its hash key
            (optional).
        name: An optional name for this op.

    Returns:
        A `Tensor` with shape `[d0, ..., dn, dimension]`.
        If `sampled_candidates` is given, the output shape is
        `[d0, ..., dn, N]`.

    Raises:
        ValueError: if dimension is not positive or the partition size is
            invalid.
    """
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)
    if not isinstance(params, list):
        params = [params]

    with ops.name_scope(name, "scattered_embedding_lookup",
                        params + [dimension, values]):
        # Remember the incoming shape, then flatten to one value per row.
        values_shape = array_ops.shape(values)
        values = array_ops.reshape(values, [-1, 1])

        if sampled_candidates is not None:
            # The embedding width is the size of the last candidates axis.
            candidates_dims = array_ops.shape(sampled_candidates)
            last_axis = math_ops.subtract(
                array_ops.rank(sampled_candidates), 1)
            dimension = candidates_dims[last_axis]

            sampled_candidates_shape = array_ops.shape(sampled_candidates)
            dimension_tensor = array_ops.reshape(dimension, shape=[1,])
            expected_shape = array_ops.concat(
                [values_shape, dimension_tensor], 0)
            shapes_match = math_ops.reduce_all(
                math_ops.equal(sampled_candidates_shape, expected_shape))
            shape_check = control_flow_ops.Assert(
                shapes_match,
                ["The shape of sampled_candidates: ",
                 sampled_candidates_shape,
                 " does not match the shape of values: ", values_shape])
            with ops.control_dependencies([shape_check]):
                # Flatten the candidates the same way the values were.
                sampled_candidates = array_ops.reshape(
                    sampled_candidates, [-1, dimension])
        else:
            if dimension is None:
                raise ValueError(
                    "You must specify either dimension or sampled_candidates.")
            if dimension <= 0:
                raise ValueError(
                    "Dimension must be >0. Given is %d" % dimension)
            # Every value gets the full component range [0, dimension).
            all_components = array_ops.expand_dims(
                math_ops.range(0, dimension), 0)
            sampled_candidates = array_ops.tile(
                all_components, array_ops.shape(values))

        # Collect and validate the static size of every partition.
        partition_sizes = []
        for partition in params:
            partition_shape = partition.get_shape()
            partition_shape.assert_has_rank(1)
            partition_shape.assert_is_fully_defined()
            partition_sizes.append(partition_shape[0].value)
        num_partitions = len(params)
        num_params = sum(partition_sizes)  # Total number of parameters.

        # Assert the size of each partition.
        for index, actual_size in enumerate(partition_sizes):
            expected_size = (num_params - index - 1) // num_partitions + 1
            if actual_size != expected_size:
                raise ValueError(
                    "Tensor %d in params has size %d, expected %d." %
                    (index, actual_size, expected_size))

        # With two values v1 and v2 and 3 dimensions, we will cross
        # [[0, 1, 2], [0, 1, 2]] with [[v1], [v2]].
        hashed_ids = sparse_feature_cross_op.sparse_feature_cross(
            [sampled_candidates, values],
            hashed_output=True,
            num_buckets=num_params,
            hash_key=hash_key)
        dense_ids = sparse_ops.sparse_tensor_to_dense(hashed_ids)

        # No need to validate the indices since we have checked the params
        # dimensions and we know the largest id.
        flat_result = embedding_ops.embedding_lookup(
            params, dense_ids, partition_strategy="div")

        output_shape = array_ops.concat([values_shape, [dimension]], 0)
        return array_ops.reshape(flat_result, output_shape)
def f():
    """Watches `embedding` on the tape and returns 1.0 minus its lookup at `x`."""
    # `tape`, `embedding` and `x` come from the enclosing scope.
    tape.watch_variable(embedding)
    looked_up = embedding_ops.embedding_lookup(embedding, x)
    one = constant_op.constant(1.0, dtypes.float32)
    return one - looked_up
def __init__(self, sess, config, api, log_dir, forward, scope=None):
    """Builds the graph of the conditional variational dialog model.

    The graph embeds the dialog context, encodes it with a context RNN,
    combines it with topic and profile features, samples a latent variable
    from either the prior or the recognition network, and decodes a response.
    When `forward` is False the training losses and the optimizer are built
    as well.

    Args:
        sess: Session, stored and handed to `self.optimize`.
        config: Object providing the hyper-parameters read below (cell sizes,
            embedding sizes, `sent_type`, `use_hcf`, `keep_prob`,
            `dec_keep_prob`, `latent_size`, `num_layer`, `cell_type`,
            `full_kl_step`, `init_lr`, `lr_decay`, `max_utt_len`).
        api: Object providing `vocab`, `rev_vocab`, `topic_vocab` and
            `dialog_act_vocab`.
        log_dir: If not None, the KL weight is annealed as
            `global_t / config.full_kl_step` (capped at 1.0); otherwise it is
            fixed to 1.0. Also passed to `self.optimize`.
        forward: If True build the inference decoder; if False build the
            training decoder, the losses and the optimizer.
        scope: Stored on the instance.

    Raises:
        ValueError: if `config.sent_type` is not one of bow, rnn, bi_rnn.
    """
    self.vocab = api.vocab
    self.rev_vocab = api.rev_vocab
    self.vocab_size = len(self.vocab)
    self.topic_vocab = api.topic_vocab
    self.topic_vocab_size = len(self.topic_vocab)
    self.da_vocab = api.dialog_act_vocab
    self.da_vocab_size = len(self.da_vocab)
    self.sess = sess
    self.scope = scope
    self.max_utt_len = config.max_utt_len
    self.go_id = self.rev_vocab["<s>"]
    self.eos_id = self.rev_vocab["</s>"]
    self.context_cell_size = config.cxt_cell_size
    self.sent_cell_size = config.sent_cell_size
    self.dec_cell_size = config.dec_cell_size

    with tf.name_scope("io"):
        # all dialog context and known attributes
        self.input_contexts = tf.placeholder(dtype=tf.int32,
                                             shape=(None, None,
                                                    self.max_utt_len),
                                             name="dialog_context")
        self.floors = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None),
                                     name="floor")
        self.context_lens = tf.placeholder(dtype=tf.int32,
                                           shape=(None, ),
                                           name="context_lens")
        self.topics = tf.placeholder(dtype=tf.int32,
                                     shape=(None, ),
                                     name="topics")
        self.my_profile = tf.placeholder(dtype=tf.float32,
                                         shape=(None, 4),
                                         name="my_profile")
        self.ot_profile = tf.placeholder(dtype=tf.float32,
                                         shape=(None, 4),
                                         name="ot_profile")

        # target response given the dialog context
        self.output_tokens = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None),
                                            name="output_token")
        self.output_lens = tf.placeholder(dtype=tf.int32,
                                          shape=(None, ),
                                          name="output_lens")
        self.output_das = tf.placeholder(dtype=tf.int32,
                                         shape=(None, ),
                                         name="output_dialog_acts")

        # optimization related variables
        self.learning_rate = tf.Variable(float(config.init_lr),
                                         trainable=False,
                                         name="learning_rate")
        self.learning_rate_decay_op = self.learning_rate.assign(
            tf.multiply(self.learning_rate, config.lr_decay))
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

    # Dynamic sizes taken from the placeholders' runtime shapes.
    max_dialog_len = array_ops.shape(self.input_contexts)[1]
    max_out_len = array_ops.shape(self.output_tokens)[1]
    batch_size = array_ops.shape(self.input_contexts)[0]

    with variable_scope.variable_scope("topicEmbedding"):
        t_embedding = tf.get_variable(
            "embedding", [self.topic_vocab_size, config.topic_embed_size],
            dtype=tf.float32)
        topic_embedding = embedding_ops.embedding_lookup(
            t_embedding, self.topics)

    if config.use_hcf:
        with variable_scope.variable_scope("dialogActEmbedding"):
            d_embedding = tf.get_variable(
                "embedding", [self.da_vocab_size, config.da_embed_size],
                dtype=tf.float32)
            da_embedding = embedding_ops.embedding_lookup(
                d_embedding, self.output_das)

    with variable_scope.variable_scope("wordEmbedding"):
        self.embedding = tf.get_variable(
            "embedding", [self.vocab_size, config.embed_size],
            dtype=tf.float32)
        # Zero out row 0 so that id 0 (used as padding) embeds to an
        # all-zero vector.
        embedding_mask = tf.constant(
            [0 if i == 0 else 1 for i in range(self.vocab_size)],
            dtype=tf.float32,
            shape=[self.vocab_size, 1])
        embedding = self.embedding * embedding_mask

        # Flatten all context tokens for the lookup, then regroup them per
        # utterance.
        input_embedding = embedding_ops.embedding_lookup(
            embedding, tf.reshape(self.input_contexts, [-1]))
        input_embedding = tf.reshape(
            input_embedding, [-1, self.max_utt_len, config.embed_size])
        output_embedding = embedding_ops.embedding_lookup(
            embedding, self.output_tokens)

        if config.sent_type == "bow":
            input_embedding, sent_size = get_bow(input_embedding)
            output_embedding, _ = get_bow(output_embedding)
        elif config.sent_type == "rnn":
            sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                         config.keep_prob, 1)
            input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                        sent_cell,
                                                        scope="sent_rnn")
            output_embedding, _ = get_rnn_encode(output_embedding,
                                                 sent_cell,
                                                 self.output_lens,
                                                 scope="sent_rnn",
                                                 reuse=True)
        elif config.sent_type == "bi_rnn":
            fwd_sent_cell = self.get_rnncell("gru",
                                             self.sent_cell_size,
                                             keep_prob=1.0,
                                             num_layer=1)
            bwd_sent_cell = self.get_rnncell("gru",
                                             self.sent_cell_size,
                                             keep_prob=1.0,
                                             num_layer=1)
            input_embedding, sent_size = get_bi_rnn_encode(
                input_embedding,
                fwd_sent_cell,
                bwd_sent_cell,
                scope="sent_bi_rnn")
            output_embedding, _ = get_bi_rnn_encode(output_embedding,
                                                    fwd_sent_cell,
                                                    bwd_sent_cell,
                                                    self.output_lens,
                                                    scope="sent_bi_rnn",
                                                    reuse=True)
        else:
            raise ValueError(
                "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

        # reshape input into dialogs
        input_embedding = tf.reshape(input_embedding,
                                     [-1, max_dialog_len, sent_size])
        if config.keep_prob < 1.0:
            input_embedding = tf.nn.dropout(input_embedding,
                                            config.keep_prob)

        # convert floors into 1 hot
        floor_one_hot = tf.one_hot(tf.reshape(self.floors, [-1]),
                                   depth=2,
                                   dtype=tf.float32)
        floor_one_hot = tf.reshape(floor_one_hot, [-1, max_dialog_len, 2])

        joint_embedding = tf.concat([input_embedding, floor_one_hot], 2,
                                    "joint_embedding")

    with variable_scope.variable_scope("contextRNN"):
        enc_cell = self.get_rnncell(config.cell_type,
                                    self.context_cell_size,
                                    keep_prob=1.0,
                                    num_layer=config.num_layer)
        # and enc_last_state will be same as the true last state
        _, enc_last_state = tf.nn.dynamic_rnn(
            enc_cell,
            joint_embedding,
            dtype=tf.float32,
            sequence_length=self.context_lens)

        if config.num_layer > 1:
            enc_last_state = tf.concat(enc_last_state, 1)

    # combine with other attributes
    if config.use_hcf:
        attribute_embedding = da_embedding
        attribute_fc1 = layers.fully_connected(attribute_embedding,
                                               30,
                                               activation_fn=tf.tanh,
                                               scope="attribute_fc1")

    # context plus meta data such as topic and speaker profiles
    cond_list = [
        topic_embedding, self.my_profile, self.ot_profile, enc_last_state
    ]
    cond_embedding = tf.concat(cond_list, 1)

    # Recognition network q(z | x, c, y) approximating the true posterior.
    with variable_scope.variable_scope("recognitionNetwork"):
        if config.use_hcf:
            # c: condition, x: response, y: attribute
            recog_input = tf.concat(
                [cond_embedding, output_embedding, attribute_fc1], 1)
        else:
            recog_input = tf.concat([cond_embedding, output_embedding], 1)
        self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
            recog_input,
            config.latent_size * 2,
            activation_fn=None,
            scope="muvar")
        recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

    with variable_scope.variable_scope("priorNetwork"):
        # P(XYZ)=P(Z|X)P(X)P(Y|X,Z)
        prior_fc1 = layers.fully_connected(
            cond_embedding,
            int(np.maximum(config.latent_size * 2, 100)),
            activation_fn=tf.tanh,
            scope="fc1")
        prior_mulogvar = layers.fully_connected(prior_fc1,
                                                config.latent_size * 2,
                                                activation_fn=None,
                                                scope="muvar")
        prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

        # use sampled Z or posterior Z
        latent_sample = tf.cond(
            self.use_prior,
            lambda: sample_gaussian(prior_mu, prior_logvar),
            lambda: sample_gaussian(recog_mu, recog_logvar))

    with variable_scope.variable_scope("generationNetwork"):
        gen_inputs = tf.concat([cond_embedding, latent_sample], 1)

        # BOW loss
        bow_fc1 = layers.fully_connected(gen_inputs,
                                         400,
                                         activation_fn=tf.tanh,
                                         scope="bow_fc1")
        if config.keep_prob < 1.0:
            bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
        self.bow_logits = layers.fully_connected(bow_fc1,
                                                 self.vocab_size,
                                                 activation_fn=None,
                                                 scope="bow_project")

        # Y loss
        if config.use_hcf:
            meta_fc1 = layers.fully_connected(gen_inputs,
                                              400,
                                              activation_fn=tf.tanh,
                                              scope="meta_fc1")
            if config.keep_prob < 1.0:
                meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob)
            self.da_logits = layers.fully_connected(meta_fc1,
                                                    self.da_vocab_size,
                                                    scope="da_project")
            da_prob = tf.nn.softmax(self.da_logits)
            pred_attribute_embedding = tf.matmul(da_prob, d_embedding)
            # At inference use the predicted attribute, at training time the
            # ground-truth one.
            if forward:
                selected_attribute_embedding = pred_attribute_embedding
            else:
                selected_attribute_embedding = attribute_embedding
            dec_inputs = tf.concat(
                [gen_inputs, selected_attribute_embedding], 1)
        else:
            self.da_logits = tf.zeros((batch_size, self.da_vocab_size))
            dec_inputs = gen_inputs
            # Without attribute features there is no extra context for the
            # decoder. The name must still be bound because both decoder
            # functions below receive it unconditionally.
            # NOTE(review): assumes decoder_fn_lib treats a None context
            # vector as "no extra context" — confirm.
            selected_attribute_embedding = None

        # Decoder
        if config.num_layer > 1:
            dec_init_state = [
                layers.fully_connected(dec_inputs,
                                       self.dec_cell_size,
                                       activation_fn=None,
                                       scope="init_state-%d" % i)
                for i in range(config.num_layer)
            ]
            dec_init_state = tuple(dec_init_state)
        else:
            dec_init_state = layers.fully_connected(dec_inputs,
                                                    self.dec_cell_size,
                                                    activation_fn=None,
                                                    scope="init_state")

    with variable_scope.variable_scope("decoder"):
        dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                    config.keep_prob, config.num_layer)
        dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

        if forward:
            loop_func = decoder_fn_lib.context_decoder_fn_inference(
                None,
                dec_init_state,
                embedding,
                start_of_sequence_id=self.go_id,
                end_of_sequence_id=self.eos_id,
                maximum_length=self.max_utt_len,
                num_decoder_symbols=self.vocab_size,
                context_vector=selected_attribute_embedding)
            dec_input_embedding = None
            dec_seq_lens = None
        else:
            loop_func = decoder_fn_lib.context_decoder_fn_train(
                dec_init_state, selected_attribute_embedding)
            dec_input_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)
            # Decoder inputs are the target tokens without the last one.
            dec_input_embedding = dec_input_embedding[:, 0:-1, :]
            dec_seq_lens = self.output_lens - 1

            if config.keep_prob < 1.0:
                dec_input_embedding = tf.nn.dropout(
                    dec_input_embedding, config.keep_prob)

            # apply word dropping. Set dropped word to 0
            if config.dec_keep_prob < 1.0:
                keep_mask = tf.less_equal(
                    tf.random_uniform((batch_size, max_out_len - 1),
                                      minval=0.0,
                                      maxval=1.0), config.dec_keep_prob)
                keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                dec_input_embedding = dec_input_embedding * keep_mask
                dec_input_embedding = tf.reshape(
                    dec_input_embedding,
                    [-1, max_out_len - 1, config.embed_size])

        dec_outs, _, final_context_state = dynamic_rnn_decoder(
            dec_cell,
            loop_func,
            inputs=dec_input_embedding,
            sequence_length=dec_seq_lens)

        if final_context_state is not None:
            final_context_state = final_context_state[
                :, 0:array_ops.shape(dec_outs)[1]]
            mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
            self.dec_out_words = tf.multiply(
                tf.reverse(final_context_state, axis=[1]), mask)
        else:
            self.dec_out_words = tf.argmax(dec_outs, 2)

    if not forward:
        with variable_scope.variable_scope("loss"):
            # Targets are the output tokens shifted by one; token id 0 is
            # masked out of every loss.
            labels = self.output_tokens[:, 1:]
            label_mask = tf.to_float(tf.sign(labels))

            rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=dec_outs, labels=labels)
            rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                    reduction_indices=1)
            self.avg_rc_loss = tf.reduce_mean(rc_loss)
            # used only for perplexity calculation. Not used for optimization
            self.rc_ppl = tf.exp(
                tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))

            # bag-of-words loss, treated as an n-trial multimodal
            # distribution
            tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1),
                                      [1, max_out_len - 1, 1])
            bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tile_bow_logits, labels=labels) * label_mask
            bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1)
            self.avg_bow_loss = tf.reduce_mean(bow_loss)

            # reconstruct the meta info about X
            if config.use_hcf:
                da_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.da_logits, labels=self.output_das)
                self.avg_da_loss = tf.reduce_mean(da_loss)
            else:
                self.avg_da_loss = 0.0

            kld = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                               prior_logvar)
            self.avg_kld = tf.reduce_mean(kld)
            if log_dir is not None:
                kl_weights = tf.minimum(
                    tf.to_float(self.global_t) / config.full_kl_step, 1.0)
            else:
                kl_weights = tf.constant(1.0)

            self.kl_w = kl_weights
            self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld
            aug_elbo = self.avg_bow_loss + self.avg_da_loss + self.elbo

            tf.summary.scalar("da_loss", self.avg_da_loss)
            tf.summary.scalar("rc_loss", self.avg_rc_loss)
            tf.summary.scalar("elbo", self.elbo)
            tf.summary.scalar("kld", self.avg_kld)
            tf.summary.scalar("bow_loss", self.avg_bow_loss)

            self.summary_op = tf.summary.merge_all()

            self.log_p_z = norm_log_liklihood(latent_sample, prior_mu,
                                              prior_logvar)
            self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu,
                                                 recog_logvar)
            self.est_marginal = tf.reduce_mean(rc_loss + bow_loss -
                                               self.log_p_z +
                                               self.log_q_z_xy)

        self.optimize(sess, config, aug_elbo, log_dir)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2)
def testRaggedMaxNorm(self):
    """Check `embedding_lookup` with ragged ids and a `max_norm` of 1.0."""
    # One-row table whose single value (2.0) is larger than max_norm.
    table = constant_op.constant([[2.0]])
    ragged_ids = ragged_factory_ops.constant(
        [[0, 0], [0]], dtype=dtypes.int32)

    looked_up = embedding_ops.embedding_lookup(
        [table], ragged_ids, max_norm=1.0)

    # Every looked-up value is expected to come back as 1.0, with the
    # ragged row structure of the ids preserved.
    expected = [[[1.0], [1.0]], [[1.0]]]
    self.assertAllEqual(looked_up, expected)
def __init__(self, cell, embedding, start_tokens, end_token, initial_state,
             beam_width, output_layer=None, length_penalty_weight=0.0,
             coverage_penalty_weight=0.0, reorder_tensor_arrays=True):
    """Set up the state needed to run beam search decoding.

    Args:
      cell: An `RNNCell` instance.
      embedding: Either a callable mapping a tensor of ids to their
        embeddings, or a value usable as the `params` argument of
        `embedding_lookup`.
      start_tokens: `int32` vector of shape `[batch_size]` holding the start
        tokens.
      end_token: `int32` scalar marking the end of decoding.
      initial_state: A (possibly nested tuple of) tensors and TensorArrays.
      beam_width: Python integer, the number of beams.
      output_layer: (Optional) A `tf.layers.Layer` instance (for example
        `tf.layers.Dense`) applied to the RNN output before the result is
        stored or sampled.
      length_penalty_weight: Float weight penalizing length. 0.0 disables it.
      coverage_penalty_weight: Float weight penalizing coverage of the source
        sentence. 0.0 disables it.
      reorder_tensor_arrays: If `True`, elements of `TensorArray`s in the cell
        state are reordered along the beam search path; when reordering is
        possible the stacked form is returned, otherwise the `TensorArray` is
        returned unchanged. Pass `False` when the cell state holds
        `TensorArray`s that cannot be reordered.

    Raises:
      TypeError: if `cell` is not an `RNNCell`, or `output_layer` is not a
        `tf.layers.Layer`.
      ValueError: if `start_tokens` is not a vector or `end_token` is not a
        scalar.
    """
    rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
    layer_is_valid = (output_layer is None or
                      isinstance(output_layer, layers_base.Layer))
    if not layer_is_valid:
        raise TypeError("output_layer must be a Layer, received: %s" %
                        type(output_layer))

    self._cell = cell
    self._output_layer = output_layer
    self._reorder_tensor_arrays = reorder_tensor_arrays

    if callable(embedding):
        self._embedding_fn = embedding
    else:
        def _lookup(ids):
            return embedding_ops.embedding_lookup(embedding, ids)
        self._embedding_fn = _lookup

    # The attributes are assigned before validation, as callers may observe
    # them even when the shape checks below raise.
    self._start_tokens = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    start_rank = self._start_tokens.get_shape().ndims
    if start_rank != 1:
        raise ValueError("start_tokens must be a vector")

    self._end_token = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    end_rank = self._end_token.get_shape().ndims
    if end_rank != 0:
        raise ValueError("end_token must be a scalar")

    self._batch_size = array_ops.size(start_tokens)
    self._beam_width = beam_width
    self._length_penalty_weight = length_penalty_weight
    self._coverage_penalty_weight = coverage_penalty_weight

    # Batch size and beam width are set above, before the per-state helper
    # is mapped over the initial state.
    self._initial_cell_state = nest.map_structure(
        self._maybe_split_batch_beams, initial_state, self._cell.state_size)

    # Repeat each start token once per beam, then embed the tiled ids.
    tokens_as_column = array_ops.expand_dims(self._start_tokens, 1)
    self._start_tokens = array_ops.tile(
        tokens_as_column, [1, self._beam_width])
    self._start_inputs = self._embedding_fn(self._start_tokens)

    # One-hot with inverted on/off values: index 0 of every row is False and
    # every other beam is True.
    first_beam = array_ops.zeros([self._batch_size], dtype=dtypes.int32)
    self._finished = array_ops.one_hot(
        first_beam,
        depth=self._beam_width,
        on_value=False,
        off_value=True,
        dtype=dtypes.bool)
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                               num_symbols, output_projection=None,
                               feed_previous=False, dtype=dtypes.float32,
                               scope=None):
    """Embedding RNN sequence-to-sequence model with a shared embedding.

    A single embedding variable of shape [num_symbols x cell.input_size] is
    created and used to embed both `encoder_inputs` and `decoder_inputs`. The
    embedded sequences are then handed to `tied_rnn_seq2seq`.

    Args:
      encoder_inputs: a list of Tensors of symbol ids; each one is passed to
        `embedding_lookup`. (Presumably 1D int32 Tensors of shape
        [batch_size] -- confirm against callers.)
      decoder_inputs: a list of Tensors of symbol ids, embedded the same way.
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_symbols: integer; number of symbols for both encoder and decoder.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [cell.output_size x num_symbols] and B has
        shape [num_symbols]. If provided and feed_previous is True, each fed
        previous output is first multiplied by W and has B added. If None,
        `cell` is wrapped in an OutputProjectionWrapper to `num_symbols`.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the
        first of decoder_inputs is used (the "GO" symbol) and later decoder
        inputs are taken from previous outputs. If False, decoder_inputs are
        used as given.
      dtype: The dtype to use for the initial RNN states (default:
        tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_tied_rnn_seq2seq".

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors
        containing the generated outputs.
      states: The state of each decoder cell in each time-step, one item per
        decoder input.

    Raises:
      ValueError: when output_projection has the wrong shape.
    """
    if output_projection is not None:
        # Shape validation only; the raw pair is what the loop function uses.
        proj_w = ops.convert_to_tensor(output_projection[0], dtype=dtype)
        proj_w.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        proj_b = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_b.get_shape().assert_is_compatible_with([num_symbols])

    with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
        with ops.device("/cpu:0"):
            embedding = vs.get_variable("embedding",
                                        [num_symbols, cell.input_size])

        embedded_encoder = [
            embedding_ops.embedding_lookup(embedding, ids)
            for ids in encoder_inputs
        ]
        embedded_decoder = [
            embedding_ops.embedding_lookup(embedding, ids)
            for ids in decoder_inputs
        ]

        def argmax_then_embed(prev, _):
            """Loop function: embed the argmax symbol of the previous output."""
            if output_projection is not None:
                prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                        output_projection[1])
            best_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
            return embedding_ops.embedding_lookup(embedding, best_symbol)

        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols)

        if not isinstance(feed_previous, bool):
            # feed_previous is a Tensor: build both graphs and select with
            # cond.
            fed_outputs, fed_states = tied_rnn_seq2seq(
                embedded_encoder, embedded_decoder, cell,
                loop_function=argmax_then_embed, dtype=dtype)
            vs.get_variable_scope().reuse_variables()
            given_outputs, given_states = tied_rnn_seq2seq(
                embedded_encoder, embedded_decoder, cell, dtype=dtype)

            outputs = control_flow_ops.cond(feed_previous,
                                            lambda: fed_outputs,
                                            lambda: given_outputs)
            states = control_flow_ops.cond(feed_previous,
                                           lambda: fed_states,
                                           lambda: given_states)
            return outputs, states

        loop_function = None
        if feed_previous:
            loop_function = argmax_then_embed
        return tied_rnn_seq2seq(embedded_encoder, embedded_decoder, cell,
                                loop_function=loop_function, dtype=dtype)
def __init__(self, cell, embedding, start_tokens, end_token, initial_state,
             beam_width, output_layer=None, length_penalty_weight=0.0,
             positional_embedding=None):
    """Set up the beam search decoder, optionally with positional embedding.

    Args:
      cell: An `RNNCell` instance.
      embedding: Either a callable mapping a tensor of ids to their
        embeddings, or a value usable as the `params` argument of
        `embedding_lookup`.
      start_tokens: `int32` vector of shape `[batch_size]` holding the start
        tokens.
      end_token: `int32` scalar marking the end of decoding.
      initial_state: A (possibly nested tuple of) tensors and TensorArrays.
      beam_width: Python integer, the number of beams.
      output_layer: (Optional) A `tf.layers.Layer` instance (for example
        `tf.layers.Dense`) applied to the RNN output before the result is
        stored or sampled.
      length_penalty_weight: Float weight penalizing length. 0.0 disables it.
      positional_embedding: Either a callable, or a value usable as the
        `params` argument of `embedding_lookup`, giving the decoder
        positional embedding. `None` (the default) disables positional
        embedding.

    Raises:
      TypeError: if `cell` is not an `RNNCell`, or `output_layer` is not a
        `tf.layers.Layer`.
      ValueError: if `start_tokens` is not a vector or `end_token` is not a
        scalar.
    """
    rnn_cell_impl.assert_like_rnncell("cell", cell)
    layer_is_valid = (output_layer is None or
                      isinstance(output_layer, layers_base.Layer))
    if not layer_is_valid:
        raise TypeError(
            "output_layer must be a Layer, received: %s" % type(output_layer))

    self._cell = cell
    self._output_layer = output_layer

    if callable(embedding):
        self._embedding_fn = embedding
    else:
        def _token_lookup(ids):
            return embedding_ops.embedding_lookup(embedding, ids)
        self._embedding_fn = _token_lookup

    # `_pos_embedding_fn` only exists when positional embedding is enabled.
    self._use_pos_embedding = positional_embedding is not None
    if self._use_pos_embedding:
        if callable(positional_embedding):
            self._pos_embedding_fn = positional_embedding
        else:
            def _position_lookup(ids):
                return embedding_ops.embedding_lookup(
                    positional_embedding, ids)
            self._pos_embedding_fn = _position_lookup

    self._start_tokens = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    start_rank = self._start_tokens.get_shape().ndims
    if start_rank != 1:
        raise ValueError("start_tokens must be a vector")

    self._end_token = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    end_rank = self._end_token.get_shape().ndims
    if end_rank != 0:
        raise ValueError("end_token must be a scalar")

    self._batch_size = array_ops.size(start_tokens)
    self._beam_width = beam_width
    self._length_penalty_weight = length_penalty_weight

    # Batch size and beam width are set above, before the per-state helper
    # is mapped over the initial state.
    self._initial_cell_state = nest.map_structure(
        self._maybe_split_batch_beams, initial_state, self._cell.state_size)

    # Repeat each start token once per beam, then embed the tiled ids.
    tokens_as_column = array_ops.expand_dims(self._start_tokens, 1)
    self._start_tokens = array_ops.tile(
        tokens_as_column, [1, self._beam_width])
    self._start_inputs = self._embedding_fn(self._start_tokens)
    if self._use_pos_embedding:
        # Add the positional embedding for position 0 to the start inputs.
        self._start_inputs += self._pos_embedding_fn(ops.convert_to_tensor(0))

    # One-hot with inverted on/off values: index 0 of every row is False and
    # every other beam is True.
    first_beam = array_ops.zeros([self._batch_size], dtype=dtypes.int32)
    self._finished = array_ops.one_hot(
        first_beam,
        depth=self._beam_width,
        on_value=False,
        off_value=True,
        dtype=dtypes.bool)
def embedding_attention_decoder(decoder_inputs, initial_state,
                                attention_states, cell, num_symbols,
                                num_heads=1, output_size=None,
                                output_projection=None, feed_previous=False,
                                dtype=dtypes.float32, scope=None):
    """RNN decoder with embedding and attention and a pure-decoding option.

    Args:
      decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: integer, how many symbols come into the embedding.
      num_heads: number of attention heads that read from attention_states.
      output_size: size of the output vectors; if None, use cell.output_size.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_symbols] and B has shape
        [num_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and added B.
      feed_previous: Boolean; if True, only the first of decoder_inputs will
        be used (the "GO" symbol), and all other decoder inputs will be
        generated by:
          next = embedding_lookup(embedding, argmax(previous_output)),
        In effect, this implements a greedy decoder. It can also be used
        during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
        If False, decoder_inputs are used as given (the standard decoder
        case).
      dtype: The dtype used when converting `output_projection` to tensors
        for shape validation (default: tf.float32). It is not forwarded to
        `attention_decoder` by this function.
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_decoder".

    Returns:
      Whatever `attention_decoder` returns for the embedded inputs; per the
      original contract:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      states: The state of each decoder cell in each time-step. This is a
        list with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].

    Raises:
      ValueError: when output_projection has the wrong shape.
    """
    if output_size is None:
        output_size = cell.output_size
    if output_projection is not None:
        proj_weights = ops.convert_to_tensor(output_projection[0],
                                             dtype=dtype)
        # Validate against the resolved `output_size`, not `cell.output_size`:
        # the projection is applied to decoder outputs, which are
        # `output_size` wide. The two only differ when the caller passes an
        # explicit `output_size`, in which case the old check rejected a
        # correctly shaped W.
        proj_weights.get_shape().assert_is_compatible_with(
            [output_size, num_symbols])
        proj_biases = ops.convert_to_tensor(output_projection[1],
                                            dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with vs.variable_scope(scope or "embedding_attention_decoder"):
        with ops.device("/cpu:0"):
            embedding = vs.get_variable("embedding",
                                        [num_symbols, cell.input_size])

        def extract_argmax_and_embed(prev, _):
            """Loop_function that extracts the symbol from prev and embeds it."""
            if output_projection is not None:
                prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                        output_projection[1])
            # No gradient flows back through the greedy symbol choice.
            prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
            return embedding_ops.embedding_lookup(embedding, prev_symbol)

        loop_function = extract_argmax_and_embed if feed_previous else None

        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]
        return attention_decoder(emb_inp, initial_state, attention_states,
                                 cell, output_size=output_size,
                                 num_heads=num_heads,
                                 loop_function=loop_function)
def loss():
    """Return the elementwise square of the first row of `var0` times `x`.

    `var0` and `x` are taken from the enclosing scope.
    """
    first_row = embedding_ops.embedding_lookup([var0], [0])
    product = math_ops.matmul(first_row, x)  # pylint: disable=cell-var-from-loop
    return product * product
def dynamic_distraction_m2_seq2seq(encoder_inputs, decoder_inputs,
                                   query_inputs, cell_encoder_fw,
                                   cell_encoder_bw, distraction_cell,
                                   num_encoder_symbols, num_decoder_symbols,
                                   embedding_size, initial_embedding=None,
                                   num_heads=1, embedding_trainable=False,
                                   output_projection=None,
                                   feed_previous=False, dtype=None,
                                   scope=None,
                                   initial_state_attention=False):
    """Embedding sequence-to-sequence model with document and query attention.

    Both `encoder_inputs` and `query_inputs` are embedded with one shared
    embedding variable and each is run through a bidirectional RNN built from
    `cell_encoder_fw` / `cell_encoder_bw` (under separate variable scopes).
    The per-step outputs of both RNNs are kept as attention states, and the
    concatenated forward/backward encoder state initializes the decoder,
    which is built by `dynamic_distraction_m2_decoder_wrapper`.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      query_inputs: Query symbol ids, embedded and unpacked the same way as
        `encoder_inputs` (presumably also a list of 1D int32 Tensors --
        confirm against callers).
      cell_encoder_fw: RNN cell for the forward direction of both
        bidirectional RNNs. It is also the cell handed to the decoder
        wrapper (wrapped in an OutputProjectionWrapper when
        `output_projection` is None).
      cell_encoder_bw: RNN cell for the backward direction of both
        bidirectional RNNs.
      distraction_cell: Cell forwarded to the decoder wrapper as
        `distraction_cell`.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
        Only used for the embedding shape when `initial_embedding` is None.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      initial_embedding: Optional initializer for the embedding variable. If
        None, a variable of shape [num_encoder_symbols x embedding_size] is
        created instead.
      num_heads: Number of attention heads that read from attention_states.
      embedding_trainable: Whether the embedding variable is trainable.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the
        first of decoder_inputs will be used (the "GO" symbol), and all other
        decoder inputs will be taken from previous outputs. If False,
        decoder_inputs are used as given (the standard decoder case).
      dtype: dtype passed to the variable scope; the scope's dtype is then
        used for both bidirectional RNNs.
      scope: VariableScope for the created subgraph; defaults to
        "dynamic_distraction_m2_seq2seq".
      initial_state_attention: If False (default), initial attentions are
        zero. If True, initialize the attentions from the initial state and
        attention states.

    Returns:
      When `feed_previous` is a Python bool, whatever
      `dynamic_distraction_m2_decoder_wrapper` returns. Otherwise a tuple
      (outputs, state), where:
        outputs: A list of the same length as decoder_inputs containing the
          generated outputs.
        state: The decoder state at the final time-step.
    """
    with variable_scope.variable_scope(
            scope or "dynamic_distraction_m2_seq2seq", dtype=dtype) as scope:
        dtype = scope.dtype

        # Encoder: one embedding shared by the document and the query.
        if initial_embedding is not None:
            embedding = variable_scope.get_variable(
                'embedding', initializer=initial_embedding,
                trainable=embedding_trainable)
        else:
            embedding = variable_scope.get_variable(
                'embedding', [num_encoder_symbols, embedding_size],
                trainable=embedding_trainable)

        embedded_inputs = embedding_ops.embedding_lookup(
            embedding, encoder_inputs)
        embedded_inputs = array_ops.unpack(embedded_inputs)

        query_embeddings = embedding_ops.embedding_lookup(
            embedding, query_inputs)
        query_embeddings = array_ops.unpack(query_embeddings)

        print("Embedded Inputs length:", len(embedded_inputs))
        print("Shape in embedded inputs:", embedded_inputs[0].get_shape())

        with variable_scope.variable_scope("Encoder_Cell"):
            encoder_outputs, encoder_state_fw, encoder_state_bw = (
                rnn.bidirectional_rnn(cell_encoder_fw, cell_encoder_bw,
                                      embedded_inputs, dtype=dtype))

        with variable_scope.variable_scope("Query_Cell"):
            query_outputs, query_state_fw, query_state_bw = (
                rnn.bidirectional_rnn(cell_encoder_fw, cell_encoder_bw,
                                      query_embeddings, dtype=dtype))

        # First calculate a concatenation of encoder outputs to put
        # attention on.
        encoder_state = array_ops.concat(
            1, [encoder_state_fw, encoder_state_bw])
        query_state = array_ops.concat(1, [query_state_fw, query_state_bw])

        # Each bidirectional output is reshaped to
        # [-1, 1, 2 * cell_encoder_fw.output_size] and the steps are joined
        # along axis 1.
        top_states_encoder = [
            array_ops.reshape(e, [-1, 1, 2 * cell_encoder_fw.output_size])
            for e in encoder_outputs
        ]
        attention_states_encoder = array_ops.concat(1, top_states_encoder)

        top_states_query = [
            array_ops.reshape(e, [-1, 1, 2 * cell_encoder_fw.output_size])
            for e in query_outputs
        ]
        attention_states_query = array_ops.concat(1, top_states_query)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell_encoder_fw = rnn_cell.OutputProjectionWrapper(
                cell_encoder_fw, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            # NOTE(review): this call previously passed `attention_state=`
            # and omitted `distraction_cell`, disagreeing with the
            # Tensor-valued path below. It now uses the same keywords as
            # that path; confirm against the signature of
            # dynamic_distraction_m2_decoder_wrapper.
            return dynamic_distraction_m2_decoder_wrapper(
                decoder_inputs,
                initial_state=encoder_state,
                attention_states=attention_states_encoder,
                attention_states_query=attention_states_query,
                cell_encoder=cell_encoder_fw,
                num_symbols=num_decoder_symbols,
                embedding_size=embedding_size,
                distract_initial_state=encoder_state,
                distraction_cell=distraction_cell,
                num_heads=num_heads,
                output_size=output_size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                embedding_scope=scope,
                initial_state_attention=initial_state_attention)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        def decoder(feed_previous_bool):
            # The first graph (True) creates the variables; the second
            # (False) reuses them.
            reuse = None if feed_previous_bool else True
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=reuse) as scope:
                outputs, state = dynamic_distraction_m2_decoder_wrapper(
                    decoder_inputs,
                    initial_state=encoder_state,
                    attention_states=attention_states_encoder,
                    attention_states_query=attention_states_query,
                    cell_encoder=cell_encoder_fw,
                    num_symbols=num_decoder_symbols,
                    embedding_size=embedding_size,
                    distract_initial_state=encoder_state,
                    distraction_cell=distraction_cell,
                    num_heads=num_heads,
                    output_size=output_size,
                    output_projection=output_projection,
                    feed_previous=feed_previous_bool,
                    embedding_scope=scope,
                    update_embedding_for_previous=False,
                    initial_state_attention=initial_state_attention)
                # cond needs a flat list, so append the (flattened) state
                # after the outputs.
                state_list = [state]
                if nest.is_sequence(state):
                    state_list = nest.flatten(state)
                return outputs + state_list

        outputs_and_state = control_flow_ops.cond(feed_previous,
                                                  lambda: decoder(True),
                                                  lambda: decoder(False))
        # Outputs length same as decoder inputs.
        outputs_len = len(decoder_inputs)
        state_list = outputs_and_state[outputs_len:]
        state = state_list[0]
        if nest.is_sequence(encoder_state):
            state = nest.pack_sequence_as(structure=encoder_state,
                                          flat_sequence=state_list)
        return outputs_and_state[:outputs_len], state
def func(self, x):
    """Look up `x` in the embedding params `[self._var0, self._var1]`."""
    params = [self._var0, self._var1]
    return embedding_ops.embedding_lookup(params, x)