def __init__(self, num_units, use_peepholes=False, initializer=None,
             num_proj=None, proj_clip=None, num_unit_shards=1,
             num_proj_shards=1, forget_bias=1.0, state_is_tuple=False,
             activation=math_ops.tanh):
    """Record the LSTM cell configuration and derive its state/output sizes.

    Args:
      num_units: int, the number of units in the LSTM cell.
      use_peepholes: bool, set True to enable diagonal/peephole connections.
      initializer: (optional) The initializer to use for the weight and
        projection matrices.
      num_proj: (optional) int, the output dimensionality for the projection
        matrices. If falsy (e.g. None), no projection is performed.
      proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip`
        is provided, the projected values are clipped elementwise to within
        `[-proj_clip, proj_clip]`.
      num_unit_shards: How to split the weight matrix. If >1, the weight
        matrix is stored across num_unit_shards.
      num_proj_shards: How to split the projection matrix. If >1, the
        projection matrix is stored across num_proj_shards.
      forget_bias: Biases of the forget gate are initialized by default to 1
        in order to reduce the scale of forgetting at the beginning of the
        training.
      state_is_tuple: If True, accepted and returned states are 2-tuples of
        the `c_state` and `m_state`. By default (False), they are
        concatenated along the column axis. This default behavior will soon
        be deprecated.
      activation: Activation function of the inner states.
    """
    if not state_is_tuple:
        logging.warn(
            "%s: Using a concatenated state is slower and will soon be "
            "deprecated. Use state_is_tuple=True." % self)

    # Configuration is stored verbatim on the instance.
    self._num_units = num_units
    self._use_peepholes = use_peepholes
    self._initializer = initializer
    self._num_proj = num_proj
    self._proj_clip = proj_clip
    self._num_unit_shards = num_unit_shards
    self._num_proj_shards = num_proj_shards
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation

    # The second state component and the cell output share one width: the
    # projection size when a projection is requested, else the cell size.
    output_size = num_proj if num_proj else num_units
    if state_is_tuple:
        self._state_size = rnn_cell.LSTMStateTuple(num_units, output_size)
    else:
        self._state_size = num_units + output_size
    self._output_size = output_size
def __init__(self, cell_size, num_copies, input_keys=1, output_keys=1,
             initializer=None, num_proj=None, forget_bias=1.0,
             state_is_tuple=True, activation=None, reuse=None, name=None):
    """Initialize the parameters for an Associative LSTM cell.

    Args:
      cell_size: int, the number of units per copy in the ALSTM cell. Must
        be even.
      num_copies: int, the number of memory copies in the ALSTM cell.
      input_keys: int, the number of inputs to be used.
      output_keys: int, the number of outputs to be used.
      initializer: (optional) The initializer to use for the weight and
        projection matrices.
      num_proj: (optional) int, the output dimensionality for the projection
        matrices. If falsy (e.g. None), no projection is performed. When
        given it must be divisible by `output_keys`.
      forget_bias: Accepted for interface compatibility; not stored by this
        constructor.
      state_is_tuple: Accepted for interface compatibility; the state size
        set here is always an `LSTMStateTuple`.
      activation: Accepted for interface compatibility; not stored by this
        constructor.
      reuse: Forwarded to the base class constructor as `_reuse`.
      name: Forwarded to the base class constructor.

    Raises:
      ValueError: If `cell_size` is odd, or if `num_proj` is set and is not
        divisible by `output_keys`.
    """
    super(AssociativeLSTMCell, self).__init__(_reuse=reuse, name=name)
    if cell_size % 2 != 0:
        raise ValueError("cell_size must be an even number")

    self._cell_size = cell_size
    self._num_copies = num_copies
    self._input_keys = input_keys
    self._output_keys = output_keys
    self._initializer = initializer
    self._num_proj = num_proj

    # Generating key permutations for each copy: one permutation of the
    # first half of the cell per copy, then the same permutation shifted by
    # half the cell size appended along axis 1.
    half_size = self._cell_size // 2
    self._permutations = np.array(
        [permutation(half_size) for _ in range(self._num_copies)])
    self._permutations = tf.concat(
        [self._permutations, self._permutations + half_size],
        axis=1, name='concat6')

    if num_proj:
        if num_proj % output_keys != 0:
            raise ValueError("num_proj must be divisible by output_keys")
        self._state_size = rnn_cell.LSTMStateTuple(cell_size, num_proj)
        self._output_size = num_proj
    else:
        # NOTE(review): the previous code computed a local
        # `num_proj = cell_size * output_keys` here and never used it; the
        # dead assignment was dropped. Confirm whether `self._num_proj` was
        # meant to default to that value.
        self._state_size = rnn_cell.LSTMStateTuple(cell_size, cell_size)
        self._output_size = cell_size
def testStateTupleDictConversion(self):
    """Round-trip a nested state via `dict_to_state_tuple`/`state_tuple_to_dict`."""
    cell_sizes = [5, 3, 7]

    # A MultiRNNCell of LSTMCells is both a common choice and an interesting
    # test case, because it has two levels of nesting, with an inner class
    # that is not a plain tuple.
    cell = rnn_cell.MultiRNNCell(
        [rnn_cell.LSTMCell(size) for size in cell_sizes])

    # Each LSTM layer contributes two flat components (c and h) of its size.
    flat_sizes = [size for size in cell_sizes for _ in range(2)]
    state_dict = {}
    for index, size in enumerate(flat_sizes):
        key = dynamic_rnn_estimator._get_state_name(index)
        state_dict[key] = array_ops.expand_dims(math_ops.range(size), 0)

    expected_state = tuple(
        rnn_cell.LSTMStateTuple(
            np.reshape(np.arange(size), [1, -1]),
            np.reshape(np.arange(size), [1, -1]))
        for size in cell_sizes)

    actual_state = dynamic_rnn_estimator.dict_to_state_tuple(state_dict, cell)
    flattened_state = dynamic_rnn_estimator.state_tuple_to_dict(actual_state)

    with self.cached_session() as sess:
        state_dict_val, actual_state_val, flattened_state_val = sess.run(
            [state_dict, actual_state, flattened_state])

    def _recursive_assert_equal(x, y):
        # Types must match exactly so that LSTMStateTuple is not confused
        # with a plain tuple.
        self.assertEqual(type(x), type(y))
        if isinstance(x, np.ndarray):
            np.testing.assert_array_equal(x, y)
        elif isinstance(x, (list, tuple)):
            self.assertEqual(len(x), len(y))
            for x_item, y_item in zip(x, y):
                _recursive_assert_equal(x_item, y_item)
        else:
            self.fail('Unexpected type: {}'.format(type(x)))

    for key, value in state_dict_val.items():
        np.testing.assert_array_almost_equal(
            value,
            flattened_state_val[key],
            err_msg='Wrong value for state component {}.'.format(key))
    _recursive_assert_equal(expected_state, actual_state_val)
def testBahdanauNotNormalized(self):
    """Run `_testWithAttention` for BahdanauAttentionV2 with a ones kernel."""

    def summary(shape, mean, dtype=None):
        # Float32 unless a dtype is given explicitly.
        if dtype is None:
            dtype = np.dtype(np.float32)
        return ResultSummary(shape=shape, dtype=dtype, mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 4.8290324),
        sample_id=summary((5, 3), 0, dtype=np.dtype(np.int32)))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.6432636),
        h=summary((5, 9), 0.75866824))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 6.7445569),
        time=3,
        alignments=summary((5, 8), 0.125),
        attention_state=summary((5, 8), 0.125),
        alignment_history=())

    expected_final_alignment_history = summary((3, 5, 8), 0.125)

    self._testWithAttention(
        wrapper.BahdanauAttentionV2,
        expected_final_output,
        expected_final_state,
        alignment_history=True,
        create_query_layer=True,
        expected_final_alignment_history=expected_final_alignment_history,
        create_attention_kwargs={"kernel_initializer": "ones"})
def DoOneIter(prev_word, prev_c, prev_h):
    """Run one decoding step and sample the next word.

    `self` and `params` are taken from the enclosing scope.

    Args:
      prev_word: id of the previous word, looked up in
        `self._word_embeddings`.
      prev_c: previous LSTM cell state fed to `self.cell`.
      prev_h: previous LSTM hidden state fed to `self.cell`.

    Returns:
      A tuple `(next_prob, selected, selected_p, next_c, next_h)`: the
      softmax distribution over words, the sampled word index, the
      probability row looked up for that index, and the new LSTM state.
    """
    # lookup embedding
    prev_embed = tf.nn.embedding_lookup(self._word_embeddings, prev_word)
    prev_embed = tf.expand_dims(prev_embed, 0)
    if params.use_softmax_adaptation:
        prev_embed = prev_embed[:, params.context_embed_size:]

    # one iteration of recurrent layer
    # The state must come from the arguments: reading self.prev_c and
    # self.prev_h here (as before) silently ignored the state passed in by
    # the caller.
    state = rnn_cell.LSTMStateTuple(prev_c, prev_h)
    with vs.variable_scope('RNN', reuse=True):
        result, (next_c, next_h) = self.cell(prev_embed, state)

    proj_result = tf.matmul(result, self.linear_proj)
    if params.use_softmax_adaptation:
        # NOTE(review): this is the pre-1.0 `tf.concat(dim, values)` argument
        # order; confirm against the TensorFlow version this file targets.
        proj_result = tf.concat(1, [self.final_context_embed, proj_result])

    # softmax layer
    bias = self.base_bias
    if params.use_hash_table:
        hval = self.hash_func(self.all_ids, self.context_placeholders)
        bias += hval

    logits = tf.matmul(
        proj_result, self._word_embeddings, transpose_b=True) + bias
    next_prob = tf.nn.softmax(logits / self.temperature)

    # Sample by inverting the CDF: take the largest index whose exclusive
    # cumulative probability is still below a uniform random draw.
    # NOTE(review): reduce_max runs over every coordinate returned by
    # tf.where, so this presumably relies on a single-row batch; confirm.
    cumsum = tf.cumsum(next_prob, exclusive=True, axis=1)
    idx = tf.less(cumsum, tf.random_uniform([1]))
    selected = tf.reduce_max(tf.where(idx))

    selected_p = tf.nn.embedding_lookup(tf.transpose(next_prob), selected)
    return next_prob, selected, selected_p, next_c, next_h
def __call__(self, inputs, state, scope=None):
    """Run one step of the stacked decoder with attention after layer 0.

    Args:
      inputs: input to the first cell.
      state: pair `(states, encoder_hs)`, where `states` holds one state per
        cell and `encoder_hs` is passed to the attention function.
      scope: optional variable scope; defaults to "nmts_decoder_cell".

    Returns:
      `(output, (new_states, encoder_hs))` with `new_states` a tuple of the
      per-cell states.

    Raises:
      ValueError: if `self._attention` is neither "luong" nor "nmts".
    """
    with vs.variable_scope(scope or "nmts_decoder_cell"):
        layer_states, encoder_hs = state
        new_states = []

        # The first layer runs on the raw input; its output is the query
        # for the attention context c_t.
        with vs.variable_scope("cell_0"):
            cur_inp, first_state = self._cells[0](inputs, layer_states[0])
            if self._attention == "luong":
                c_t = attention_luong(cur_inp, encoder_hs)
            elif self._attention == "nmts":
                c_t = attention_nmts_fast(cur_inp, encoder_hs)
            else:
                raise ValueError(
                    "Unknown attention type: {}".format(self._attention))
            new_states.append(first_state)

        upper_states = layer_states[1:]
        upper_cells = self._cells[1:]
        for i, cell in enumerate(upper_cells):
            with vs.variable_scope("cell_{}".format(i + 1)):
                layer_state = upper_states[i]
                residual = cur_inp
                h_dim = cur_inp.get_shape().with_rank(2)[1].value
                Wp = vs.get_variable("Wp", [2 * h_dim, h_dim])
                bp = vs.get_variable("bp", [h_dim])
                # Project [layer input, context] back to h_dim; the result
                # is both the cell input and its replacement hidden state.
                merged = array_ops.concat(1, [cur_inp, c_t])
                cur_inp = math_ops.matmul(merged, Wp) + bp
                layer_state = rnn_cell.LSTMStateTuple(layer_state.c, cur_inp)
                next_inp, new_state = cell(cur_inp, layer_state)
                # Residual connection on every layer except the last one.
                if i < len(self._cells[1:]) - 1:
                    cur_inp = residual + next_inp
                else:
                    cur_inp = next_inp
                new_states.append(new_state)

    return cur_inp, (tuple(new_states), encoder_hs)
def testMaskedLSTMCell(self):
    """Check that one MaskedLSTMCell call registers one mask of the expected shape."""
    expected_num_masks = 1
    # Expected mask/weight shape: (2 * dim) rows by (4 * dim) columns.
    expected_shape = (2 * self.dim, 4 * self.dim)

    def _random_variable():
        return variables.Variable(
            random_ops.random_normal([self.batch_size, self.dim]))

    with self.test_session():
        inputs = _random_variable()
        c = _random_variable()
        h = _random_variable()
        state = tf_rnn_cells.LSTMStateTuple(c, h)
        lstm_cell = rnn_cells.MaskedLSTMCell(self.dim)
        lstm_cell(inputs, state)

        collection_getters = (
            pruning.get_masks,
            pruning.get_masked_weights,
            pruning.get_thresholds,
            pruning.get_weights,
        )
        for getter in collection_getters:
            self.assertEqual(len(getter()), expected_num_masks)

        for mask in pruning.get_masks():
            self.assertEqual(mask.shape, expected_shape)
        for weight in pruning.get_weights():
            self.assertEqual(weight.shape, expected_shape)
def testLuongScaled(self):
    """Run `_testWithAttention` for LuongAttentionV2 with `scale=True`."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 2.6605489),
        sample_id=summary((5, 3), 0.0, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 0.88403547),
        h=summary((5, 9), 0.37819088))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 4.0846314),
        time=3,
        alignments=summary((5, 8), 0.125),
        attention_state=summary((5, 8), 0.125),
        alignment_history=())

    self._testWithAttention(
        wrapper.LuongAttentionV2,
        expected_final_output,
        expected_final_state,
        attention_mechanism_depth=9,
        create_attention_kwargs={"scale": True})
def testLuongMonotonicScaled(self):
    """Run `_testWithAttention` for LuongMonotonicAttentionV2 with `scale=True`."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 3.159497),
        sample_id=summary((5, 3), 0.0, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.072384),
        h=summary((5, 9), 0.50331038))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 5.3079605),
        time=3,
        alignments=summary((5, 8), 0.11467695),
        attention_state=summary((5, 8), 0.11467695),
        alignment_history=())

    expected_final_alignment_history = summary((3, 5, 8), 0.11899644)

    self._testWithAttention(
        wrapper.LuongMonotonicAttentionV2,
        expected_final_output,
        expected_final_state,
        attention_mechanism_depth=9,
        alignment_history=True,
        expected_final_alignment_history=expected_final_alignment_history,
        create_attention_kwargs={"scale": True})
def testBahdanauMonotonicNotNormalized(self):
    """Run `_testWithAttention` for BahdanauMonotonicAttentionV2, ones kernel."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 5.9850435),
        sample_id=summary((5, 3), 0.0, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.6752492),
        h=summary((5, 9), 0.76052248))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 8.361186),
        time=3,
        alignments=summary((5, 8), 0.10989678),
        attention_state=summary((5, 8), 0.10989678),
        alignment_history=())

    expected_final_alignment_history = summary((3, 5, 8), 0.117412611)

    self._testWithAttention(
        wrapper.BahdanauMonotonicAttentionV2,
        expected_final_output,
        expected_final_state,
        alignment_history=True,
        expected_final_alignment_history=expected_final_alignment_history,
        create_query_layer=True,
        create_attention_kwargs={"kernel_initializer": "ones"})
def testBahdanauMonotonicNormalized(self):
    """Run `_testWithAttention` for normalized BahdanauMonotonicAttentionV2."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 4.5706983),
        sample_id=summary((5, 3), 0.0, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.6005473),
        h=summary((5, 9), 0.77863038))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 7.3326721),
        time=3,
        alignments=summary((5, 8), 0.12258384),
        attention_state=summary((5, 8), 0.12258384),
        alignment_history=())

    expected_final_alignment_history = summary((3, 5, 8), 0.12258384)

    self._testWithAttention(
        wrapper.BahdanauMonotonicAttentionV2,
        expected_final_output,
        expected_final_state,
        alignment_history=True,
        expected_final_alignment_history=expected_final_alignment_history,
        create_query_layer=True,
        create_attention_kwargs={
            "kernel_initializer": "ones",
            "normalize": True,
        })
def __call__(self, inputs, state, scope=None):
    """Run one LSTM step with optional layer normalization and recurrent dropout.

    Args:
      inputs: input for this step.
      state: pair `(c, h)` of the previous cell and hidden state.
      scope: optional variable scope; defaults to the class name.

    Returns:
      `(new_h, LSTMStateTuple(new_c, new_h))`.
    """
    with vs.variable_scope(scope or type(self).__name__) as scope:  # LayerNormBasicLSTMCell  # pylint: disable=unused-variables
        prev_c, prev_h = state
        gate_inputs = self._linear(array_ops.concat(1, [inputs, prev_h]))

        # i = input gate, j = new input, f = forget gate, o = output gate.
        i, j, f, o = array_ops.split(1, 4, gate_inputs)
        if self._layer_norm:
            gate_names = ("input", "transform", "forget", "output")
            i, j, f, o = [
                self._norm(gate, gate_name)
                for gate, gate_name in zip((i, j, f, o), gate_names)
            ]

        g = self._activation(j)
        # Dropout on the candidate values only; skipped when keep_prob is a
        # Python float >= 1.
        keep_prob = self._keep_prob
        if not (isinstance(keep_prob, float) and not keep_prob < 1):
            g = nn_ops.dropout(g, keep_prob, seed=self._seed)

        retained = prev_c * math_ops.sigmoid(f + self._forget_bias)
        written = math_ops.sigmoid(i) * g
        new_c = retained + written
        if self._layer_norm:
            new_c = self._norm(new_c, "state")
        new_h = self._activation(new_c) * math_ops.sigmoid(o)

        return new_h, rnn_cell.LSTMStateTuple(new_c, new_h)
def testNotUseAttentionLayer(self):
    """Run `_testWithAttention` for BahdanauAttentionV2 with `attention_layer_size=None`."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 10), 0.072406612),
        sample_id=summary((5, 3), 3.86666666, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.032002),
        h=summary((5, 9), 0.61177742))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 10), 0.011346335),
        time=3,
        alignments=summary((5, 8), 0.125),
        attention_state=summary((5, 8), 0.125),
        alignment_history=())

    self._testWithAttention(
        wrapper.BahdanauAttentionV2,
        expected_final_output,
        expected_final_state,
        attention_layer_size=None,
        create_query_layer=True,
        create_attention_kwargs={"kernel_initializer": "ones"})
def loop_fn(i):
    """Call `model_fn` on entry `i` of every input and initial-state tensor.

    `inputs`, `init_state` and `model_fn` come from the enclosing scope.
    """

    def _take_entry(tensor):
        # Gather entry i and restore a leading axis of size one.
        return array_ops.expand_dims(array_ops.gather(tensor, i), 0)

    loop_inputs = [_take_entry(tensor) for tensor in inputs]
    state_parts = [_take_entry(tensor) for tensor in init_state]
    loop_init_state = rnn_cell.LSTMStateTuple(*state_parts)
    return model_fn(loop_inputs, loop_init_state)
def testBahdanauNormalized(self):
    """Run `_testWithAttention` for normalized BahdanauAttentionV2."""

    def summary(shape, mean, dtype="float32"):
        return ResultSummary(shape=shape, dtype=np.dtype(dtype), mean=mean)

    expected_final_output = basic_decoder.BasicDecoderOutput(
        rnn_output=summary((5, 3, 6), 3.9548259),
        sample_id=summary((5, 3), 0.0, dtype="int32"))

    expected_cell_state = rnn_cell.LSTMStateTuple(
        c=summary((5, 9), 1.4652209),
        h=summary((5, 9), 0.70997983))
    expected_final_state = wrapper.AttentionWrapperState(
        cell_state=expected_cell_state,
        attention=summary((5, 6), 6.3075728),
        time=3,
        alignments=summary((5, 8), 0.125),
        attention_state=summary((5, 8), 0.125),
        alignment_history=())

    self._testWithAttention(
        wrapper.BahdanauAttentionV2,
        expected_final_output,
        expected_final_state,
        create_query_layer=True,
        create_attention_kwargs={
            "kernel_initializer": "ones",
            "normalize": True,
        })
def multi_rnn(inputs, layer_sizes, sequence_length, dropout_keep_prob=1.0,
              attn_length=0, base_cell=tf.contrib.rnn.BasicLSTMCell,
              initial_state=None):
    """Run a stacked RNN over `inputs` and pool its outputs over axis 1.

    Args:
      inputs: input tensor passed to `tf.nn.dynamic_rnn`; its first
        dimension is used as the batch size when `initial_state` is given.
      layer_sizes: iterable of layer widths, one RNN cell per entry.
      sequence_length: per-example sequence lengths, forwarded to
        `tf.nn.dynamic_rnn` and used as the divisor of the pooled output.
      dropout_keep_prob: forwarded to `make_rnn_cells`.
      attn_length: forwarded to `make_rnn_cells`; when truthy the summed
        outputs are returned without dividing by the sequence length.
      base_cell: cell class forwarded to `make_rnn_cells`.
      initial_state: optional tensor used as the initial hidden state `h`
        of every layer; the matching cell state `c` is all zeros.

    Returns:
      `tf.reduce_sum(outputs, 1)` when `attn_length` is truthy; otherwise
      that sum divided by `sequence_length` cast to float32 and reshaped to
      `[-1, 1]`.
    """
    if initial_state is not None:
        batch_size = inputs.shape[0]
        # NOTE(review): the same tensor is used as `h` for every layer, so
        # this presumably requires all layer sizes to match its width;
        # confirm against callers.
        initial_state = tuple(
            rnn_cell.LSTMStateTuple(
                tf.zeros([batch_size, size]), initial_state)
            for size in layer_sizes)

    cells = make_rnn_cells(layer_sizes,
                           dropout_keep_prob=dropout_keep_prob,
                           attn_length=attn_length,
                           base_cell=base_cell)
    cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    outputs, _ = tf.nn.dynamic_rnn(cell,
                                   inputs,
                                   initial_state=initial_state,
                                   sequence_length=sequence_length,
                                   dtype=tf.float32)

    if attn_length:
        return tf.reduce_sum(outputs, 1)
    # The alternative state-based return values that used to follow this
    # statement were unreachable and have been removed.
    return tf.reduce_sum(outputs, 1) / \
        tf.reshape(tf.cast(sequence_length, tf.float32), [-1, 1])
def CreateDecodingGraph(self, params):
    """Build the single-step decoding ops and the state update/reset ops.

    Sets `prev_word`, `prev_c`, `prev_h`, `temperature`, `beam_size`,
    `next_c`, `next_h`, `next_prob`, `selected`, `selected_p`, `assign_op`
    and `reset_state` on `self`.
    """
    vocab_embeddings = self.word_embedder.GetAllEmbeddings()

    # Decoder inputs: the previous word is fed in, while the recurrent
    # state is kept in local variables so it persists between runs.
    self.prev_word = tf.placeholder(tf.int32, (), name='prev_word')
    self.prev_c = tf.get_variable(
        'prev_c', [1, params.cell_size], dtype=tf.float32,
        collections=[tf.GraphKeys.LOCAL_VARIABLES])
    self.prev_h = tf.get_variable(
        'prev_h', [1, params.cell_size], dtype=tf.float32,
        collections=[tf.GraphKeys.LOCAL_VARIABLES])
    self.temperature = tf.placeholder_with_default([1.0], [1])

    # Embed the previous word as a batch of one.
    word_embed = tf.expand_dims(
        tf.nn.embedding_lookup(vocab_embeddings, self.prev_word), 0)
    if params.use_softmax_adaptation:
        word_embed = word_embed[:, self.context_size:]

    # One step of the recurrent layer, reusing the 'RNN' scope variables.
    lstm_state = rnn_cell.LSTMStateTuple(self.prev_c, self.prev_h)
    with tf.variable_scope('RNN', reuse=True):
        cell_output, (self.next_c, self.next_h) = self.cell(
            word_embed, lstm_state)

    projected = tf.matmul(cell_output, self.linear_proj)
    if params.use_softmax_adaptation:
        projected = tf.concat(
            axis=1, values=[self.final_context_embed, projected])

    # Softmax layer, optionally with a context-dependent bias.
    bias = self.base_bias
    if params.use_hash_table or params.use_context_dependent_bias:
        hval = self.hash_func(self.all_ids, self.context_placeholders)
        bias += hval

    self.beam_size = tf.placeholder_with_default(1, (), name='beam_size')
    logits = tf.matmul(projected, vocab_embeddings, transpose_b=True) + bias
    self.next_prob = tf.nn.softmax(logits / self.temperature)

    # Draw beam_size samples and keep only the distinct word ids.
    self.selected = tf.squeeze(
        tf.multinomial(logits / self.temperature, self.beam_size))
    self.selected, _ = tf.unique(self.selected)
    self.selected_p = tf.nn.embedding_lookup(
        tf.transpose(self.next_prob), self.selected)

    # Op that advances the stored state to the newly computed one.
    advance_c = self.prev_c.assign(self.next_c)
    advance_h = self.prev_h.assign(self.next_h)
    self.assign_op = tf.group(advance_c, advance_h)

    # Op that resets the stored state to zeros.
    clear_c = self.prev_c.assign(tf.zeros_like(self.prev_c))
    clear_h = self.prev_h.assign(tf.zeros_like(self.prev_h))
    self.reset_state = tf.group(clear_c, clear_h)
def _PopnnLSTM(x, h, c):
    """Run a zero-initialized PopnnLSTM over `x` in inference mode.

    `num_hidden` and `dataType` come from the enclosing scope.
    """
    layer = ipu.ops.rnn_ops.PopnnLSTM(
        num_hidden,
        dtype=dataType,
        weights_initializer=init_ops.zeros_initializer(dtype=dataType),
        bias_initializer=init_ops.zeros_initializer(dtype=dataType))
    # LSTMStateTuple takes the cell state first, then the hidden state.
    initial_state = rnn_cell.LSTMStateTuple(c, h)
    return layer(x, initial_state=initial_state, training=False)
def call(self, inputs, initial_state=None, training=True):
    """Runs the forward step for the LSTM model.

    Args:
      inputs: 3-D tensor with shape [time_len, batch_size, input_size].
      initial_state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, num_units]`. If not provided, the state is
        initialized to zeros. DEPRECATED: a plain tuple of tensors
        (input_h_state, input_c_state), each of shape
        [batch_size, num_units].
      training: whether this operation will be used in training or
        inference.

    Returns:
      tuple of output and output states:

      * output: a tensor of shape [time_len, batch_size, num_units].
      * output_states: An `LSTMStateTuple` of the same shape and structure
        as initial_state. If the deprecated plain-tuple form was passed,
        a tuple (output_h_state, output_c_state) is returned instead.

    Raises:
      ValueError: if initial_state is not valid.
    """
    dtype = self.dtype
    inputs = ops.convert_to_tensor(inputs, dtype=dtype)
    batch_size = array_ops.shape(inputs)[1]

    legacy_tuple_state = False
    if initial_state is None:
        # Create a zero state.
        initial_state = self._zero_state(batch_size)
    elif not isinstance(initial_state, rnn_cell.LSTMStateTuple):
        if not isinstance(initial_state, tuple):
            raise ValueError("Invalid initial_state type: `%s`, expecting "
                             "`LSTMStateTuple`." % type(initial_state))
        logging.warning(
            "Passing a tuple as a `initial_state` to PopnnLSTM is "
            "deprecated and will be removed in the future. Pass an "
            "`LSTMStateTuple` instead.")
        # The legacy tuple is ordered (h, c); LSTMStateTuple is (c, h).
        legacy_h, legacy_c = initial_state[0], initial_state[1]
        initial_state = rnn_cell.LSTMStateTuple(legacy_c, legacy_h)
        legacy_tuple_state = True

    c, h = initial_state
    h = ops.convert_to_tensor(h, dtype=dtype)
    c = ops.convert_to_tensor(c, dtype=dtype)
    outputs, state = self._forward(
        inputs, h, c, self.kernel, self.biases, training)

    if legacy_tuple_state:
        # Hand the state back in the same (h, c) order it was given.
        return outputs, (state.h, state.c)
    return outputs, state
def encode(self, x):
    """Evaluate the encoder state for the input `x`.

    Probabilistic encoder from inputs to latent distribution parameters;
    a.k.a. inference network q(z|x). Feeds `x` to `self.input_placeholder`
    and returns the session result for
    `LSTMStateTuple(self.encoding_cell, self.encoding_hidden)`.
    """
    encoder_state = rnn_cell.LSTMStateTuple(
        self.encoding_cell, self.encoding_hidden)
    return self.sess.run(
        encoder_state, feed_dict={self.input_placeholder: x})
def _forward(self, inputs, h, c, kernel, biases, training):
    """Call the `popnn_lstm_layer` op and package its final state.

    Returns:
      `(output, LSTMStateTuple(output_c, output_h))`; the op's fourth
      result is discarded.
    """
    layer_kwargs = dict(
        inputs=inputs,
        num_channels=self._num_units,
        kernel=kernel,
        biases=biases,
        input_h_state=h,
        input_c_state=c,
        is_training=training,
        partials_dtype=self._partials_dtype,
        name=self._name)
    output, final_h, final_c, _ = gen_popnn_ops.popnn_lstm_layer(
        **layer_kwargs)
    # The op returns h before c; LSTMStateTuple expects (c, h).
    return output, rnn_cell.LSTMStateTuple(final_c, final_h)
def _tfLSTM(x, h, c):
    """Run a zero-initialized reference LSTMCell over time-major `x`.

    `num_hidden` and `dataType` come from the enclosing scope.
    """
    reference_cell = rnn_cell.LSTMCell(
        num_hidden,
        name='basic_lstm_cell',
        forget_bias=0.,
        initializer=init_ops.zeros_initializer(dtype=dataType))
    # LSTMStateTuple takes the cell state first, then the hidden state.
    initial_state = rnn_cell.LSTMStateTuple(c, h)
    return rnn.dynamic_rnn(
        reference_cell,
        x,
        dtype=dataType,
        initial_state=initial_state,
        time_major=True)
def _PopnnLSTM(x, h, c, y):
    """Build a PopnnLSTM training step and return `[loss, train_op]`.

    `num_hidden`, `dataType` and `lr` come from the enclosing scope.
    """
    layer = popnn_rnn.PopnnLSTM(
        num_hidden,
        dtype=dataType,
        weights_initializer=init_ops.zeros_initializer(dtype=dataType),
        bias_initializer=init_ops.zeros_initializer(dtype=dataType))
    initial_state = rnn_cell.LSTMStateTuple(c, h)
    outputs, _ = layer(x, initial_state=initial_state, training=True)

    # Cross-entropy on the last entry of `outputs`; labels are treated as
    # constants via stop_gradient.
    cross_entropy = nn.softmax_cross_entropy_with_logits_v2(
        logits=outputs[-1], labels=array_ops.stop_gradient(y))
    loss = math_ops.reduce_mean(cross_entropy)
    optimizer = gradient_descent.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss)
    return [loss, train]
def call(self, inputs, state):
    """Long short-term memory cell (LSTM) with masks for pruning.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, self.state_size]`, if `state_is_tuple` has been set
        to `True`. Otherwise, a `Tensor` shaped
        `[batch_size, 2 * self.state_size]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
      `LSTMStateTuple` or a concatenated state, depending on
      `state_is_tuple`).
    """
    split_axis = constant_op.constant(1, dtype=dtypes.int32)

    if self._state_is_tuple:
        prev_c, prev_h = state
    else:
        prev_c, prev_h = array_ops.split(
            value=state, num_or_size_splits=2, axis=split_axis)

    # Parameters of gates are concatenated into one multiply for
    # efficiency; the kernel used is the masked one.
    cell_inputs = array_ops.concat([inputs, prev_h], 1)
    gate_inputs = math_ops.matmul(cell_inputs, self._masked_kernel)
    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=gate_inputs, num_or_size_splits=4, axis=split_axis)

    forget_bias_tensor = constant_op.constant(
        self._forget_bias, dtype=f.dtype)

    # Note that using `add` and `multiply` instead of `+` and `*` gives a
    # performance improvement. So using those at the cost of readability.
    forget_gate = math_ops.sigmoid(math_ops.add(f, forget_bias_tensor))
    retained = math_ops.multiply(prev_c, forget_gate)
    written = math_ops.multiply(math_ops.sigmoid(i), self._activation(j))
    new_c = math_ops.add(retained, written)
    new_h = math_ops.multiply(
        self._activation(new_c), math_ops.sigmoid(o))

    if self._state_is_tuple:
        return new_h, tf_rnn.LSTMStateTuple(new_c, new_h)
    return new_h, array_ops.concat([new_c, new_h], 1)
def _LSTMLayerCPU(self, inputs, weights_value, initial_state, forget_bias,
                  training, name):
    """Run a constant-initialized reference LSTMCell on the CPU.

    `initial_state` is indexed as `(h, c)`. `training` and `name` are
    accepted but not used here. `num_channels` and `dataType` come from the
    enclosing scope.

    Returns:
      The outputs of `rnn.dynamic_rnn`; the final state is discarded.
    """
    with ops.device("/device:CPU:0"):
        weights_init = init_ops.constant_initializer(
            weights_value, dtype=dataType)
        reference_cell = rnn_cell.LSTMCell(
            num_channels,
            name='basic_lstm_cell',
            forget_bias=forget_bias,
            initializer=weights_init,
            reuse=variable_scope.AUTO_REUSE)
        # Reorder (h, c) into the (c, h) layout of LSTMStateTuple.
        start_state = rnn_cell.LSTMStateTuple(
            initial_state[1], initial_state[0])
        outputs, _ = rnn.dynamic_rnn(
            reference_cell,
            inputs,
            dtype=dataType,
            initial_state=start_state,
            time_major=True)
        return outputs
def __call__(self, inputs, state, scope=None, reuse=None):
    """Run one step of the coupled-gate LSTM with optional weight adaptation.

    Args:
      inputs: input for this step.
      state: pair `(c, h)` of the previous cell and hidden state.
      scope: accepted but not used; the scope name is fixed to
        "hyper_lstm_cell".
      reuse: forwarded to `tf.variable_scope`.

    Returns:
      `(new_h, LSTMStateTuple(new_c, new_h))`.
    """

    def _layer_norm(values, gamma, beta):
        # Normalize over axis 1, then apply the learned scale and shift.
        mean, variance = tf.nn.moments(values, [1], keep_dims=True)
        normalized = (values - mean) / tf.sqrt(variance + 1e-5)
        return normalized * gamma + beta

    with tf.variable_scope("hyper_lstm_cell", reuse=reuse):
        prev_c, prev_h = state

        # Parameters of gates are concatenated into one multiply for
        # efficiency.
        cell_input = tf.concat(axis=1, values=[inputs, prev_h])
        pre_activations = tf.matmul(cell_input, self.W)

        if self.lowrank_adaptation:
            # Extra term obtained by passing the input through the
            # left/right adaptation factors.
            expanded = tf.expand_dims(cell_input, 1)
            adapted = tf.matmul(
                tf.matmul(expanded, self.left_adapt), self.right_adapt)
            pre_activations = pre_activations + tf.squeeze(adapted)
        if self.mikilov_adapt:
            pre_activations = pre_activations + self.delta
        pre_activations = pre_activations + self.bias

        # j = new_input, f = forget_gate, o = output_gate
        j, f, o = tf.split(
            axis=1, num_or_size_splits=3, value=pre_activations)
        if self.layer_norm:
            j = _layer_norm(j, self.gammas[0], self.betas[0])
            f = _layer_norm(f, self.gammas[1], self.betas[1])
            o = _layer_norm(o, self.gammas[2], self.betas[2])

        candidate = self._activation(j)
        # Recurrent dropout without memory loss: only the candidate values
        # are dropped. Skipped when keep_prob is a Python float >= 1.
        keep_prob = self._keep_prob
        if not (isinstance(keep_prob, float) and not keep_prob < 1):
            candidate = tf.nn.dropout(candidate, keep_prob)

        # Input and forget gates are coupled: input = 1 - forget.
        forget_gate = tf.sigmoid(f + self._forget_bias)
        input_gate = 1.0 - forget_gate
        new_c = prev_c * forget_gate + input_gate * candidate
        new_h = self._activation(new_c) * tf.sigmoid(o)

        return new_h, rnn_cell.LSTMStateTuple(new_c, new_h)
def _tfLSTM(x, h, c, y):
    """Build a reference LSTMCell training step and return `[loss, train_op]`.

    `num_hidden`, `dataType` and `lr` come from the enclosing scope.
    """
    reference_cell = rnn_cell.LSTMCell(
        num_hidden,
        name='basic_lstm_cell',
        forget_bias=0.,
        initializer=init_ops.zeros_initializer(dtype=dataType))
    initial_state = rnn_cell.LSTMStateTuple(c, h)
    outputs, _ = rnn.dynamic_rnn(
        reference_cell,
        x,
        dtype=dataType,
        initial_state=initial_state,
        time_major=True)

    # Cross-entropy on the last entry of `outputs`; labels are treated as
    # constants via stop_gradient.
    cross_entropy = nn.softmax_cross_entropy_with_logits_v2(
        logits=outputs[-1], labels=array_ops.stop_gradient(y))
    loss = math_ops.reduce_mean(cross_entropy)
    optimizer = gradient_descent.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss)
    return [loss, train]
def _build(self, incoming, state, *args, **kwargs):
    """Run one LSTM step, optionally batch-normalizing the gates and state.

    Args:
      incoming: input for this step.
      state: `(c, h)` pair when `self._state_is_tuple` is set, otherwise a
        single tensor that is split in two along axis 1.
      *args, **kwargs: accepted but not used here.

    Returns:
      `(new_h, new_state)`, where `new_state` is an `LSTMStateTuple` or the
      concatenation of `new_c` and `new_h` along axis 1, matching the input
      state format.
    """
    self._declare_dependencies()
    activation = getters.get_activation(self.activation)
    inner_activation = getters.get_activation(self.inner_activation)

    if self._state_is_tuple:
        prev_c, prev_h = state
    else:
        prev_c, prev_h = array_ops.split(
            axis=1, num_or_size_splits=2, value=state)

    # Parameters of gates are concatenated into one multiply for efficiency.
    gate_inputs = _linear([incoming, prev_h], 4 * self._num_units, True, 0.,
                          self.weights_init, self.trainable, self.restore)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        axis=1, num_or_size_splits=4, value=gate_inputs)

    # Apply batch normalization to the gates.
    if self.batch_norm:
        i = self._batch_norm_i(i)
        j = self._batch_norm_j(j)
        f = self._batch_norm_f(f)
        o = self._batch_norm_o(o)

    retained = prev_c * inner_activation(f + self._forget_bias)
    written = inner_activation(i) * activation(j)
    new_c = retained + written

    # Hidden-to-hidden batch normalization: only the value fed to the
    # output activation is normalized; the stored cell state is not.
    if self.batch_norm:
        cell_for_output = self._batch_norm_c(new_c)
    else:
        cell_for_output = new_c
    new_h = activation(cell_for_output) * inner_activation(o)

    if self._state_is_tuple:
        new_state = rnn_cell.LSTMStateTuple(new_c, new_h)
    else:
        new_state = tf.concat(values=[new_c, new_h], axis=1)

    # Retrieve RNN Variables
    with get_variable_scope(scope='Linear', reuse=True):
        self._w = tf.get_variable('w')
        self._b = tf.get_variable('b')

    return new_h, new_state
def __call__(self, inputs, parent_state, cyc_state, scope=None):
    """Modified long short-term memory step for tree structures.

    Args:
      inputs: input for this step.
      parent_state: `(c, h)` pair of the parent state.
      cyc_state: `(c, h)` pair of the second ("cyc") state.
      scope: optional variable scope; defaults to the class name.

    Returns:
      `(new_h, LSTMStateTuple(new_c, new_h))`.
    """
    with vs.variable_scope(scope or type(self).__name__):  # "BasicTreeLSTMCell"
        parent_c, parent_h = parent_state
        cyc_c, cyc_h = cyc_state

        # The two incoming cell states are merged by a linear layer.
        # NOTE(review): both `rnn.linear` calls run in the same variable
        # scope; confirm that helper gives them distinct variable names.
        c = rnn.linear([parent_c, cyc_c], self._num_units, True)

        # Parameters of gates are concatenated into one multiply for
        # efficiency.
        concat = rnn.linear(
            [inputs, parent_h, cyc_h], 4 * self._num_units, True)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)

        # new_c must be a tensor: it used to be wrapped in a list literal,
        # which handed a one-element list to the activation and to the
        # returned state.
        new_c = (c * rnn_cell.sigmoid(f + self._forget_bias) +
                 rnn_cell.sigmoid(i) * self._activation(j))
        new_h = self._activation(new_c) * rnn_cell.sigmoid(o)
        new_state = rnn_cell.LSTMStateTuple(new_c, new_h)
        return new_h, new_state
def __call__(self, inputs, state, scope=None):
    """Run one step of the stacked decoder with a fixed context vector.

    Args:
      inputs: input to the first cell.
      state: pair `(states, c_t)`, where `states` holds one state per cell
        and `c_t` is the context concatenated to every layer's input.
      scope: optional variable scope; defaults to "nmts_decoder_cell".

    Returns:
      `(output, (new_states, c_t))` with `new_states` a tuple of the
      per-cell states.
    """
    with vs.variable_scope(scope or "nmts_decoder_cell"):
        layer_states, c_t = state
        cur_inp = inputs
        last_index = len(self._cells) - 1
        new_states = []

        for i, cell in enumerate(self._cells):
            with vs.variable_scope("cell_{}".format(i)):
                layer_state = layer_states[i]
                residual = cur_inp
                h_dim = cur_inp.get_shape().with_rank(2)[1].value
                Wp = vs.get_variable("Wp", [2 * h_dim, h_dim])
                bp = vs.get_variable("bp", [h_dim])
                # Project [layer input, context] back to h_dim; the result
                # is both the cell input and its replacement hidden state.
                merged = array_ops.concat(1, [cur_inp, c_t])
                cur_inp = math_ops.matmul(merged, Wp) + bp
                layer_state = rnn_cell.LSTMStateTuple(layer_state.c, cur_inp)
                next_inp, new_state = cell(cur_inp, layer_state)
                # Residual connection on every layer except the last one.
                if i < last_index:
                    cur_inp = residual + next_inp
                else:
                    cur_inp = next_inp
                new_states.append(new_state)

    return cur_inp, (tuple(new_states), c_t)