def tf_reshape_box(true_xy_A: tf.Tensor, true_wh_A: tf.Tensor, p_xy_A: tf.Tensor,
                   p_wh_A: tf.Tensor, layer: int, helper: Helper) -> tuple:
    """Reshape the xywh tensors to [?, h, w, anchor_nums, true_box_nums, 2].

    NOTE: the object mask must already be applied to the true xywh tensors!

    Parameters
    ----------
    true_xy_A : tf.Tensor
        shape will be [true_box_nums, 2]
    true_wh_A : tf.Tensor
        shape will be [true_box_nums, 2]
    p_xy_A : tf.Tensor
        shape will be [?, h, w, anchor_nums, 2]
    p_wh_A : tf.Tensor
        shape will be [?, h, w, anchor_nums, 2]
    layer : int
    helper : Helper

    Returns
    -------
    tuple
        true_cent, true_box_wh, pred_cent, pred_box_wh
    """
    with tf.name_scope('reshape_box_%d' % layer):
        # Add leading singleton axes so the true boxes can be tiled over
        # batch, grid height, grid width and anchors.
        true_cent = true_xy_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis, ...]
        true_box_wh = true_wh_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis, ...]
        true_cent = tf.tile(true_cent, [
            helper.batch_size, helper.out_hw[layer][0], helper.out_hw[layer][1],
            helper.anchor_number, 1, 1
        ])
        true_box_wh = tf.tile(true_box_wh, [
            helper.batch_size, helper.out_hw[layer][0], helper.out_hw[layer][1],
            helper.anchor_number, 1, 1
        ])
        # Add a trailing box axis to the predictions and tile over the true boxes.
        pred_cent = p_xy_A[..., tf.newaxis, :]
        pred_box_wh = p_wh_A[..., tf.newaxis, :]
        pred_cent = tf.tile(pred_cent, [1, 1, 1, 1, tf.shape(true_xy_A)[0], 1])
        pred_box_wh = tf.tile(pred_box_wh, [1, 1, 1, 1, tf.shape(true_wh_A)[0], 1])
    return true_cent, true_box_wh, pred_cent, pred_box_wh
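# A small, self-contained sketch (not part of the original module; the shapes are
# illustrative assumptions) of the broadcasting idea behind tf_reshape_box:
# expand predictions and true boxes so that an element-wise op yields every
# (prediction, true box) pair, which the tf.tile calls above materialize.
def _pairwise_box_broadcast_sketch():
    import tensorflow as tf
    pred_xy = tf.ones([2, 4, 4, 3, 2])    # [batch, h, w, anchor_nums, 2]
    true_xy = tf.ones([5, 2])             # [true_box_nums, 2]
    pred = pred_xy[..., tf.newaxis, :]    # [2, 4, 4, 3, 1, 2]
    true = true_xy[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis, ...]  # [1, 1, 1, 1, 5, 2]
    # Squared center distance between every prediction and every true box.
    return tf.reduce_sum(tf.square(pred - true), axis=-1)  # [2, 4, 4, 3, 5]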
def split_targets(y_true: Tensor, y_pred: Tensor,
                  method: Method) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """ Split concatenated hard targets / logits and hard predictions / soft predictions.

    :param y_true: tensor with the true labels.
    :param y_pred: tensor with the predicted labels.
    :param method: the method used to transfer the knowledge.
    :return: the teacher logits, student soft predictions, hard targets and hard predictions
        (teacher_logits, student_output, y_true, y_pred).
    """
    # The split point is half of the prediction dimension, because the network's
    # output contains the predicted values concatenated with the predicted logits,
    # and the two parts always have the same dimension.
    split_point = cast(divide(shape(y_true)[1], 2), int32)

    # Get hard labels and logits.
    y_true, teacher_logits = y_true[:, :split_point], y_true[:, split_point:]

    if method == Method.DISTILLATION or method == Method.PKT_PLUS_DISTILLATION:
        y_pred, student_output = y_pred[:, :split_point], y_pred[:, split_point:]
    else:
        student_output = identity(y_pred)

    return teacher_logits, student_output, y_true, y_pred
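# Illustrative usage sketch (the batch size, class count and Method value are
# assumptions, not taken from the original code): split_targets expects
# y_true = [hard_labels | teacher_logits] and, for distillation methods,
# y_pred = [student_preds | student_logits], concatenated along axis 1 with
# two halves of equal width.
def _split_targets_usage_sketch():
    import tensorflow as tf
    hard_labels = tf.one_hot([3, 7], depth=10)                          # (2, 10) hard targets
    teacher_logits = tf.zeros([2, 10])                                  # (2, 10) teacher outputs
    y_true = tf.concat([hard_labels, teacher_logits], axis=1)           # (2, 20)
    y_pred = tf.concat([tf.zeros([2, 10]), tf.zeros([2, 10])], axis=1)  # (2, 20)
    t_logits, s_output, y_t, y_p = split_targets(y_true, y_pred, Method.DISTILLATION)
    # t_logits / s_output feed the soft (distillation) loss,
    # y_t / y_p feed the usual hard-label loss.
    return t_logits, s_output, y_t, y_p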
def create_label(click_position, num_labels=10):
    num_rows = shape(click_position)[0]
    row_idx = expand_dims(range(num_rows), axis=1)
    idx = concatenate([row_idx, cast(click_position, int32)], axis=1)
    labels = SparseTensor(indices=cast(idx, int64),
                          values=ones([num_rows]),
                          dense_shape=[num_rows, num_labels])
    # Return a dense matrix of ones with a zero at each clicked position.
    return ones([num_rows, num_labels]) - to_dense(labels)
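# Worked toy example (values are assumptions) of the matrix create_label encodes:
# with click_position [[1], [0]] and num_labels=3 the intended result is
#   [[1, 0, 1],
#    [0, 1, 1]]
# i.e. all ones except a zero at each clicked slot. The same mask can be written
# with a dense one-hot, which is what the SparseTensor above represents:
def _create_label_dense_equivalent(click_position, num_labels=10):
    import tensorflow as tf
    clicked = tf.one_hot(tf.reshape(tf.cast(click_position, tf.int32), [-1]),
                         depth=num_labels)
    return 1.0 - clicked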
def noise_label(labels):
    id = range(shape(labels['click_position'])[0])
    idx = concatenate(
        [expand_dims(cast(id, int64), axis=1), labels['click_position']], axis=1)
    # Look up the item that was clicked in each row of the recommendation list,
    # then mark every slot that shows that same item with 1.0.
    clicked_item = gather_nd(labels['reco'], idx)
    return cast(equal(expand_dims(clicked_item, axis=1), labels['reco']), float32)
def test_ficken(self):
    labels = {'click_position': [1, 2], 'reco': [[0, 1, 2], [2, 1, 0]]}
    id = range(shape(labels['click_position'])[0])
    idx = concatenate([
        expand_dims(cast(id, int64), axis=1),
        expand_dims(cast(labels['click_position'], int64), axis=1)
    ], axis=1)
    clicked_item = gather_nd(labels['reco'], idx)
    with self.test_session():
        print(clicked_item.eval())
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1412.7449 (see below for
    details). It is recommended for complex sequence-to-sequence tasks.

    Args:
        decoder_inputs: A list of 2D Tensors [batch_size x input_size].
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        output_size: Size of the output vectors; if None, we use cell.output_size.
        num_heads: Number of attention heads that read from attention_states.
        loop_function: If not None, this function will be applied to i-th output
            in order to generate i+1-th input, and decoder_inputs will be ignored,
            except for the first element ("GO" symbol). This can be used for
            decoding, but also for training to emulate
            http://arxiv.org/abs/1506.03099.
            Signature -- loop_function(prev, i) = next
              * prev is a 2D Tensor of shape [batch_size x output_size],
              * i is an integer, the step number (when advanced control is needed),
              * next is a 2D Tensor of shape [batch_size x input_size].
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "attention_decoder".
        initial_state_attention: If False (default), initial attentions are zero.
            If True, initialize the attentions from the initial state and attention
            states -- useful when we wish to resume decoding from a previously
            stored decoder state and attention states.

    Returns:
        A tuple of the form (outputs, state), where:
            outputs: A list of the same length as decoder_inputs of 2D Tensors of
                shape [batch_size x output_size]. These represent the generated
                outputs. Output i is computed from input i (which is either the
                i-th element of decoder_inputs or loop_function(output {i-1}, i))
                as follows. First, we run the cell on a combination of the input
                and previous attention masks:
                    cell_output, new_state = cell(linear(input, prev_attn), prev_state).
                Then, we calculate new attention masks:
                    new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
                and then we calculate the output:
                    output = linear(cell_output, new_attn).
            state: The state of each decoder cell at the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].

    Raises:
        ValueError: when num_heads is not positive, there are no inputs, shapes
            of attention_states are not set, or input size cannot be inferred
            from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable("AttnW_%d" % a,
                                            [1, 1, attn_size, attention_vec_size])
            hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                 for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s"
                                 % inp.name)
            x = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
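# Self-contained sketch (dimensions and the stand-in weights are assumptions) of
# the score used above, new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)),
# written with plain matmuls instead of the 1x1-convolution trick in the code:
def _bahdanau_score_sketch():
    import tensorflow as tf
    batch, attn_length, attn_size, vec = 2, 5, 8, 8
    attention_states = tf.ones([batch, attn_length, attn_size])
    query = tf.ones([batch, attn_size])
    w = tf.ones([attn_size, vec])   # stands in for the learned AttnW kernel
    u = tf.ones([attn_size, vec])   # stands in for the linear(query) weights
    v = tf.ones([vec])              # stands in for AttnV
    keys = tf.tensordot(attention_states, w, axes=[[2], [0]])   # [batch, attn_length, vec]
    q = tf.expand_dims(tf.matmul(query, u), 1)                  # [batch, 1, vec]
    scores = tf.reduce_sum(v * tf.tanh(keys + q), axis=2)       # [batch, attn_length]
    weights = tf.nn.softmax(scores)                             # attention mask over the source
    context = tf.reduce_sum(tf.expand_dims(weights, 2) * attention_states, axis=1)
    return weights, context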
def kv_attention_decoder(cell,
                         decoder_inputs,
                         kb_inputs,
                         kb_mask_inputs,
                         initial_state,
                         attention_states,
                         num_decoder_symbols,
                         embedding_size,
                         output_size,
                         output_projection=None,
                         feed_previous=False,
                         attn_type="linear",
                         enc_attn=False,
                         enc_query=False,
                         scope=None,
                         dtype=None):
    """Run decoding with an attention over both the encoder states and the KB.

    :param cell: the RNN cell used for decoding
    :param decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs)
    :param kb_inputs: Tensor containing the KB to be used for decoding
    :param kb_mask_inputs: Tensor containing KB masks used to zero out PAD embeddings in the KB
    :param initial_state: Initial encoder state fed into the decoder
    :param attention_states: Embedded encoder attention states (batch_size, attn_length, attn_size)
    :param num_decoder_symbols: Vocab size for decoding
    :param embedding_size: Size of embedding vector
    :param output_size: Size of output vectors
    :param output_projection:
    :param feed_previous:
    :param attn_type: attention variant to use: "linear", "bilinear" or "two-mlp"
    :param enc_attn: whether to also attend over the encoder states
    :param enc_query: whether to compute the KB attention from the encoder's final state
    :param scope:
    :param dtype:
    :return:
    """
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_decoder_symbols])

    with variable_scope.variable_scope(scope or "kb_attention_decoder",
                                       dtype=dtype) as scope:
        embedding = variable_scope.get_variable(
            "embedding", [num_decoder_symbols, embedding_size])
        loop_function = _extract_argmax_and_embed(
            embedding, output_projection) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs
        ]

        # Needed for reshaping.
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to
        # reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        # Size of query vectors for attention.
        attention_vec_size = attn_size
        if attn_type == "linear" or attn_type == "two-mlp":
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable("AttnV", [attention_vec_size]))

        # Initialize mask embedding table
        np_mask = np.array([[0.] * embedding_size, [1.] * embedding_size])
        embedding_mask = variable_scope.get_variable(
            "embedding_mask", [2, embedding_size],
            initializer=tf.constant_initializer(np_mask),
            trainable=False)
        embedded_kb_mask_batch = tf.nn.embedding_lookup(
            embedding_mask, kb_mask_inputs)

        # Mask for zeroing out attns over PAD tokens
        kb_attn_mask = tf.cast(kb_mask_inputs[:, :, 0, 0], tf.float32)

        # Embed kb
        embedded_kb_batch = tf.nn.embedding_lookup(embedding, kb_inputs)
        embedded_kb_batch = embedded_kb_batch * embedded_kb_mask_batch
        embedded_kb_batch = math_ops.reduce_sum(embedded_kb_batch, [3])

        # Split into value, type tensors
        num_triples = embedded_kb_batch.get_shape()[1].value
        embedded_kb_key = embedded_kb_batch[:, :, :2, :]
        # Summing head + relation
        embedded_kb_key = math_ops.reduce_sum(embedded_kb_key, [2])
        # Dim: (?, num_triples,)
        value_idx = kb_inputs[:, :, 3, 0]

        # Query will usually be of (batch_size, rnn_size)
        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            # Results of attention reads will be stored here.
            ds = []
            # Will store masks over encoder context
            attn_masks = []
            # Store attention logits
            attn_logits = []
            # If the query is a tuple (LSTMStateTuple), flatten it.
            if nest.is_sequence(query):
                query_list = nest.flatten(query)
                # Check that ndims == 2 if specified.
                for q in query_list:
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, axis=1)

            with variable_scope.variable_scope("Attention"):
                if attn_type == "linear":
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[0] * math_ops.tanh(hidden_features[0] + y), [2, 3])
                elif attn_type == "bilinear":
                    query = tf.tile(tf.expand_dims(query, 1), [1, attn_length, 1])
                    query = batch_linear(query, attn_size, bias=True)
                    hid = tf.squeeze(hidden, [2])
                    s = tf.reduce_sum(tf.matmul(query, hid), [2])
                else:
                    # Two layer MLP
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    layer1 = math_ops.tanh(hidden_features[0] + y)
                    k2 = variable_scope.get_variable(
                        "AttnW_1", [1, 1, attn_size, attention_vec_size])
                    layer2 = nn_ops.conv2d(layer1, k2, [1, 1, 1, 1], "SAME")
                    s = math_ops.reduce_sum(v[0] * math_ops.tanh(layer2), [2, 3])

                a = nn_ops.softmax(s)
                attn_masks.append(a)
                attn_logits.append(s)
                # Now calculate the attention-weighted vector d.
                # Hidden is encoder hidden states
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, attn_masks, attn_logits

        def attention_kb_triple(query):
            """Compute attention over KB triples given the decoder hidden state as a query.

            :param query:
            :return:
            """
            # Expand dims so we can concatenate with embedded_key
            with variable_scope.variable_scope("Attention_KB_Triple"):
                if attn_type == "two-mlp":
                    query = tf.expand_dims(query, [1])
                    with variable_scope.variable_scope("KB_key_W1"):
                        key_layer_1 = batch_linear(embedded_kb_key,
                                                   attention_vec_size, bias=True)
                    with variable_scope.variable_scope("Query_W1"):
                        query_layer_1 = batch_linear(query, attention_vec_size,
                                                     bias=True)
                    layer_1 = math_ops.tanh(key_layer_1 + query_layer_1)
                    with variable_scope.variable_scope("KB_Query_W2"):
                        layer_2 = batch_linear(layer_1, attention_vec_size,
                                               bias=True)
                        layer_2 = math_ops.tanh(layer_2)
                    with variable_scope.variable_scope("KB_Query_W3"):
                        layer_3 = batch_linear(layer_2, 1, bias=True)
                    layer_3_logits = tf.squeeze(layer_3, [2])
                    layer_3 = nn_ops.softmax(layer_3_logits)
                    return layer_3, layer_3_logits
                elif attn_type == "linear":
                    query = tf.expand_dims(query, [1])
                    with variable_scope.variable_scope("KB_key_W1"):
                        key_layer_1 = batch_linear(embedded_kb_key,
                                                   attention_vec_size, bias=True)
                    with variable_scope.variable_scope("Query_W1"):
                        query_layer_1 = batch_linear(query, attention_vec_size,
                                                     bias=True)
                    layer_1 = math_ops.tanh(key_layer_1 + query_layer_1)
                    with variable_scope.variable_scope("KB_Query_W2"):
                        layer_2 = batch_linear(layer_1, 1, bias=True)
                    layer_2_logits = tf.squeeze(layer_2, [2])
                    layer_2 = nn_ops.softmax(layer_2_logits)
                    return layer_2, layer_2_logits

        state = initial_state
        outputs = []
        switch_outputs = []
        attn_kb_outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        first_indices = tf.tile(tf.expand_dims(tf.range(batch_size), dim=1),
                                [1, num_triples])

        # Use encoding of query
        if enc_query:
            encoder_q = array_ops.concat([state.c, state.h], axis=1)
            attn_kb, attn_kb_logits = attention_kb_triple(encoder_q)

        # Ensure the second shape of attention vectors is set.
        for a in attns:
            a.set_shape([None, attn_size])

        for i, inp in enumerate(emb_inp):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of
            # the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s"
                                 % inp.name)
            if enc_attn:
                # Use encoder attention as well
                x = linear([inp] + attns, input_size, True)
            else:
                x = linear([inp], input_size, True)

            # Run the RNN.
            cell_output, state = cell(x, state)

            # If the query is a tuple (LSTMStateTuple), flatten it.
            if nest.is_sequence(state):
                query_list = nest.flatten(state)
                # Check that ndims == 2 if specified.
                for q in query_list:
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                concat_state = array_ops.concat(query_list, axis=1)

            if enc_attn:
                attns, attn_masks, attn_logits = attention(state)
            if not enc_query:
                attn_kb, attn_kb_logits = attention_kb_triple(concat_state)

            attn_kb_logits = attn_kb_logits * kb_attn_mask
            # Gather values from KB
            gather_indices = tf.stack([first_indices, value_idx], axis=2)
            updated_p = tf.scatter_nd(gather_indices, attn_kb_logits,
                                      [batch_size, num_decoder_symbols])
            attn_kb_outputs.append(attn_kb_logits)

            with variable_scope.variable_scope("AttnOutputProjection"):
                if enc_attn:
                    output = linear([cell_output] + attns, output_size, True)
                else:
                    output = linear([cell_output], output_size, True)
                # Simply add output logits and attn kb logits
                output = updated_p + output

            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, attn_kb_outputs, switch_outputs
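# Minimal sketch (all shapes and values are assumptions) of the scatter step in
# kv_attention_decoder: attention logits over KB triples are scattered into the
# vocabulary positions given by each triple's value index and then added to the
# decoder's output logits.
def _kb_scatter_sketch():
    import tensorflow as tf
    batch_size, num_triples, vocab = 2, 3, 6
    attn_kb_logits = tf.constant([[0.5, 1.0, 0.2],
                                  [0.1, 0.3, 0.9]])   # [batch, num_triples]
    value_idx = tf.constant([[4, 2, 5],
                             [1, 1, 3]])              # vocab id of each triple's value
    first_indices = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, num_triples])
    gather_indices = tf.stack([first_indices, value_idx], axis=2)   # [batch, num_triples, 2]
    updated_p = tf.scatter_nd(gather_indices, attn_kb_logits, [batch_size, vocab])
    output_logits = tf.zeros([batch_size, vocab])     # stand-in for the RNN output projection
    return output_logits + updated_p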
def attention_decoder(decoder_inputs,      # T * [batch_size, input_size]
                      initial_state,       # [batch_size, cell.states]
                      attention_states,    # [batch_size, attn_length, attn_size]
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        # W_{1}*h_{i} is implemented as a convolution; the resulting tensor has
        # shape [batch_size, attn_length, 1, attention_vec_size].
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in range(num_heads):
            k = variable_scope.get_variable("AttnW_%d" % a,
                                            [1, 1, attn_size, attention_vec_size])
            hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable("AttnV_%d" % a,
                                                 [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            # W_{2}*d_{t} is computed by the linear mapping function `linear` below.
            for a in range(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    # query corresponds to the current hidden state d_t.
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    # Compute u_t.
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                 for _ in range(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp, inp_symbol = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s"
                                 % inp.name)
            x = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
def attention_decoder(initial_state,
                      attention_states,
                      cell,
                      vocab_size,
                      time_steps,
                      batch_size,
                      output_size=None,
                      loop_function=None,
                      dtype=None,
                      scope=None):
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])
        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = variable_scope.get_variable("AttnW",
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = variable_scope.get_variable("AttnV", [attention_vec_size])

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            with variable_scope.variable_scope("Attention_0"):
                y = linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                a = nn_ops.softmax(s)
                # Now calculate the attention-weighted vector d.
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                ds = array_ops.reshape(d, [-1, attn_size])
            return ds

        prev = array_ops.zeros([batch_size, output_size])
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attn = array_ops.zeros(batch_attn_size, dtype=dtype)
        attn.set_shape([None, attn_size])

        def cond(time_step, prev_o_t, prev_softmax_input, state_c, state_h, outputs):
            return time_step < time_steps

        def body(time_step, prev_o_t, prev_softmax_input, state_c, state_h, outputs):
            state = tf.nn.rnn_cell.LSTMStateTuple(state_c, state_h)
            with variable_scope.variable_scope("loop_function", reuse=True):
                inp = loop_function(prev_softmax_input, time_step)
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s"
                                 % inp.name)
            x = tf.concat(1, [inp, prev_o_t])
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            attn = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = math_ops.tanh(linear([cell_output, attn], output_size, False))
            with variable_scope.variable_scope("FinalSoftmax"):
                softmax_input = linear(output, vocab_size, False)

            new_outputs = tf.concat(1, [outputs, tf.expand_dims(softmax_input, 1)])
            return (time_step + tf.constant(1, dtype=tf.int32),
                    output, softmax_input, state.c, state.h, new_outputs)

        time_step = tf.constant(0, dtype=tf.int32)
        shape_invariants = [time_step.get_shape(),
                            prev.get_shape(),
                            tf.TensorShape([batch_size, vocab_size]),
                            tf.TensorShape([batch_size, 512]),
                            tf.TensorShape([batch_size, 512]),
                            tf.TensorShape([batch_size, None, vocab_size])]

        # START keyword is 0
        init_word = np.zeros([batch_size, vocab_size])

        loop_vars = [time_step,
                     prev,
                     tf.constant(init_word, dtype=tf.float32),
                     initial_state.c, initial_state.h,
                     tf.zeros([batch_size, 1, vocab_size])]
        # We just need to feed an empty matrix to start off the while loop,
        # since you can only concat matrices that agree on all but one
        # dimension. Below, we remove that initial filler index.
        outputs = tf.while_loop(cond, body, loop_vars, shape_invariants)

        return outputs[-1][:, 1:], tf.nn.rnn_cell.LSTMStateTuple(outputs[-3], outputs[-2])
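# Stripped-down sketch (shapes are assumptions) of the tf.while_loop pattern used
# above: an output tensor that grows by one time step per iteration needs a shape
# invariant with a None time dimension, plus a dummy first slice that is cut off
# after the loop.
def _while_loop_growing_output_sketch(batch_size=2, vocab_size=4, time_steps=3):
    import tensorflow as tf

    def cond(t, outputs):
        return t < time_steps

    def body(t, outputs):
        step_output = tf.zeros([batch_size, 1, vocab_size])   # stand-in for one decode step
        return t + 1, tf.concat([outputs, step_output], axis=1)

    t0 = tf.constant(0)
    outputs0 = tf.zeros([batch_size, 1, vocab_size])           # dummy slice so concat has a seed
    _, outputs = tf.while_loop(
        cond, body, [t0, outputs0],
        shape_invariants=[t0.get_shape(),
                          tf.TensorShape([batch_size, None, vocab_size])])
    return outputs[:, 1:]                                      # drop the dummy first slice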
def attention_decoder(decoder_inputs, encoder_inputs, initial_state, attention_states,
                      cell, sent_decoder_inputs, sent_encoder_inputs, sent_initial_state,
                      sent_attention_states, sent_cell, dec_timesteps, mode_train=True,
                      switch=None, word_weights=None, output_size=None, num_heads=1,
                      loop_function=None, sent_loop_function=None, dtype=None,
                      scope=None, initial_state_attention=False):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(
            scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        with variable_scope.variable_scope("word_attn") as attn_scope:
            batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
            attn_length = attention_states.get_shape()[1].value
            if attn_length is None:
                attn_length = shape(attention_states)[1]
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(
                attention_states, [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of query vectors for attention.
            for a in xrange(num_heads):
                k = variable_scope.get_variable("AttnW_%d" % a,
                                                [1, 1, attn_size, attention_vec_size])
                hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(
                    variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

            def attention(query, coverage=None):
                """Put attention masks on hidden using hidden_features and query."""
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)
                for a in xrange(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % a):
                        y = linear(query, attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        s = math_ops.reduce_sum(
                            v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        atn = nn_ops.softmax(s)
                        d = math_ops.reduce_sum(
                            array_ops.reshape(atn, [-1, attn_length, 1, 1]) * hidden,
                            [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                return ds, s, atn

            outputs = []
            # prev = None
            batch_attn_size = array_ops.pack([batch_size, attn_size])
            attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                     for _ in xrange(num_heads)]
            for a in attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])
            if initial_state_attention:
                attns, ss, soft_ss = attention(initial_state)

        with variable_scope.variable_scope("sent_attn") as sent_attn_scope:
            # batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
            sent_attn_length = sent_attention_states.get_shape()[1].value
            if sent_attn_length is None:
                sent_attn_length = shape(sent_attention_states)[1]
            sent_attn_size = sent_attention_states.get_shape()[2].value

            sent_hidden = array_ops.reshape(
                sent_attention_states, [-1, sent_attn_length, 1, sent_attn_size])
            sent_hidden_features = []
            sent_v = []
            sent_attention_vec_size = sent_attn_size  # Size of query vectors for attention.
            for a in xrange(num_heads):
                sent_k = variable_scope.get_variable(
                    "sent_AttnW_%d" % a,
                    [1, 1, sent_attn_size, sent_attention_vec_size])
                sent_hidden_features.append(
                    nn_ops.conv2d(sent_hidden, sent_k, [1, 1, 1, 1], "SAME"))
                sent_v.append(
                    variable_scope.get_variable("sent_AttnV_%d" % a,
                                                [sent_attention_vec_size]))

            def sent_attention(query, sent_coverage=None):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)
                for a in xrange(num_heads):
                    with variable_scope.variable_scope("sent_Attention_%d" % a):
                        y = linear(query, sent_attention_vec_size, True)
                        y = array_ops.reshape(y, [-1, 1, 1, sent_attention_vec_size])
                        s = math_ops.reduce_sum(
                            v[a] * math_ops.tanh(sent_hidden_features[a] + y), [2, 3])
                        atn = nn_ops.softmax(s)
                        # sent_coverage = array_ops.expand_dims(array_ops.expand_dims(atn, 2), 2)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(atn, [-1, sent_attn_length, 1, 1]) * sent_hidden,
                            [1, 2])
                        ds.append(array_ops.reshape(d, [-1, sent_attn_size]))
                return ds, s, atn  # , sent_coverage

            outputs = []
            sent_outputs = []
            soft_outputs = []
            soft_sent_outputs = []
            sent_batch_attn_size = array_ops.pack([batch_size, sent_attn_size])
            sent_attns = [array_ops.zeros(sent_batch_attn_size, dtype=dtype)
                          for _ in xrange(num_heads)]
            for a in sent_attns:  # Ensure the second shape of attention vectors is set.
                a.set_shape([None, sent_attn_size])
            if initial_state_attention:
                sent_attns, sent_ss, soft_sent_ss = sent_attention(sent_initial_state)

        hidden_words = []
        hidden_sents = []
        s_w = []
        d_k = variable_scope.get_variable("switch_w",
                                          [1, 1, attn_size, attention_vec_size])
        T_k = variable_scope.get_variable("switch_s",
                                          [1, 1, sent_attn_size, sent_attention_vec_size])
        hidden_words = nn_ops.conv2d(hidden, d_k, [1, 1, 1, 1], "SAME")
        hidden_sents = nn_ops.conv2d(sent_hidden, T_k, [1, 1, 1, 1], "SAME")

        def switch_pos(st_w, st_s, h_w, h_s):
            with variable_scope.variable_scope("switch_w"):
                y_w = linear(st_w, 2, True)
            with variable_scope.variable_scope("switch_s"):
                y_s = linear(st_s, 2, True)
            with variable_scope.variable_scope("switch_hw"):
                y_hw = linear(h_w, 2, True)
            with variable_scope.variable_scope("switch_hs"):
                y_hs = linear(h_s, 2, True)
            s_b = y_s + y_hs + math_ops.tanh(y_w + y_hw)
            s_b = array_ops.reshape(s_b, [-1, 2])
            switch_pb = nn_ops.softmax(s_b)
            return s_b, switch_pb

        sent_state = sent_initial_state
        state = initial_state
        sent_prev = None
        prev = None
        switch_outputs = []
        switch_softmax = []

        for i in xrange(dec_timesteps):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            sb, switch_prob = switch_pos(state, sent_state, attns, sent_attns)
            switch_outputs.append(sb)
            switch_softmax.append(switch_prob)
            inp = decoder_inputs[i]
            sent_inp = sent_decoder_inputs[i]
            if mode_train is not True and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    # print("Calling loop function")
                    if loop_function is not None:
                        inp = loop_function(prev, encoder_inputs)
                        sent_inp = sent_loop_function(sent_prev, sent_encoder_inputs)
            input_size = inp.get_shape().with_rank(2)[1]
            sent_input_size = sent_inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            sent_switch = switch_prob[:, 1]
            word_switch = switch_prob[:, 0]
            # ss(j-1)
            with variable_scope.variable_scope("word_stpes"):
                x = linear(array_ops.concat(2, [[inp], attns, math_ops.tanh(sent_attns)])[0],
                           input_size, True)
                cell_output, state = cell(x, state)

            # ########## Sentence decoder ##########
            with variable_scope.variable_scope("sent_steps"):
                sent_x = linear(array_ops.concat(2, [[sent_inp], sent_attns, math_ops.tanh(attns)])[0],
                                sent_input_size, True)
                sent_cell_output, sent_state = sent_cell(sent_x, sent_state)

            with variable_scope.variable_scope(sent_attn_scope, reuse=True):
                sent_attns, sent_ss, soft_sent_ss = sent_attention(sent_state)
            with variable_scope.variable_scope(attn_scope, reuse=True):
                attns, ss, soft_ss = attention(state)

            soft_ssout = soft_ss * array_ops.reshape([word_switch], [-1, 1])
            soft_sent_ssout = soft_sent_ss * array_ops.reshape([sent_switch], [-1, 1])
            prev = ss
            sent_prev = sent_ss
            outputs.append(soft_ss)
            soft_outputs.append(soft_ssout)
            sent_outputs.append(soft_sent_ss)
            soft_sent_outputs.append(soft_sent_ssout)

    return (outputs, state, sent_outputs, sent_state, switch_outputs, switch_softmax,
            soft_outputs, soft_sent_outputs)  # , coverage
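# Toy sketch (all tensors are assumptions) of the switch used above: a 2-way
# softmax decides how much probability mass goes to the word-level versus the
# sentence-level attention distribution, and each distribution is scaled by its
# share of the gate.
def _switch_gate_sketch():
    import tensorflow as tf
    switch_logits = tf.constant([[2.0, 0.5]])       # [batch, 2], as produced by switch_pos
    switch_prob = tf.nn.softmax(switch_logits)      # [:, 0] = word share, [:, 1] = sentence share
    word_attn = tf.constant([[0.7, 0.2, 0.1]])      # softmax over source words
    sent_attn = tf.constant([[0.4, 0.6]])           # softmax over source sentences
    word_out = word_attn * tf.reshape(switch_prob[:, 0], [-1, 1])
    sent_out = sent_attn * tf.reshape(switch_prob[:, 1], [-1, 1])
    return word_out, sent_out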
def lookup_positives(scores, click_position):
    num_rows = shape(scores)[0]
    row_idx = expand_dims(range(num_rows), axis=1)
    idx = concatenate([row_idx, cast(click_position, int32)], axis=1)
    return gather_nd(scores, idx)
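# Tiny worked example (values are assumptions): with scores [[0.1, 0.9, 0.3],
# [0.5, 0.2, 0.8]] and click_position [[1], [2]], lookup_positives returns
# [0.9, 0.8] -- the score of the clicked slot in each row.
def _lookup_positives_example():
    import tensorflow as tf
    scores = tf.constant([[0.1, 0.9, 0.3],
                          [0.5, 0.2, 0.8]])
    clicks = tf.constant([[1], [2]], dtype=tf.int64)
    return lookup_positives(scores, clicks)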
def to_one_hot(scores): return one_hot(argmax(scores, axis=1), shape(scores)[1])