def _create_gates(self, inputs, memory):
    """Create input and forget gates for this step using `inputs` and `memory`.

    Args:
      inputs: Tensor input.
      memory: The current state of memory.

    Returns:
      input_gate: An LSTM-like insert gate.
      forget_gate: An LSTM-like forget gate.
    """
    # We'll create the input and forget gates at once. Hence, calculate double
    # the gate size.
    num_gates = 2 * self._calculate_gate_size()
    batch_size = memory.get_shape().as_list()[0]

    memory = tf.tanh(memory)  # [B, N, H * V]
    inputs = tf.reshape(inputs, [batch_size, -1])  # [B, In_size]
    gate_inputs = linear(inputs, num_gates, use_bias=False, scope='gate_in')  # [B, num_gates]
    gate_inputs = tf.expand_dims(gate_inputs, axis=1)  # [B, 1, num_gates]

    memory_flattened = tf.reshape(memory, [-1, self._mem_size])  # [B * N, H * V]
    gate_memory = linear(memory_flattened, num_gates, use_bias=False, scope='gate_mem')  # [B * N, num_gates]
    gate_memory = tf.reshape(gate_memory, [batch_size, self._mem_slots, num_gates])  # [B, N, num_gates]

    gates = tf.split(gate_memory + gate_inputs, num_or_size_splits=2, axis=2)
    input_gate, forget_gate = gates  # each [B, N, num_gates / 2]

    input_gate = tf.sigmoid(input_gate + self._input_bias)
    forget_gate = tf.sigmoid(forget_gate + self._forget_bias)

    return input_gate, forget_gate
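# --- Example (not part of the original code) ---
# A minimal NumPy sketch of the LSTM-style update that the gates returned by
# _create_gates feed into (see _build below). Shapes and bias values here are
# illustrative assumptions, not the class defaults.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

B, N, mem_size = 2, 4, 6                           # batch, memory slots, H * V
prev_memory = np.random.randn(B, N, mem_size)
candidate = np.random.randn(B, N, mem_size)        # attention output over memory
gate_logits = np.random.randn(B, N, 2 * mem_size)  # stands in for gate_memory + gate_inputs

input_logits, forget_logits = np.split(gate_logits, 2, axis=2)
input_gate = _sigmoid(input_logits + 0.0)          # + self._input_bias in the class
forget_gate = _sigmoid(forget_logits + 1.0)        # + self._forget_bias in the class

# Gated memory update: write the new candidate, keep part of the old memory.
next_memory = input_gate * np.tanh(candidate) + forget_gate * prev_memory
print(next_memory.shape)                           # (2, 4, 6)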
def _multihead_attention(self, memory):
    """Perform multi-head attention from 'Attention is All You Need'.

    Implementation of the attention mechanism from
    https://arxiv.org/abs/1706.03762.

    Args:
      memory: Memory tensor to perform attention on, with size [B, N, H * V].

    Returns:
      new_memory: New memory tensor.
    """
    qkv_size = 2 * self._key_size + self._head_size
    total_size = qkv_size * self._num_heads  # Denote as F.
    batch_size = memory.get_shape().as_list()[0]  # Denote as B.

    memory_flattened = tf.reshape(memory, [-1, self._mem_size])  # [B * N, H * V]
    qkv = linear(memory_flattened, total_size, use_bias=False, scope='lin_qkv')  # [B * N, F]
    qkv = tf.reshape(qkv, [batch_size, -1, total_size])  # [B, N, F]
    qkv = tf.contrib.layers.layer_norm(qkv, trainable=True)  # [B, N, F]

    # [B, N, F] -> [B, N, H, F/H]
    qkv_reshape = tf.reshape(qkv, [batch_size, -1, self._num_heads, qkv_size])

    # [B, N, H, F/H] -> [B, H, N, F/H]
    qkv_transpose = tf.transpose(qkv_reshape, [0, 2, 1, 3])
    q, k, v = tf.split(qkv_transpose, [self._key_size, self._key_size, self._head_size], -1)

    q *= qkv_size ** -0.5
    dot_product = tf.matmul(q, k, transpose_b=True)  # [B, H, N, N]
    weights = tf.nn.softmax(dot_product)

    output = tf.matmul(weights, v)  # [B, H, N, V]

    # [B, H, N, V] -> [B, N, H, V]
    output_transpose = tf.transpose(output, [0, 2, 1, 3])

    # [B, N, H, V] -> [B, N, H * V]
    new_memory = tf.reshape(output_transpose, [batch_size, -1, self._mem_size])
    return new_memory
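# --- Example (not part of the original code) ---
# A NumPy shape trace of the multi-head attention above, with toy sizes
# (batch, slots, heads, key/head sizes are illustrative assumptions).
import numpy as np

def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, N, num_heads, key_size, head_size = 2, 5, 3, 4, 4
qkv_size = 2 * key_size + head_size                # F / H

qkv = np.random.randn(B, N, num_heads, qkv_size).transpose(0, 2, 1, 3)  # [B, H, N, F/H]
q, k, v = np.split(qkv, [key_size, 2 * key_size], axis=-1)

q = q * qkv_size ** -0.5                           # same scaling as the code above
weights = _softmax(q @ np.swapaxes(k, -1, -2))     # [B, H, N, N]
output = weights @ v                               # [B, H, N, V]
new_memory = output.transpose(0, 2, 1, 3).reshape(B, N, num_heads * head_size)
print(new_memory.shape)                            # (2, 5, 12) == [B, N, H * V]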
def _make_generator(self, input, phase_train):
    """Builds the generator: project `input` (z), then upsample with two deconvolutions."""
    s_h, s_w = self.img_size, self.img_size
    s_h2, s_w2 = self._conv_out_size_same(s_h, 2), self._conv_out_size_same(s_w, 2)
    s_h4, s_w4 = self._conv_out_size_same(s_h2, 2), self._conv_out_size_same(s_w2, 2)
    # s_h8, s_w8 = self._conv_out_size_same(s_h4, 2), self._conv_out_size_same(s_w4, 2)
    # s_h16, s_w16 = self._conv_out_size_same(s_h8, 2), self._conv_out_size_same(s_w8, 2)

    # Project `z` and reshape.
    self.z_, self.h0_w, self.h0_b = ops.linear(
        input, self.gf_dim * 8 * s_h4 * s_w4, 'g_h0_lin', with_w=True)
    normalized_value = ops.batch_norm(
        self.z_, name='g_bn0', axes=[0], phase_train=phase_train)
    self.h0 = tf.reshape(normalized_value, [-1, s_h4, s_w4, self.gf_dim * 8])
    h0 = ops.lrelu(self.h0)

    self.h1, self.h1_w, self.h1_b = ops.deconv2d(
        h0, [self.batch_size, s_h2, s_w2, self.gf_dim * 4], name='g_h1', with_w=True)
    h1 = ops.lrelu(ops.batch_norm(self.h1, name='g_bn1', phase_train=phase_train))

    # h2, self.h2_w, self.h2_b = ops.deconv2d(
    #     h1, [self.batch_size, s_h4, s_w4, self.gf_dim * 2], name='g_h2', with_w=True)
    # h2 = tf.nn.relu(ops.batch_norm(h2, name='g_bn2'))
    #
    # h3, self.h3_w, self.h3_b = ops.deconv2d(
    #     h2, [self.batch_size, s_h2, s_w2, self.gf_dim * 1], name='g_h3', with_w=True)
    # h3 = tf.nn.relu(ops.batch_norm(h3, name='g_bn3'))

    h2, self.h2_w, self.h2_b = ops.deconv2d(
        h1, [self.batch_size, s_h, s_w, self.c_dim], name='g_h4', with_w=True)
    h2_non_linear = ops.lrelu(h2, leak=0)
    return h2_non_linear
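# --- Note (not part of the original code) ---
# `_conv_out_size_same` is used above but not defined here. A common definition,
# assumed here to match the usual DCGAN convention for SAME-padded, stride-2
# deconvolutions (written as a free function for brevity), is:
import math

def _conv_out_size_same(size, stride):
    # Spatial size of a SAME-padded convolution output with the given stride.
    return int(math.ceil(float(size) / float(stride)))

# E.g. with img_size = 28 this gives s_h2 = s_w2 = 14 and s_h4 = s_w4 = 7, so
# `z` is projected to a 7 x 7 feature map and upsampled twice back to 28 x 28.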
def _build(self, inputs, memory):
    """Adds relational memory to the TensorFlow graph.

    Args:
      inputs: Tensor input.
      memory: Memory output from the previous time step.

    Returns:
      output: This time step's output.
      next_memory: The next version of memory to use.
    """
    batch_size = memory.get_shape().as_list()[0]
    inputs = tf.reshape(inputs, [batch_size, -1])  # [B, In_size]
    inputs = linear(inputs, self._mem_size, use_bias=True, scope='input_for_concat')  # [B, V * H]
    inputs_reshape = tf.expand_dims(inputs, 1)  # [B, 1, V * H]

    memory_plus_input = tf.concat([memory, inputs_reshape], axis=1)  # [B, N + 1, V * H]
    # Self-attention over the memory to compute M(t + 1).
    next_memory = self._attend_over_memory(memory_plus_input)  # [B, N + 1, V * H]
    n = inputs_reshape.get_shape().as_list()[1]
    next_memory = next_memory[:, :-n, :]  # [B, N, V * H]; drop the extra slot added for the input.

    if self._gate_style == 'unit' or self._gate_style == 'memory':
        self._input_gate, self._forget_gate = self._create_gates(inputs_reshape, memory)
        next_memory = self._input_gate * tf.tanh(next_memory)
        next_memory += self._forget_gate * memory

    # For the output we simply flatten the memory to one dimension. This dimension is not the
    # vocabulary size, because this output then passes through an MLP.
    # The output comes directly from the new memory; it could also be used to determine lambda.
    output = tf.reshape(next_memory, [batch_size, -1])
    return output, next_memory
def _build(self, inputs, memory):
    """Adds relational memory to the TensorFlow graph.

    Args:
      inputs: Tensor input.
      memory: Memory output from the previous time step.

    Returns:
      output: This time step's output.
      next_memory: The next version of memory to use.
    """
    batch_size = memory.get_shape().as_list()[0]
    inputs = tf.reshape(inputs, [batch_size, -1])  # [B, In_size]
    inputs = linear(inputs, self._mem_size, use_bias=True, scope='input_for_concat')  # [B, V * H]
    inputs_reshape = tf.expand_dims(inputs, 1)  # [B, 1, V * H]

    memory_plus_input = tf.concat([memory, inputs_reshape], axis=1)  # [B, N + 1, V * H]
    next_memory = self._attend_over_memory(memory_plus_input)  # [B, N + 1, V * H]
    n = inputs_reshape.get_shape().as_list()[1]
    next_memory = next_memory[:, :-n, :]  # [B, N, V * H]

    if self._gate_style == 'unit' or self._gate_style == 'memory':
        self._input_gate, self._forget_gate = self._create_gates(inputs_reshape, memory)
        next_memory = self._input_gate * tf.tanh(next_memory)
        next_memory += self._forget_gate * memory

    output = tf.reshape(next_memory, [batch_size, -1])
    return output, next_memory
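# --- Example (not part of the original code) ---
# A hedged sketch of how a relational-memory core with the _build above might be
# unrolled over a sequence. `cell`, `inputs_by_step` and `initial_memory` are
# hypothetical names; the call signature assumes a Sonnet-style module, where
# calling the module invokes _build.
import tensorflow as tf  # TF 1.x, as in the surrounding code

def unroll_memory_core(cell, inputs_by_step, initial_memory):
    """inputs_by_step: list of [B, In_size] tensors for t = 0 .. T-1."""
    memory = initial_memory
    outputs = []
    for step_input in inputs_by_step:
        output, memory = cell(step_input, memory)  # one _build step
        outputs.append(output)
    return tf.stack(outputs, axis=1), memory       # [B, T, N * V * H], final memory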
def logits(self, x_onehot):
    batch_size = self.batch_size
    seq_len = self.seq_len
    vocab_size = self.vocab_size
    dis_emb_dim = self.dis_emb_dim
    num_rep = self.num_rep
    sn = self.sn

    # Get the embedding dimension for each representation.
    emb_dim_single = int(dis_emb_dim / num_rep)
    assert isinstance(emb_dim_single, int) and emb_dim_single > 0

    filter_sizes = [2, 3, 4, 5]
    num_filters = [300, 300, 300, 300]
    dropout_keep_prob = 0.75

    d_embeddings = tf.get_variable('d_emb', shape=[vocab_size, dis_emb_dim],
                                   initializer=create_linear_initializer(vocab_size))
    input_x_re = tf.reshape(x_onehot, [-1, vocab_size])
    emb_x_re = tf.matmul(input_x_re, d_embeddings)
    # batch_size x seq_len x dis_emb_dim
    emb_x = tf.reshape(emb_x_re, [batch_size, seq_len, dis_emb_dim])

    # batch_size x seq_len x dis_emb_dim x 1
    emb_x_expanded = tf.expand_dims(emb_x, -1)
    # print('shape of emb_x_expanded: {}'.format(emb_x_expanded.get_shape().as_list()))

    # Create a convolution + maxpool layer for each filter size.
    pooled_outputs = []
    for filter_size, num_filter in zip(filter_sizes, num_filters):
        conv = conv2d(emb_x_expanded, num_filter, k_h=filter_size, k_w=emb_dim_single,
                      d_h=1, d_w=emb_dim_single, sn=sn, stddev=None, padding='VALID',
                      scope="conv-%s" % filter_size)  # batch_size x (seq_len-k_h+1) x num_rep x num_filter
        out = tf.nn.relu(conv, name="relu_new")
        pooled = tf.nn.max_pool(out, ksize=[1, seq_len - filter_size + 1, 1, 1],
                                strides=[1, 1, 1, 1], padding='VALID',
                                name="pool_new")  # batch_size x 1 x num_rep x num_filter
        pooled_outputs.append(pooled)

    # Combine all the pooled features.
    num_filters_total = sum(num_filters)
    # batch_size x 1 x num_rep x num_filters_total
    h_pool = tf.concat(pooled_outputs, 3)
    # print('shape of h_pool: {}'.format(h_pool.get_shape().as_list()))
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Add highway.
    # (batch_size*num_rep) x num_filters_total
    h_highway = highway(h_pool_flat, h_pool_flat.get_shape()[1], 1, 0)

    # Add dropout.
    h_drop = tf.nn.dropout(h_highway, dropout_keep_prob, name='dropout_new')

    # Fully connected output layers.
    fc_out = linear(h_drop, output_size=100, use_bias=True, sn=sn, scope='fc_new')
    logits = linear(fc_out, output_size=1, use_bias=True, sn=sn, scope='logits_new')
    logits = tf.squeeze(logits, -1)  # batch_size*num_rep
    return logits
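# --- Example (not part of the original code) ---
# A plain-Python shape trace of the "multiple representations" trick above:
# sliding a k_w = emb_dim_single window with stride d_w = emb_dim_single across
# the dis_emb_dim axis yields num_rep columns, so each conv filter scores
# num_rep independent slices of the embedding. Sizes are illustrative.
batch_size, seq_len, dis_emb_dim, num_rep = 4, 20, 64, 8
emb_dim_single = dis_emb_dim // num_rep                          # 8
filter_size = 3

conv_h = seq_len - filter_size + 1                               # VALID, stride 1 over time -> 18
conv_w = (dis_emb_dim - emb_dim_single) // emb_dim_single + 1    # VALID, stride emb_dim_single -> num_rep
print(conv_h, conv_w)                                            # 18 8
# After max-pooling over time: [batch_size, 1, num_rep, num_filter]; after the
# final squeeze, logits has shape [batch_size * num_rep].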