import time

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.placeholder, tf.layers, tf.contrib, ...)


def __init__(
    self,
    learning_rate,
    num_layers,
    size,
    size_layer,
    output_size,
    kernel_size=3,
    n_attn_heads=16,
    dropout=0.9,
):
    # Dilated-CNN seq2seq regressor with multi-head attention; __init__ of a
    # model class whose class statement is omitted in this excerpt.
    # position_encoding() and cnn_block() are helpers defined elsewhere.
    self.X = tf.placeholder(tf.float32, (None, None, size))
    self.Y = tf.placeholder(tf.float32, (None, output_size))

    encoder_embedded = tf.layers.dense(self.X, size_layer)
    encoder_embedded += position_encoding(encoder_embedded)
    e = tf.identity(encoder_embedded)

    # Encoder: stack of dilated convolution blocks with residual connections.
    for i in range(num_layers):
        dilation_rate = 2 ** i
        pad_sz = (kernel_size - 1) * dilation_rate
        with tf.variable_scope('block_%d' % i):
            encoder_embedded += cnn_block(
                encoder_embedded, dilation_rate, pad_sz, size_layer, kernel_size)

    encoder_output, output_memory = encoder_embedded, encoder_embedded + e
    g = tf.identity(encoder_embedded)

    # Decoder: dilated convolution blocks, each followed by multi-head
    # attention over the encoder output and memory.
    for i in range(num_layers):
        dilation_rate = 2 ** i
        pad_sz = (kernel_size - 1) * dilation_rate
        with tf.variable_scope('decode_%d' % i):
            attn_res = h = cnn_block(
                encoder_embedded, dilation_rate, pad_sz, size_layer, kernel_size)
            C = []
            for j in range(n_attn_heads):
                h_ = tf.layers.dense(h, size_layer // n_attn_heads)
                g_ = tf.layers.dense(g, size_layer // n_attn_heads)
                zu_ = tf.layers.dense(encoder_output, size_layer // n_attn_heads)
                ze_ = tf.layers.dense(output_memory, size_layer // n_attn_heads)
                d = tf.layers.dense(h_, size_layer // n_attn_heads) + g_
                dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                a = tf.nn.softmax(dz)
                c_ = tf.matmul(a, ze_)
                C.append(c_)
            c = tf.concat(C, 2)
            h = tf.layers.dense(attn_res + c, size_layer)
            h = tf.nn.dropout(h, keep_prob=dropout)  # dropout is a keep probability
            encoder_embedded += h

    # Take the last element of the batch dimension, squash it, and regress
    # onto the targets with an MSE loss.
    encoder_embedded = tf.sigmoid(encoder_embedded[-1])
    self.logits = tf.layers.dense(encoder_embedded, output_size)
    self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

def __init__(self, input_, dimension=2, learning_rate=0.01,
             hidden_layer=256, epoch=20):
    # Two-layer sigmoid autoencoder; __init__ of a class whose class statement
    # is omitted in this excerpt. Training also happens here, on the full input.
    input_size = input_.shape[1]
    self.X = tf.placeholder("float", [None, input_.shape[1]])

    weights = {
        'encoder_h1': tf.Variable(tf.random_normal([input_size, hidden_layer])),
        'encoder_h2': tf.Variable(tf.random_normal([hidden_layer, dimension])),
        'decoder_h1': tf.Variable(tf.random_normal([dimension, hidden_layer])),
        'decoder_h2': tf.Variable(tf.random_normal([hidden_layer, input_size])),
    }
    biases = {
        'encoder_b1': tf.Variable(tf.random_normal([hidden_layer])),
        'encoder_b2': tf.Variable(tf.random_normal([dimension])),
        'decoder_b1': tf.Variable(tf.random_normal([hidden_layer])),
        'decoder_b2': tf.Variable(tf.random_normal([input_size])),
    }

    # Encoder: input -> hidden_layer -> dimension (the bottleneck, kept on self).
    first_layer_encoder = tf.nn.sigmoid(
        tf.add(tf.matmul(self.X, weights['encoder_h1']), biases['encoder_b1']))
    self.second_layer_encoder = tf.nn.sigmoid(
        tf.add(tf.matmul(first_layer_encoder, weights['encoder_h2']),
               biases['encoder_b2']))

    # Decoder mirrors the encoder: dimension -> hidden_layer -> input_size.
    first_layer_decoder = tf.nn.sigmoid(
        tf.add(tf.matmul(self.second_layer_encoder, weights['decoder_h1']),
               biases['decoder_b1']))
    second_layer_decoder = tf.nn.sigmoid(
        tf.add(tf.matmul(first_layer_decoder, weights['decoder_h2']),
               biases['decoder_b2']))

    # Mean squared reconstruction error, optimized with RMSProp.
    self.cost = tf.reduce_mean(tf.pow(self.X - second_layer_decoder, 2))
    self.optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(self.cost)

    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())

    # Full-batch training for the requested number of epochs.
    for i in range(epoch):
        last_time = time.time()
        _, loss = self.sess.run([self.optimizer, self.cost],
                                feed_dict={self.X: input_})
        if (i + 1) % 10 == 0:
            print('epoch:', i + 1, 'loss:', loss,
                  'time:', time.time() - last_time)

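# Usage sketch (assumptions): the __init__ above clearly belongs to an
# autoencoder class whose name is not shown in this excerpt; "Autoencoder"
# below is a hypothetical name, and scaled_prices stands for any 2-D float
# array of features. Training runs inside __init__, so extracting the
# low-dimensional embedding is a single extra session call:
#
#   ae = Autoencoder(scaled_prices, dimension=2, learning_rate=0.01,
#                    hidden_layer=256, epoch=20)
#   embedding = ae.sess.run(ae.second_layer_encoder,
#                           feed_dict={ae.X: scaled_prices})
#   # embedding.shape == (scaled_prices.shape[0], 2)
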
def _erase_and_write(memory, address, reset_weights, values):
    """Module to erase and write in the external memory.

    Erase operation:
      M_t'(i) = M_{t-1}(i) * (1 - w_t(i) * e_t)

    Add operation:
      M_t(i) = M_t'(i) + w_t(i) * a_t

    where e are the reset_weights, w the write weights and a the values.

    Args:
      memory: 3-D tensor of shape `[batch_size, memory_size, word_size]`.
      address: 3-D tensor `[batch_size, num_writes, memory_size]`.
      reset_weights: 3-D tensor `[batch_size, num_writes, word_size]`.
      values: 3-D tensor `[batch_size, num_writes, word_size]`.

    Returns:
      The updated memory, a 3-D tensor of shape
      `[batch_size, memory_size, word_size]`.
    """
    with tf.name_scope('erase_memory', values=[memory, address, reset_weights]):
        expand_address = tf.expand_dims(address, 3)
        reset_weights = tf.expand_dims(reset_weights, 2)
        weighted_resets = expand_address * reset_weights
        reset_gate = tf.reduce_prod(1 - weighted_resets, [1])
        memory *= reset_gate

    with tf.name_scope('additive_write', values=[memory, address, values]):
        add_matrix = tf.matmul(address, values, adjoint_a=True)
        memory += add_matrix

    return memory

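# Minimal shape check for _erase_and_write (an illustration added here, not
# part of the DNC codebase). The toy sizes are arbitrary; the point is that
# the write weights (address) and erase/write vectors update the memory
# without changing its [batch_size, memory_size, word_size] shape.
def _erase_and_write_demo():
    batch_size, memory_size, word_size, num_writes = 2, 4, 3, 1
    memory = tf.zeros([batch_size, memory_size, word_size])
    address = tf.nn.softmax(tf.random_normal([batch_size, num_writes, memory_size]))
    reset_weights = tf.sigmoid(tf.random_normal([batch_size, num_writes, word_size]))
    values = tf.random_normal([batch_size, num_writes, word_size])
    updated = _erase_and_write(memory, address, reset_weights, values)
    with tf.Session() as sess:
        print(sess.run(updated).shape)  # (2, 4, 3)
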
def __init__(self, learning_rate, num_layers, size, size_layer,
             output_size, forget_bias=0.1):
    # Stacked-LSTM regressor; __init__ of a model class whose class statement
    # is omitted in this excerpt.
    def lstm_cell(size_layer):
        return tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple=False)

    rnn_cells = tf.nn.rnn_cell.MultiRNNCell(
        [lstm_cell(size_layer) for _ in range(num_layers)],
        state_is_tuple=False)
    self.X = tf.placeholder(tf.float32, (None, None, size))
    self.Y = tf.placeholder(tf.float32, (None, output_size))
    # Note: despite its name, forget_bias is used here as the dropout keep
    # probability on the cell outputs.
    drop = tf.contrib.rnn.DropoutWrapper(rnn_cells, output_keep_prob=forget_bias)
    # Flattened initial state, fed externally: (batch, num_layers * 2 * size_layer).
    self.hidden_layer = tf.placeholder(tf.float32,
                                       (None, num_layers * 2 * size_layer))
    self.outputs, self.last_state = tf.nn.dynamic_rnn(
        drop, self.X, initial_state=self.hidden_layer, dtype=tf.float32)
    rnn_W = tf.Variable(tf.random_normal((size_layer, output_size)))
    rnn_B = tf.Variable(tf.random_normal([output_size]))
    self.logits = tf.matmul(self.outputs[-1], rnn_W) + rnn_B
    self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

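# Usage sketch (assumptions): the __init__ above belongs to a model class,
# called Model below only for illustration; window and next_values stand for
# numpy arrays of the shapes noted. The feed pattern is a single window as a
# batch of one, next-step targets in Y, and the flattened LSTM state passed
# through hidden_layer:
#
#   num_layers, size, size_layer, output_size = 1, 1, 128, 1
#   model = Model(0.01, num_layers, size, size_layer, output_size)
#   sess = tf.Session()
#   sess.run(tf.global_variables_initializer())
#   init_state = np.zeros((1, num_layers * 2 * size_layer), dtype=np.float32)
#   batch_x = window[np.newaxis, :, :]      # (1, timesteps, size)
#   batch_y = next_values                   # (timesteps, output_size)
#   _, loss, last_state = sess.run(
#       [model.optimizer, model.cost, model.last_state],
#       feed_dict={model.X: batch_x, model.Y: batch_y,
#                  model.hidden_layer: init_state})
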
def directional_read_weights(self, link, prev_read_weights, forward):
    """Calculates the forward or the backward read weights.

    For each read head (at a given address), there are `num_writes` link
    graphs to follow. Thus this function computes a read address for each of
    the `num_reads * num_writes` pairs of read and write heads.

    Args:
      link: tensor of shape `[batch_size, num_writes, memory_size,
        memory_size]` representing the link graphs L_t.
      prev_read_weights: tensor of shape `[batch_size, num_reads,
        memory_size]` containing the previous read weights w_{t-1}^r.
      forward: Boolean indicating whether to follow the "future" direction in
        the link graph (True) or the "past" direction (False).

    Returns:
      tensor of shape `[batch_size, num_reads, num_writes, memory_size]`
    """
    with tf.name_scope('directional_read_weights'):
        # We calculate the forward and backward directions for each pair of
        # read and write heads; hence we need to tile the read weights and do
        # a sort of "outer product" to get this.
        expanded_read_weights = tf.stack(
            [prev_read_weights] * self._num_writes, 1)
        result = tf.matmul(expanded_read_weights, link, adjoint_b=forward)
        # Swap dimensions 1, 2 so order is [batch, reads, writes, memory]:
        return tf.transpose(result, perm=[0, 2, 1, 3])

def multihead_attn(queries, keys, q_masks, k_masks, future_binding,
                   num_units, num_heads):
    T_q = tf.shape(queries)[1]
    T_k = tf.shape(keys)[1]

    # Project queries, keys and values, then split into num_heads heads
    # stacked along the batch dimension.
    Q = tf.layers.dense(queries, num_units, name='Q')
    K_V = tf.layers.dense(keys, 2 * num_units, name='K_V')
    K, V = tf.split(K_V, 2, -1)

    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)

    # Scaled dot-product attention scores.
    align = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
    align = align / np.sqrt(K_.get_shape().as_list()[-1])

    # Mask out padded key positions.
    paddings = tf.fill(tf.shape(align), float('-inf'))
    key_masks = k_masks
    key_masks = tf.tile(key_masks, [num_heads, 1])
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, T_q, 1])
    align = tf.where(tf.equal(key_masks, 0), paddings, align)

    # Causal (lower-triangular) mask so positions cannot attend to the future.
    if future_binding:
        lower_tri = tf.ones([T_q, T_k])
        lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()
        masks = tf.tile(tf.expand_dims(lower_tri, 0),
                        [tf.shape(align)[0], 1, 1])
        align = tf.where(tf.equal(masks, 0), paddings, align)

    align = tf.nn.softmax(align)

    # Zero out attention coming from padded query positions.
    query_masks = tf.to_float(q_masks)
    query_masks = tf.tile(query_masks, [num_heads, 1])
    query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, T_k])
    align *= query_masks

    # Weighted sum of values, merge heads back, residual connection, layer norm.
    outputs = tf.matmul(align, V_)
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)
    outputs += queries
    outputs = layer_norm(outputs)
    return outputs

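# Shape sketch for multihead_attn (assumptions: the repository's layer_norm
# helper is in scope, and the toy sizes below are illustrative only). q_masks
# and k_masks are 0/1 padding masks of shape (batch, T_q) and (batch, T_k);
# the output keeps the query shape (batch, T_q, num_units).
def _multihead_attn_shape_sketch():
    batch, T_q, T_k, num_units, num_heads = 2, 5, 7, 64, 8
    queries = tf.random_normal([batch, T_q, num_units])
    keys = tf.random_normal([batch, T_k, num_units])
    q_masks = tf.ones([batch, T_q], dtype=tf.int32)  # 1 = real token, 0 = padding
    k_masks = tf.ones([batch, T_k], dtype=tf.int32)
    return multihead_attn(queries, keys, q_masks, k_masks,
                          future_binding=False,
                          num_units=num_units, num_heads=num_heads)
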
def _build(self, memory, keys, strengths):
    """Connects the CosineWeights module into the graph.

    Args:
      memory: A 3-D tensor of shape `[batch_size, memory_size, word_size]`.
      keys: A 3-D tensor of shape `[batch_size, num_heads, word_size]`.
      strengths: A 2-D tensor of shape `[batch_size, num_heads]`.

    Returns:
      Weights tensor of shape `[batch_size, num_heads, memory_size]`.
    """
    # Calculates the inner product between the query vector and words in memory.
    dot = tf.matmul(keys, memory, adjoint_b=True)

    # Outer product to compute denominator (euclidean norm of query and memory).
    memory_norms = _vector_norms(memory)
    key_norms = _vector_norms(keys)
    norm = tf.matmul(key_norms, memory_norms, adjoint_b=True)

    # Calculates cosine similarity between the query vector and words in memory.
    similarity = dot / (norm + _EPSILON)

    return weighted_softmax(similarity, strengths, self._strength_op)

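# Standalone numpy sketch of the cosine-weighting math used above, for one
# batch element and one read head. weighted_softmax, _vector_norms and
# self._strength_op belong to the DNC codebase and are not shown in this
# file; here the strength simply scales the similarities before a softmax,
# which illustrates the idea rather than the module's exact behaviour.
def _cosine_weighting_sketch():
    eps = 1e-6
    memory = np.random.randn(4, 3)              # (memory_size, word_size)
    key = np.random.randn(3)                    # (word_size,)
    strength = 5.0                              # larger -> sharper addressing
    dot = memory @ key                          # inner products, (memory_size,)
    norms = np.linalg.norm(memory, axis=1) * np.linalg.norm(key)
    similarity = dot / (norms + eps)            # cosine similarity per slot
    scores = strength * similarity
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                    # sums to 1 over memory slots
    return weights
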
def _build(self, inputs, prev_state):
    """Connects the MemoryAccess module into the graph.

    Args:
      inputs: tensor of shape `[batch_size, input_size]`. This is used to
        control this access module.
      prev_state: Instance of `AccessState` containing the previous state.

    Returns:
      A tuple `(output, next_state)`, where `output` is a tensor of shape
      `[batch_size, num_reads, word_size]`, and `next_state` is the new
      `AccessState` named tuple at the current time t.
    """
    inputs = self._read_inputs(inputs)

    # Update usage using inputs['free_gate'] and previous read & write weights.
    usage = self._freeness(
        write_weights=prev_state.write_weights,
        free_gate=inputs['free_gate'],
        read_weights=prev_state.read_weights,
        prev_usage=prev_state.usage)

    # Write to memory.
    write_weights = self._write_weights(inputs, prev_state.memory, usage)
    memory = _erase_and_write(
        prev_state.memory,
        address=write_weights,
        reset_weights=inputs['erase_vectors'],
        values=inputs['write_vectors'])

    linkage_state = self._linkage(write_weights, prev_state.linkage)

    # Read from memory.
    read_weights = self._read_weights(
        inputs,
        memory=memory,
        prev_read_weights=prev_state.read_weights,
        link=linkage_state.link)
    read_words = tf.matmul(read_weights, memory)

    return (read_words,
            AccessState(
                memory=memory,
                read_weights=read_weights,
                write_weights=write_weights,
                linkage=linkage_state,
                usage=usage))

def __init__(
    self,
    learning_rate,
    num_layers,
    size,
    size_layer,
    output_size,
    kernel_size=3,
    n_attn_heads=16,
    dropout=0.9,
):
    # Gated-CNN seq2seq regressor with multi-head attention; __init__ of a
    # model class whose class statement is omitted in this excerpt.
    # layer(), encoder_block() and decoder_block() are helpers defined elsewhere.
    self.X = tf.placeholder(tf.float32, (None, None, size))
    self.Y = tf.placeholder(tf.float32, (None, output_size))

    encoder_embedded = tf.layers.dense(self.X, size_layer)
    e = tf.identity(encoder_embedded)

    # Encoder: stacked convolution blocks with dropout (keep_prob = dropout).
    for i in range(num_layers):
        z = layer(
            encoder_embedded,
            encoder_block,
            kernel_size,
            size_layer * 2,
            encoder_embedded,
        )
        z = tf.nn.dropout(z, keep_prob=dropout)
        encoder_embedded = z

    encoder_output, output_memory = z, z + e
    g = tf.identity(encoder_embedded)

    # Decoder: convolution blocks, each followed by multi-head attention over
    # the encoder output and memory.
    for i in range(num_layers):
        attn_res = h = layer(
            encoder_embedded,
            decoder_block,
            kernel_size,
            size_layer * 2,
            residual=tf.zeros_like(encoder_embedded),
        )
        C = []
        for j in range(n_attn_heads):
            h_ = tf.layers.dense(h, size_layer // n_attn_heads)
            g_ = tf.layers.dense(g, size_layer // n_attn_heads)
            zu_ = tf.layers.dense(encoder_output, size_layer // n_attn_heads)
            ze_ = tf.layers.dense(output_memory, size_layer // n_attn_heads)
            d = tf.layers.dense(h_, size_layer // n_attn_heads) + g_
            dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
            a = tf.nn.softmax(dz)
            c_ = tf.matmul(a, ze_)
            C.append(c_)
        c = tf.concat(C, 2)
        h = tf.layers.dense(attn_res + c, size_layer)
        h = tf.nn.dropout(h, keep_prob=dropout)
        encoder_embedded = h

    # Take the last element of the batch dimension, squash it, and regress
    # onto the targets with an MSE loss.
    encoder_embedded = tf.sigmoid(encoder_embedded[-1])
    self.logits = tf.layers.dense(encoder_embedded, output_size)
    self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)