def __call__(self, inputs, state, scope=None): """ :param inputs: [N, d + JQ + JQ * d] :param state: [N, d] :param scope: :return: """ with tf.variable_scope(scope or self.__class__.__name__): c_prev, h_prev = state x = tf.slice(inputs, [0, 0], [-1, self._input_size]) q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len]) # [N, JQ] qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1]) qs = tf.reshape(qs, [-1, self._q_len, self._input_size]) # [N, JQ, d] x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1]) # [N, JQ, d] h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1]) # [N, JQ, d] f = tf.tanh( linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f')) # [N, JQ, d] a = tf.nn.softmax( exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask)) # [N, JQ] q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1) z = tf.concat(axis=1, values=[x, q]) # [N, 2d] return self._cell(z, state)
def attention(query, keys, key_mask, dim_query, dim_key, dtype=None, scope=None):
    with ops.variable_scope(scope or "attention", dtype=dtype):
        # content-based addressing
        # e_i = v_a^T tanh(W query + key_i)
        # alpha = softmax({e_i})

        # (n_query, dim_query) -> (n_query, dim_key)
        mapped_query = nn.linear(query, [dim_query, dim_key], False,
                                 scope="map-query")
        # (n_key, n_query, dim_key)
        act = T.tanh(mapped_query[None, :, :] + keys)
        # (n_key, n_query, 1)
        e = nn.linear(act, [dim_key, 1], False, scope="pre-alpha")
        # (n_key, n_query, 1) -> (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)

        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)

        return alpha
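# A minimal, self-contained NumPy sketch of the additive ("content-based")
# attention above: e_i = v_a^T tanh(W q + k_i), followed by a masked softmax over
# the key axis. The helper name, explicit W/v arguments, and shapes are
# illustrative assumptions, not this codebase's nn API.
import numpy as np

def additive_attention_sketch(query, keys, key_mask, W, v):
    # query: (n_query, dim_query), keys: (n_key, n_query, dim_key)
    # key_mask: (n_key, n_query), 1 for real keys, 0 for padding
    mapped_query = query.dot(W)                         # (n_query, dim_key)
    act = np.tanh(mapped_query[None, :, :] + keys)      # (n_key, n_query, dim_key)
    e = act.dot(v)                                      # (n_key, n_query)
    e = np.where(key_mask > 0, e, -1e9)                 # mask out padded keys
    e = e - e.max(axis=0, keepdims=True)                # numerical stability
    alpha = np.exp(e) * (key_mask > 0)
    return alpha / (alpha.sum(axis=0, keepdims=True) + 1e-8)  # (n_key, n_query)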
def coarseattention(query, keys, key_mask, dim_query, dim_key, dtype=None, scope=None):
    with ops.variable_scope(scope or "coarseattention", dtype=dtype):
        # content-based addressing over several key sets
        # e_i = v_a^T tanh(W query + key_i)
        # alpha = softmax({e_i})
        e = []
        for i in range(len(keys)):
            # (n_query, dim_query) -> (n_query, dim_key[i])
            mapped_query = nn.linear(query, [dim_query, dim_key[i]], False,
                                     scope="map-query_{}".format(i))
            # (n_key, n_query, dim_key[i])
            act = T.tanh(mapped_query[None, :, :] + keys[i])
            # (n_key, n_query, 1)
            em = nn.linear(act, [dim_key[i], 1], False,
                           scope="pre-alpha_{}".format(i))
            e.append(em)

        # sum the scores from all key sets: (n_key, n_query, 1)
        e = reduce(T.add, e)
        # (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)

        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)

        return alpha
def attention(query, mapped_states, state_size, attn_size, attention_mask=None, scope=None):
    with ops.variable_scope(scope or "attention"):
        mapped_query = nn.linear(query, [state_size, attn_size], False,
                                 scope="query_w")
        mapped_query = mapped_query[None, :, :]
        hidden = theano.tensor.tanh(mapped_query + mapped_states)
        score = nn.linear(hidden, [attn_size, 1], False, scope="attention_v")
        score = score.reshape([score.shape[0], score.shape[1]])
        exp_score = theano.tensor.exp(score)

        if attention_mask is not None:
            exp_score = exp_score * attention_mask

        alpha = exp_score / theano.tensor.sum(exp_score, 0)

        return alpha
def prediction(self, y_emb, state, context, keep_prob=1.0):
    """
    maxout -> readout -> softmax
    p(y_j) \propto f(y_{j-1}, s_{j-1}, c_{j})
    :param y_emb:
    :param state:
    :param context:
    :param keep_prob:
    :return:
    """
    features = [state, y_emb, context]
    maxhid = nn.maxout(features,
                       [[self.dim_hid, self.dim_y, self.dim_value], self.dim_maxout],
                       self.max_part, True)
    readout = nn.linear(maxhid, [self.dim_maxout, self.dim_readout], False,
                        scope="readout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
                       scope="logits")

    if logits.ndim == 3:
        new_shape = [logits.shape[0] * logits.shape[1], -1]
        logits = logits.reshape(new_shape)

    probs = T.nnet.softmax(logits)

    return probs
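# A hedged NumPy sketch of the maxout -> readout -> softmax pipeline above.
# It assumes nn.maxout projects the concatenated features to dim_maxout * max_part
# units and takes the max over each group of max_part units, which is consistent
# with the [self.dim_maxout, self.dim_readout] shape of the readout projection
# that follows. All weight names below are illustrative, not this codebase's API.
import numpy as np

def maxout_readout_sketch(features, W_max, W_read, W_out, b_out, max_part):
    x = np.concatenate(features, axis=-1)               # (batch, sum of feature dims)
    h = x.dot(W_max)                                    # (batch, dim_maxout * max_part)
    h = h.reshape(h.shape[0], -1, max_part).max(-1)     # maxout -> (batch, dim_maxout)
    readout = h.dot(W_read)                             # (batch, dim_readout)
    logits = readout.dot(W_out) + b_out                 # (batch, n_y_vocab)
    logits = logits - logits.max(axis=-1, keepdims=True)  # stable softmax
    probs = np.exp(logits)
    return probs / probs.sum(axis=-1, keepdims=True)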
def __build_decoder(self, dec_inps, total_attn_states, total_mask, global_trace,
                    initial_state, genre, topic_trace):
    with variable_scope.variable_scope("seq2seq_Decoder"):
        dec_cell = tf.nn.rnn_cell.GRUCell(self.hps.hidden_size)
        dec_cell = tf.nn.rnn_cell.DropoutWrapper(
            dec_cell,
            output_keep_prob=self.keep_prob,
            input_keep_prob=self.keep_prob)
        output_size = self.hps.vocab_size  # num_decoder_symbols is the vocabulary size

        if not total_attn_states.get_shape()[1:3].is_fully_defined():
            raise ValueError(
                "Shape[1] and [2] of attn_states must be known: %s" %
                total_attn_states.get_shape())

        dec_outs, dec_states, attn_weights = [], [], []
        normed_dec_outs = []

        # build attn_mask
        imask = 1.0 - total_mask
        attn_mask = tf.where(tf.equal(imask, 1),
                             tf.ones_like(imask) * (-float('inf')), imask)

        calcu_attention = lambda query: self.__attention_calcu(
            total_attn_states, query, attn_mask, "calcu_attention")

        state = initial_state
        decoder_input_size = initial_state.get_shape()[1].value

        for i, inp in enumerate(dec_inps):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)

            # Read memory
            attns, align = calcu_attention(
                [flatten_query(state), global_trace, topic_trace])

            with variable_scope.variable_scope("input_merge"):
                x = linear([inp, attns, genre[i], global_trace],
                           decoder_input_size, True)

            # Run the GRU
            cell_out, state = dec_cell(x, state)

            with variable_scope.variable_scope("OutputProjection"):
                output = linear(cell_out, output_size, True)

            dec_outs.append(tf.identity(output))
            normed_dec_outs.append(tf.identity(tf.nn.softmax(output)))
            attn_weights.append([align])
            dec_states.append(tf.identity(state))

        return normed_dec_outs, dec_outs, dec_states, attn_weights
def fuse_gate_v2(lhs, rhs, is_train=None, wd=0.0, input_keep_prob=1.0, scope=None):
    with tf.variable_scope(scope or "fuse_gate"):
        dim = lhs.get_shape().as_list()[-1]
        lhs_2 = linear(lhs, dim, True, bias_start=0.0, scope="lhs_2", squeeze=False,
                       wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        rhs_2 = linear(rhs, dim, True, bias_start=0.0, scope="rhs_2", squeeze=False,
                       wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        f = tf.sigmoid(lhs_2 + rhs_2)
        out = f * lhs + (1 - f) * rhs
        return out
def fuse_gate(config, is_train, lhs, rhs, scope=None):
    with tf.variable_scope(scope or "fuse_gate"):
        dim = lhs.get_shape().as_list()[-1]
        lhs_1 = linear(lhs, dim, True, bias_start=0.0, scope="lhs_1", squeeze=False,
                       wd=config.wd, input_keep_prob=config.input_keep_prob,
                       is_train=is_train)
        rhs_1 = linear(rhs, dim, True, bias_start=0.0, scope="rhs_1", squeeze=False,
                       wd=0.0, input_keep_prob=config.input_keep_prob,
                       is_train=is_train)
        z = tf.tanh(lhs_1 + rhs_1)
        lhs_2 = linear(lhs, dim, True, bias_start=0.0, scope="lhs_2", squeeze=False,
                       wd=config.wd, input_keep_prob=config.input_keep_prob,
                       is_train=is_train)
        rhs_2 = linear(rhs, dim, True, bias_start=0.0, scope="rhs_2", squeeze=False,
                       wd=config.wd, input_keep_prob=config.input_keep_prob,
                       is_train=is_train)
        f = tf.sigmoid(lhs_2 + rhs_2)
        out = f * lhs + (1 - f) * z
        return out
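# A minimal NumPy sketch of the fuse gate above: a candidate z = tanh(W1 lhs + U1 rhs)
# and a gate f = sigmoid(W2 lhs + U2 rhs) interpolating between lhs and z.
# Weight names are illustrative stand-ins for what `linear` creates; biases omitted.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def fuse_gate_sketch(lhs, rhs, W1, U1, W2, U2):
    z = np.tanh(lhs.dot(W1) + rhs.dot(U1))   # candidate, as in fuse_gate
    f = sigmoid(lhs.dot(W2) + rhs.dot(U2))   # gate
    return f * lhs + (1.0 - f) * z           # fuse_gate_v2 uses rhs in place of z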
def __call__(self, inputs, is_training):
    with tf.variable_scope('decoder'):
        dec_l1 = nn.linear(inputs, (self.z_dim, 1000), 'dec_l1')
        dec_b1 = nn.batch_normalization(dec_l1, 1000, 'dec_b1', is_training)
        dec_r1 = nn.leaky_relu(dec_b1)

        dec_l2 = nn.linear(dec_r1, (1000, 1000), 'dec_l2')
        dec_b2 = nn.batch_normalization(dec_l2, 1000, 'dec_b2', is_training)
        dec_r2 = nn.leaky_relu(dec_b2)

        dec_l3 = nn.linear(dec_r2, (1000, 784), 'dec_l3')
        return tf.sigmoid(dec_l3)
def domain_sensitive_attention(keys, key_mask, dim_key, dim_domain, dtype=None, scope=None):
    with ops.variable_scope(scope or "domain_sensitive_attention", dtype=dtype):
        mapped_keys = nn.linear(keys, [dim_key, dim_domain], True, scope="map-key")
        act = T.tanh(mapped_keys)
        # (n_key, n_query, 1)
        e = nn.linear(act, [dim_domain, 1], False, scope="pre-alpha")
        # (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)

        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)

        return alpha
def __call__(self, inputs, is_training):
    with tf.variable_scope('encoder'):
        enc_l1 = nn.linear(inputs, (784, 1000), 'enc_l1')
        enc_b1 = nn.batch_normalization(enc_l1, 1000, 'enc_b1', is_training)
        enc_r1 = nn.leaky_relu(enc_b1)

        enc_l2 = nn.linear(enc_r1, (1000, 1000), 'enc_l2')
        enc_b2 = nn.batch_normalization(enc_l2, 1000, 'enc_b2', is_training)
        enc_r2 = nn.leaky_relu(enc_b2)

        enc_l3 = nn.linear(enc_r2, (1000, self.z_dim), 'enc_l3')
        encoded = nn.batch_normalization(enc_l3, self.z_dim, 'enc_b3', is_training)
        return encoded
def __call__(self, input1, input2, scope=None, input_keep_prob=1.0, wd=0.0, is_train=None):
    N = tf.shape(input1)[0]
    d1 = input1.get_shape().as_list()[-1]
    d2 = input2.get_shape().as_list()[-1]
    with tf.variable_scope(scope or 'ntn'):
        out1 = tf.reshape(
            tf.matmul(
                tf.reshape(
                    linear(input1, self.rank * d2, False, scope='out1',
                           input_keep_prob=input_keep_prob, wd=wd, is_train=is_train),
                    [N, self.rank, d2]),
                tf.expand_dims(input2, -1)),
            [N, self.rank])  # [N, rank]
        out2 = linear(tf.concat([input1, input2], -1), self.rank, True, scope='out2',
                      input_keep_prob=input_keep_prob, wd=wd, is_train=is_train)
        out = tf.nn.tanh(out1 + out2)
        out = linear(out, 1, False, scope='out',
                     input_keep_prob=input_keep_prob, wd=wd, is_train=is_train)
        return out
def __build_key_memory(self):
    key_states = []
    with variable_scope.variable_scope("EncoderRNN"):
        for i in xrange(0, self.hps.key_slots):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            (outputs, state_fw, state_bw) = rnn.static_bidirectional_rnn(
                self.enc_cell_fw, self.enc_cell_bw, self.emb_key_inps[i],
                dtype=tf.float32)
            key_state = array_ops.concat([state_fw, state_bw], 1)
            key_states.append(key_state)

    with variable_scope.variable_scope("key_memory"):
        key_states = [
            array_ops.reshape(e, [-1, 1, self.enc_cell_fw.output_size * 2])
            for e in key_states
        ]
        key_states = array_ops.concat(key_states, 1)
        key_states = tf.multiply(self.key_mask, key_states)
        final_state = math_ops.reduce_mean(key_states, axis=1)
        final_state = linear(final_state, self.hps.hidden_size, True,
                             scope="key_initial")
        final_state = tf.tanh(final_state)

    return final_state, key_states
def __build_encoder(self, step):
    with variable_scope.variable_scope("EncoderRNN", reuse=True):
        (outputs, enc_state_fw, enc_state_bw) = rnn.static_bidirectional_rnn(
            self.enc_cell_fw, self.enc_cell_bw,
            self.emb_enc_inps[step][:self.enc_len], dtype=tf.float32)
        enc_outs = outputs

    with variable_scope.variable_scope("seq2seq_Encoder"):
        enc_state = enc_state_bw
        final_state = linear(enc_state, self.hps.hidden_size, True,
                             scope="enc_initial")
        final_state = tf.tanh(final_state)
        top_states = [
            array_ops.reshape(e, [-1, 1, self.enc_cell_fw.output_size * 2])
            for e in enc_outs
        ]
        attention_states = array_ops.concat(top_states, 1)
        final_attn_states = tf.multiply(self.enc_mask[step], attention_states)

    return final_state, final_attn_states, enc_outs
def prediction(self, y_emb, state, context, y_pos, keep_prob=1.0):
    """
    readout -> softmax
    p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})
    :param y_pos:
    :param y_emb:
    :param state:
    :param context:
    :param keep_prob:
    :return:
    """
    state = nn.feedforward([state, y_pos],
                           [[self.dim_hid, self.poshdim], self.dim_hid],
                           True, activation=T.tanh, scope="enhancedstate")
    features = [state, y_emb, context, y_pos]
    readout = nn.feedforward(features,
                             [[self.dim_hid, self.dim_y, self.dim_value, self.poshdim],
                              self.dim_readout],
                             True, activation=T.tanh, scope="readout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
                       scope="logits")

    if logits.ndim == 3:
        new_shape = [logits.shape[0] * logits.shape[1], -1]
        logits = logits.reshape(new_shape)

    probs = T.nnet.softmax(logits)

    return probs
def __call__(self, inputs, state, scope=None):
    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    input_size = self.input_size
    output_size = self.output_size

    if len(inputs) != len(input_size):
        raise RuntimeError("unmatched elements: inputs and input_size")

    size = [list(input_size) + [output_size], 4 * output_size]

    with variable_scope(scope or "lstm"):
        c, h = state
        new_inputs = list(inputs[:]) + [h]
        concat = linear(new_inputs, size, True, concat=True, scope="gates")
        i, j, f, o = theano.tensor.split(concat, [output_size] * 4, 4, -1)

        j = theano.tensor.tanh(j)
        # input, forget, output gate
        i = theano.tensor.nnet.sigmoid(i)
        f = theano.tensor.nnet.sigmoid(f)
        o = theano.tensor.nnet.sigmoid(o)

        new_c = c * f + i * j
        # no output activation
        new_h = new_c * o
        new_state = (new_c, new_h)

    return new_h, new_state
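# A hedged NumPy sketch of the gate arithmetic in the LSTM cell above. Note this
# variant applies no tanh to the cell state when producing h (new_h = new_c * o).
# W and b stand in for the single weight/bias that `linear` creates over
# [inputs, h_prev]; the names and shapes here are illustrative.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step_sketch(x, c_prev, h_prev, W, b):
    concat = np.concatenate([x, h_prev], axis=-1).dot(W) + b  # (batch, 4 * output_size)
    i, j, f, o = np.split(concat, 4, axis=-1)
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)   # input, forget, output gates
    j = np.tanh(j)                                 # candidate cell update
    new_c = c_prev * f + i * j
    new_h = new_c * o                              # no output activation, as above
    return new_h, (new_c, new_h)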
def map_attention_states(attention_states, input_size, attn_size, scope=None):
    with ops.variable_scope(scope or "attention"):
        mapped_states = nn.linear(attention_states, [input_size, attn_size], False,
                                  scope="attention_w")
        return mapped_states
def sampleP_theta(self, h_dec, share=None):
    # H_DEC -> X
    with tf.variable_scope("P_theta", reuse=share):
        p = nn.linear(h_dec, self.x_dim_flat)
        x = tf.sigmoid(p)  # mean of the Bernoulli distribution
        x = tf.reshape(x, (-1,) + self.x_dim)  # reshape to original image dimensions
        return x
def __global_trace_update(self, global_trace, enc_states):
    # enc_states: [batch_size, time, self.enc_cell_fw.output_size * 2]
    with variable_scope.variable_scope("global_trace_update", reuse=None):
        # [batch_size, self.enc_cell_fw.output_size * 2]
        state = math_ops.reduce_mean(enc_states, axis=1)
        # [batch_size, global_trace_size]
        new_global_trace = math_ops.tanh(
            linear([global_trace, state], self.hps.global_trace_size, True))
        new_global_trace = array_ops.reshape(new_global_trace,
                                             [-1, self.hps.global_trace_size])
        return new_global_trace
def __call__(self, input, scope=None, input_keep_prob=1.0, wd=0.0, is_train=None):
    N = tf.shape(input)[0]
    with tf.variable_scope(scope or "maxout"):
        out = linear(input, self.num_features * self.dim, False, scope='out')
        out = tf.reshape(out, [N, self.num_features, self.dim])
        out = tf.reduce_max(out, 1)  # [N, dim]
        return out
def __attention_calcu(self, attentions, query, attn_mask, scope):
    with variable_scope.variable_scope(scope):
        attn_length = attentions.get_shape()[1].value  # length of an input sentence
        attn_size = attentions.get_shape()[2].value  # encoder hidden state size, i.e. 2 * size

        # To calculate W1 * h_t we use a 1-by-1 convolution, so reshape first.
        # Remember we use a bidirectional RNN.
        hidden = array_ops.reshape(attentions, [-1, attn_length, 1, attn_size])

        # Size of query vectors for attention; the query vector is the decoder state.
        attention_vec_size = attn_size
        k = variable_scope.get_variable("AttnW",
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = variable_scope.get_variable("AttnV", [attention_vec_size])

        y = linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])

        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y), [2, 3])
        a = nn_ops.softmax(s + attn_mask)

        # Now calculate the attention-weighted vector d.
        d = math_ops.reduce_sum(
            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
        d = array_ops.reshape(d, [-1, attn_size])  # remember this size
        return d, a
def __call__(self, inputs, state, scope=None): with tf.variable_scope(scope or "gru_cell"): with tf.variable_scope("gates"): r_u = linear([inputs, state], 2 * self._num_units, True, scope=scope) r, u = tf.split(r_u, 2, 1) r, u = tf.nn.sigmoid(r), tf.nn.sigmoid(u) with tf.variable_scope("candidate"): c = self._activation( linear([inputs, r * state], self._num_units, True, scope=scope)) new_h = u * state + (1 - u) * c return new_h, new_h
def __call__(self, inputs, state, scope=None): with tf.variable_scope(scope or "basic_rnn_cell"): output = linear([inputs, state], self._num_units, True, scope=scope) output = self._activation(output) return output, output
def transition(self, z):
    # compute A, B, o linearization matrices
    # Z -> H_TRANS -> (A, B, o)
    z_dim = self.z_dim
    u_dim = self.u_dim
    with tf.variable_scope("trans"):
        h = self.dynamics(z)
        with tf.variable_scope("A"):
            v, r = tf.split(1, 2, nn.linear(h, z_dim * 2))
            v1 = tf.expand_dims(v, -1)  # (batch, z_dim, 1)
            rT = tf.expand_dims(r, 1)   # (batch, 1, z_dim)
            I = tf.diag([1.] * z_dim)
            # (z_dim, z_dim) + (batch, z_dim, 1) * (batch, 1, z_dim); I is broadcast
            A = (I + tf.batch_matmul(v1, rT))
        with tf.variable_scope("B"):
            B = nn.linear(h, z_dim * u_dim)
            B = tf.reshape(B, [-1, z_dim, u_dim])
        with tf.variable_scope("o"):
            o = nn.linear(h, z_dim)
        return A, B, o, v, r
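# A hedged NumPy sketch of the rank-one transition matrix built above:
# A = I + v r^T per batch element, so A stays close to the identity while the
# network only predicts 2 * z_dim numbers (v and r) instead of z_dim^2.
import numpy as np

def rank_one_transition_sketch(v, r):
    # v, r: (batch, z_dim)
    z_dim = v.shape[-1]
    I = np.eye(z_dim)
    A = I[None, :, :] + v[:, :, None] * r[:, None, :]  # (batch, z_dim, z_dim)
    return A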
def __call__(self, input1, input2, scope=None, input_keep_prob=1.0, wd=0.0,
             is_train=None, epsilon=1e-8):
    N = tf.shape(input1)[0]
    d1 = input1.get_shape().as_list()[-1]
    d2 = input2.get_shape().as_list()[-1]
    with tf.variable_scope(scope or "hyperbolic_distance"):
        x = 1 + 2 * tf.divide(
            tf.norm(input1 - input2, axis=-1),
            (1 - tf.norm(input1, axis=-1)) * (1 - tf.norm(input2, axis=-1)) + epsilon)
        out = tf.log(x + tf.sqrt(tf.square(x) - 1) + epsilon)
        out = linear(tf.expand_dims(out, 1), 1, True, scope='out')
        return tf.reshape(out, [N])
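# A NumPy sketch of the distance computed above: x = 1 + 2 ||u - v|| / ((1 - ||u||)(1 - ||v||)),
# then arcosh(x) evaluated as log(x + sqrt(x^2 - 1)). This mirrors the code as written;
# the standard Poincare distance uses squared norms in the same expression.
import numpy as np

def hyperbolic_distance_sketch(u, v, epsilon=1e-8):
    num = np.linalg.norm(u - v, axis=-1)
    den = (1.0 - np.linalg.norm(u, axis=-1)) * (1.0 - np.linalg.norm(v, axis=-1)) + epsilon
    x = 1.0 + 2.0 * num / den
    return np.log(x + np.sqrt(np.square(x) - 1.0) + epsilon)  # arcosh(x)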
def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
    features = [prev_state, prev_inputs, context]
    maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim], maxpart, True)
    readout = nn.linear(maxhid, [maxdim, deephid], False, scope="deepout")

    if keep_prob < 1.0:
        readout = nn.dropout(readout, keep_prob=keep_prob)

    logits = nn.linear(readout, [deephid, tvsize], True, scope="logits")

    if logits.ndim == 3:
        new_shape = [logits.shape[0] * logits.shape[1], -1]
        logits = logits.reshape(new_shape)

    probs = theano.tensor.nnet.softmax(logits)

    return probs
def __topic_trace_update(self, topic_trace, key_align, key_states):
    # topic_trace: [batch_size, topic_trace_size + key_slots]
    # key_align:   [batch_size, key_slots]
    # key_states:  [batch_size, key_slots, hidden_size * 2]
    with variable_scope.variable_scope("topic_trace_update"):
        # [batch_size, key_slots, 2 * hidden_size] -> [batch_size, 2 * hidden_size]
        key_used = math_ops.reduce_mean(
            tf.multiply(key_states, tf.expand_dims(key_align, axis=2)), axis=1)
        new_topic_trace = linear(
            [topic_trace[:, 0:self.hps.topic_trace_size], key_used],
            self.hps.topic_trace_size, True)
        new_topic_trace = tf.tanh(new_topic_trace)  # [batch_size, topic_trace_size]
        attn_los = topic_trace[:, self.hps.topic_trace_size:] + key_align  # [batch_size, key_slots]
        fin_topic_trace = array_ops.concat([new_topic_trace, attn_los], 1)
        return fin_topic_trace
def __call__(self, inputs, y, is_training):
    """
    INPUTS:
        inputs: latent code z
        y: one-hot vectors with shape [num_classes + 1] (+1 for unlabelled data)
    RETURN:
        `logits` (i.e. no activation function like sigmoid, softmax, ...)
    """
    # h's shape: (batch_size, z_dim + num_classes + 1)
    h = tf.concat([inputs, y], axis=1)
    with tf.variable_scope('discriminator'):
        h = nn.linear(h, (self.z_dim + self.num_classes + 1, 500), 'disc_l1')
        h = nn.batch_normalization(h, 500, 'disc_b1')
        h = nn.leaky_relu(h)

        h = nn.linear(h, (500, 500), 'disc_l2')
        h = nn.batch_normalization(h, 500, 'disc_b2')
        h = nn.leaky_relu(h)

        logits = nn.linear(h, (500, 1), 'disc_l3')
        return logits
def encode(self, x, share=None):
    # X -> H_ENC
    with tf.variable_scope("encoder", reuse=share):
        for l in range(3):
            x = nn.ReLU(x, 5, "l" + str(l))
        return nn.linear(x, 2 * self.z_dim)
def encode(self, x, share=None):
    with tf.variable_scope("encoder", reuse=share):
        l1 = nn.ReLU(x, 150, "l1")
        l2 = nn.ReLU(l1, 150, "l2")
        h_enc = nn.linear(l2, 2 * self.z_dim)
        return h_enc
def decode(self, z, share=None):
    with tf.variable_scope("decoder", reuse=share):
        l1 = nn.ReLU(z, 200, "l1")
        l2 = nn.ReLU(l1, 200, "l2")
        h_dec = nn.linear(l2, self.x_dim_flat)
        return h_dec