def call(self, x, mask=None):
    features_dim = self.features_dim
    step_dim = self.step_dim
    # xw = K.reshape(K.dot(x[0], K.reshape(self.W, (features_dim, features_dim))), (-1, features_dim))
    # yavg = K.reshape(K.mean(K.mean(x[1], axis=1, keepdims=True), axis=0, keepdims=True), (features_dim, -1))
    xw1 = K.dot(x[0], K.reshape(self.W1, (features_dim, features_dim)))
    xw2 = K.dot(x[1], K.reshape(self.W2, (features_dim, features_dim)))
    xw1t = K.permute_dimensions(xw1, [0, 2, 1])
    xw2t = K.permute_dimensions(xw2, [0, 2, 1])
    xw11 = K.batch_dot(xw1, xw1t) / (step_dim ** 0.5)
    xw12 = K.batch_dot(xw1, xw2t) / (step_dim ** 0.5)
    s11 = self.ll * K.softmax(xw11)
    s12 = (1 - self.ll) * K.softmax(xw12)
    eij = s11 + s12
    print(eij.get_shape())
    V = x[0] * K.mean(eij, axis=2, keepdims=True)
    if self.get_alpha:
        return eij
    elif self.get_sequence:
        return V
    else:
        return K.sum(V, axis=1)
def call(self, inputs, training=None):
    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    W_shape = self.kernel.shape.as_list()
    # Flatten the tensor
    W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
    _u, _v = power_iteration(W_reshaped, self.u)
    # Calculate sigma
    sigma = K.dot(_v, W_reshaped)
    sigma = K.dot(sigma, K.transpose(_u))
    # Normalize it
    W_bar = W_reshaped / sigma
    # Reshape weight tensor
    if training in {0, False}:
        W_bar = K.reshape(W_bar, W_shape)
    else:
        with tf.control_dependencies([self.u.assign(_u)]):
            W_bar = K.reshape(W_bar, W_shape)
    output = K.dot(inputs, W_bar)
    if self.use_bias:
        output = K.bias_add(output, self.bias, data_format='channels_last')
    if self.activation is not None:
        output = self.activation(output)
    return output
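# A minimal NumPy sketch (separate from the layer above, illustrative names and
# shapes only) of why the power-iteration step approximates sigma, the largest
# singular value: after a few u/v updates, v @ W @ u.T converges to sigma, so
# dividing W by sigma gives a kernel with spectral norm close to 1.
import numpy as np

def _l2normalize_np(v, eps=1e-12):
    return v / (np.sum(v ** 2) ** 0.5 + eps)

W = np.random.randn(64, 32).astype(np.float32)   # flattened kernel (in, out)
u = np.random.randn(1, 32).astype(np.float32)    # persistent u vector
for _ in range(50):                              # the layer runs a single step per call
    v = _l2normalize_np(u @ W.T)                 # (1, 64)
    u = _l2normalize_np(v @ W)                   # (1, 32)
sigma = (v @ W @ u.T)[0, 0]
print(sigma, np.linalg.svd(W, compute_uv=False)[0])  # nearly identical values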
def call(self, x, mask=None):
    '''
    shape = (batch_size, new_time_step, filters)
    x_cont: Tensor("layer_dropout_5/cond/Identity:0", shape=(None, None, 128), dtype=float32)
    x_ques: Tensor("layer_dropout_11/cond/Identity:0", shape=(None, None, 128), dtype=float32)
    c_mask: Tensor("batch_slice_4/Slice:0", shape=(None, None), dtype=bool)
    q_mask: Tensor("batch_slice_5/Slice:0", shape=(None, None), dtype=bool)
    '''
    x_cont, x_ques, c_mask, q_mask = x
    # get similarity matrix S
    # K.dot(x_cont, self.W0) shape change: [batch_size, time_step, dim] x [dim, 1] = [batch_size, time_step, 1]
    subres0 = K.tile(K.dot(x_cont, self.W0), [1, 1, self.q_maxlen])
    subres1 = K.tile(
        K.permute_dimensions(K.dot(x_ques, self.W1), pattern=(0, 2, 1)),
        [1, self.c_maxlen, 1])
    subres2 = K.batch_dot(x_cont * self.W2,
                          K.permute_dimensions(x_ques, pattern=(0, 2, 1)))
    S = subres0 + subres1 + subres2
    S += self.bias
    q_mask = tf.expand_dims(q_mask, 1)
    # softmax defaults to the last axis, i.e. axis=-1
    S_ = tf.nn.softmax(self.mask_logits(S, q_mask))
    c_mask = tf.expand_dims(c_mask, 2)
    S_T = K.permute_dimensions(
        tf.nn.softmax(self.mask_logits(S, c_mask), axis=1), (0, 2, 1))
    c2q = tf.matmul(S_, x_ques)
    q2c = tf.matmul(tf.matmul(S_, S_T), x_cont)
    result = K.concatenate([x_cont, c2q, x_cont * c2q, x_cont * q2c], axis=-1)
    return result
def energy_step(inputs, states):
    assert_msg = "States must be a list. However states {} is of type {}".format(
        states, type(states))
    assert isinstance(states, list) or isinstance(states, tuple), assert_msg

    en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
    de_hidden = inputs.shape[-1]

    reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
    W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                          (-1, en_seq_len, en_hidden))
    if verbose:
        print('wa.s > ', W_a_dot_s.shape)

    U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # (batch_size, 1, latent_dim)
    if verbose:
        print('Ua.h > ', U_a_dot_h.shape)

    reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
    if verbose:
        print('Ws+Uh > ', reshaped_Ws_plus_Uh.shape)

    e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
    e_i = K.softmax(e_i)
    if verbose:
        print('ei > ', e_i.shape)
    return e_i, [e_i]
def call(self, inputs, **kwargs):
    if not (isinstance(inputs, list) and len(inputs) == 2):
        raise ValueError(
            'You can call this layer only with a list of two tensors '
            '(for keys/values and queries)')
    key_values_input, query_input = inputs
    _, value_seq_len, d_model = K.int_shape(key_values_input)
    query_seq_len = K.int_shape(inputs[1])[-2]
    # The first thing we need to do is to perform affine transformations
    # of the inputs to get the Queries, the Keys and the Values.
    kv = K.dot(K.reshape(key_values_input, [-1, d_model]), self.kv_weights)
    # splitting the keys, the values and the queries before further
    # processing
    pre_k, pre_v = [
        K.reshape(
            # K.slice(kv, (0, i * d_model), (-1, d_model)),
            kv[:, i * d_model: (i + 1) * d_model],
            (-1, value_seq_len, self.num_heads, d_model // self.num_heads))
        for i in range(2)]
    pre_q = K.reshape(
        K.dot(K.reshape(query_input, [-1, d_model]), self.q_weights),
        (-1, query_seq_len, self.num_heads, d_model // self.num_heads))
    return self.attention(pre_q, pre_v, pre_k, query_seq_len, d_model,
                          training=kwargs.get('training'))
def _compute_carry_and_output(self, x, h_tm1, c_tm1):
    """Computes carry and output using split kernels."""
    # x, h_tm1, c_tm1 : complex64
    # c_tm1 = c_ops.check_nan(c_tm1, 'carry c_tm1')
    x_i, x_f, x_c, x_o = x
    h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
    recurrent_kernel_complex = c_ops.tf_to_complex(
        self.recurrent_kernel_real, self.recurrent_kernel_imag)
    i = x_i + K.dot(h_tm1_i, recurrent_kernel_complex[:, :self.units])
    i = self.activate_complex_real_imag_independently(
        self.recurrent_activation, i)
    # i = c_ops.check_nan(i, 'carry i')
    i *= tf.complex(0.5, 0.0)
    f = self.activate_complex_real_imag_independently(
        self.recurrent_activation,
        x_f + K.dot(h_tm1_f,
                    recurrent_kernel_complex[:, self.units:self.units * 2]))
    # f = c_ops.check_nan(f, 'carry f')
    f *= tf.complex(0.5, 0.0)
    c = f * c_tm1 + i * self.activate_complex_real_imag_independently(
        self.activation,
        x_c + K.dot(h_tm1_c,
                    recurrent_kernel_complex[:, self.units * 2:self.units * 3]))
    # c = c_ops.check_nan(c, 'carry c')
    o = self.activate_complex_real_imag_independently(
        self.recurrent_activation,
        x_o + K.dot(h_tm1_o, recurrent_kernel_complex[:, self.units * 3:]))
    # o = c_ops.check_nan(o, 'carry o')
    return c, o  # complex
def call(self, x):
    eij1 = K.reshape(
        K.dot(K.reshape(x[:, :, 0:768], (-1, self.features_dim)),
              K.reshape(self.W, (self.features_dim, 1))),
        (-1, self.step_dim))
    eij1 += self.b
    eij1 = K.expand_dims(eij1)
    eij2 = K.reshape(
        K.dot(K.reshape(x[:, :, 768:768 * 2], (-1, self.features_dim)),
              K.reshape(self.W, (self.features_dim, 1))),
        (-1, self.step_dim))
    eij2 += self.b
    eij2 = K.expand_dims(eij2)
    eij3 = K.reshape(
        K.dot(K.reshape(x[:, :, 768 * 2:768 * 3], (-1, self.features_dim)),
              K.reshape(self.W, (self.features_dim, 1))),
        (-1, self.step_dim))
    eij3 += self.b
    eij3 = K.expand_dims(eij3)
    eij = keras.layers.concatenate([eij1, eij2, eij3], axis=2)
    print(eij)
    eij = K.tanh(eij)
    a = K.exp(eij)
    a /= K.cast(K.sum(a, axis=2, keepdims=True) + K.epsilon(), K.floatx())
    print(a)
    temp = (a[:, :, 0:1] * x[:, :, 0:768]
            + a[:, :, 1:2] * x[:, :, 768:768 * 2]
            + a[:, :, 2:3] * x[:, :, 768 * 2:768 * 3])
    print(temp)
    return temp
def call(self, inputs, **kwargs):
    main_input, embedding_matrix = inputs
    input_shape_tensor = K.shape(main_input)
    last_input_dim = K.int_shape(main_input)[-1]
    emb_input_dim, emb_output_dim = K.int_shape(embedding_matrix)
    projected = K.dot(K.reshape(main_input, (-1, last_input_dim)),
                      self.embedding_weights['projection'])
    if self.add_biases:
        projected = K.bias_add(projected, self.embedding_weights['biases'],
                               data_format='channels_last')
    if 0 < self.projection_dropout < 1:
        projected = K.in_train_phase(
            lambda: K.dropout(projected, self.projection_dropout),
            projected,
            training=kwargs.get('training'))
    attention = K.dot(projected, K.transpose(embedding_matrix))
    if self.scaled_attention:
        # scaled dot-product attention, described in
        # "Attention is all you need" (https://arxiv.org/abs/1706.03762)
        sqrt_d = K.constant(math.sqrt(emb_output_dim), dtype=K.floatx())
        attention = attention / sqrt_d
    result = K.reshape(
        self.activation(attention),
        (input_shape_tensor[0], input_shape_tensor[1], emb_input_dim))
    return result
def call(self, inputs, states, training=None):
    vh = states[0]
    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=2)
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(vh, training, count=2)
    if 0. < self.dropout < 1.:
        input1 = inputs * dp_mask[0]
        input2 = inputs * dp_mask[1]
    else:
        input1 = inputs
        input2 = inputs
    p11 = K.dot(input1, self.kernel[:, :self.units])
    p21 = K.dot(input2, self.kernel[:, self.units:])
    if self.use_bias:
        p11 = K.bias_add(p11, self.bias[:self.units])
        p21 = K.bias_add(p21, self.bias[self.units:])
    if 0. < self.recurrent_dropout < 1.:
        vh1 = vh * rec_dp_mask[0]
        vh2 = vh * rec_dp_mask[1]
    else:
        vh1 = vh
        vh2 = vh
    v1 = self.recurrent_activation(
        p11 + K.dot(vh1, self.recurrent_kernel[:, :self.units]))
    v2 = self.activation(
        p21 + K.dot(vh2 * v1, self.recurrent_kernel[:, self.units:]))
    vh = (1 - v1) * vh + v1 * v2
    return vh, [vh]
def call(self, inputs, states, training=None):
    # get the standard hidden state from super
    output = super(STTAUCell, self).call(inputs, states)
    h_before = output[0]
    c = output[1][1]
    # the following part modifies the hidden state to create STTAU
    # sizes: B = batch size, H = hidden dimension size,
    #        C = number of centroids
    # BxC = BxH & HxC
    unnormalized_probs = K.dot(h_before, self.centroid_kernel)
    # Gumbel-Softmax sample with (learnt) temperature & unnormalized_probs
    q_y = tfp.distributions.RelaxedOneHotCategorical(
        self.temperature_weight, unnormalized_probs)
    # BxC
    y = q_y.sample()
    if self.hard_sample is True:
        # y_hard is a one-hot vector with BxC
        y_hard = tf.cast(tf.one_hot(tf.argmax(y, -1), self.centroids), y.dtype)
        y = tf.stop_gradient(y_hard - y) + y
    # BxH = BxC & CxH
    h_after = K.dot(y, K.transpose(self.centroid_kernel))
    # end of STTAU modification
    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h_after._uses_learning_phase = True
    return h_before, [h_after, c]
def call(self, x):
    print(x)
    features_dim = x.shape[-1].value
    step_dim = x.shape[-2].value
    print(K.reshape(self.kernel, (-1, features_dim)))  # n, d
    print(K.reshape(self.W, (features_dim, 1)))  # w = d x 1
    print(K.dot(K.reshape(self.kernel, (-1, features_dim)),
                K.reshape(self.W, (features_dim, 1))))  # n x 1
    eij = K.reshape(
        K.dot(K.reshape(self.kernel, (-1, features_dim)),
              K.reshape(self.W, (features_dim, 1))),
        (-1, step_dim))  # batch, step
    print(eij)
    eij += self.b
    eij = K.tanh(eij)
    a = K.exp(eij)
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    a = tf.transpose(a, (1, 0))
    print(a)
    print("x:")
    print(self.kernel)
    # broadcasting expands both operands to matching shapes before the
    # elementwise product: N x T x K
    weighted_input = self.kernel * a
    print(weighted_input.shape)
    # sum the weighted rows: N x K
    temp = K.sum(weighted_input, axis=0)
    temp = K.tile(K.expand_dims(temp, 0), [step_dim, 1])
    temp = keras.layers.concatenate([self.kernel, temp])
    temp = K.dot(temp, self.W2) + self.b2
    return x + temp
def energy_step(inputs, states): """ Step function for computing energy for a single decoder state """ # input: (batch_size, latent_dim) assert_msg = "States must be a list. However states {} is of type {}".format( states, type(states)) assert isinstance(states, list) or isinstance(states, tuple), assert_msg """ Computing sj.Ua """ # (batch_size, 1, d3) U_a_dot_s = K.expand_dims(K.dot(inputs, self.U_a), 1) if verbose: print('Ua.h>', K.int_shape(U_a_dot_s)) """ tanh(h.Wa + s.Ua) """ # (batch_size, h1*h2*...*hn, d3) = (batch_size, h1*h2*...*hn, d3) + (batch_size, 1, d3) Wh_plus_Us = K.tanh(W_hi + U_a_dot_s) # (batch_size, d3, h1*h2*...*hn) Wh_plus_Us = K.permute_dimensions(Wh_plus_Us, (0, 2, 1)) if verbose: print('Wh+Us>', K.int_shape(Wh_plus_Us)) """ softmax(va.tanh(S.Wa + hj.Ua)) """ # (1, batch_size, h1*h2*...*hn) = (1, d3) . (batch_size, d3, h1*h2*...*hn) Wh_plus_Us_dot_Va = K.dot(self.V_a, Wh_plus_Us) # (batch_size, h1*h2*...*hn) e_i = K.squeeze(Wh_plus_Us_dot_Va, 0) e_i = K.softmax(e_i) if verbose: print('ei>', K.int_shape(e_i)) # (batch_size, h1*h2*...*hn) return e_i, states
def energy_step(inputs, states): """ Step function for computing energy for a single decoder state inputs: (batchsize * 1 * de_in_dim) states: (batchsize * 1 * de_latent_dim) """ """ Some parameters required for shaping tensors""" en_seq_len, en_hidden = encoder_out_seq.shape[ 1], encoder_out_seq.shape[2] de_hidden = inputs.shape[-1] """ Computing S.Wa where S=[s0, s1, ..., si]""" # <= batch size * en_seq_len * latent_dim W_a_dot_s = K.dot(encoder_out_seq, self.W_a) """ Computing hj.Ua """ U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1) # <= batch_size, 1, latent_dim """ tanh(S.Wa + hj.Ua) """ # <= batch_size*en_seq_len, latent_dim Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h) """ softmax(va.tanh(S.Wa + hj.Ua)) """ # <= batch_size, en_seq_len e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1) # <= batch_size, en_seq_len e_i = K.softmax(e_i) return e_i, [e_i]
def energy_step(decode_outs, states):
    # decode_outs: (batch, dim)
    # encoder_out_seq: [N, 30, 512], where 30 is the string length
    en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]  # 30, 512
    de_hidden = decode_outs.shape[-1]
    # W * h_j
    reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))  # [b,64,512] => [b*64,512]
    # W_a [512x512], reshaped_enc_outputs [b*64,512] => [b*64,512] => [b,64,512]
    W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                          (-1, en_seq_len, en_hidden))
    # U * S_t-1, decode_outs [b,512], U_a [512,512] => [b,512] => [b,1,512]
    U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a), axis=1)  # <= batch_size, 1, latent_dim
    # The subtle part: broadcasting effectively copies the decoder output across
    # all 64 time steps and adds it to the encoder outputs [64, 512].
    # tanh ( W * h_j + U * S_t-1 + b ), [b,64,512] => [b*64,512]
    reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
    # V * tanh ( W * h_j + U * S_t-1 + b ), [b*64,512]*[512,1] => [b*64,1] => [b,64]
    e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
    e_i = K.softmax(e_i)
    return e_i, [e_i]
def call(self, inputs, prev_projection, states, training=None):
    prev_output = states[0]
    dp_mask = self.get_dropout_mask_for_cell(inputs, training)
    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(prev_output, training)
    if dp_mask is not None:
        inputs = inputs * dp_mask
    output = K.dot(inputs, self.kernel)
    if self.use_recurrent:
        if rec_dp_mask is not None:
            prev_output = prev_output * rec_dp_mask
        output += K.dot(prev_output, self.recurrent_kernel)
    if self.use_feedback:
        if self.projection_activation is not None:
            prev_projection = self.projection_activation(prev_projection)
        output += K.dot(prev_projection, self.feedback_kernel)
    if self.bias is not None:
        output = K.bias_add(output, self.bias)
    if self.activation is not None:
        output = self.activation(output)
    projection = K.dot(output, self.projection_kernel)
    if self.projection_bias is not None:
        projection = K.bias_add(projection, self.projection_bias)
    return output, projection, [output]
def generator(self, src_enc):
    G_h = K.bias_add(K.dot(src_enc, self.G_w1), self.G_b1)
    G_h_relu = tf.nn.relu(G_h)
    G_log_prob = K.bias_add(K.dot(G_h_relu, self.G_w2), self.G_b2)
    G_prob = tf.nn.sigmoid(G_log_prob)
    return G_prob
def call(self, inputs, training=None):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')

    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        # According to the paper, we only need to run power iteration one time.
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    W_shape = self.embeddings.shape.as_list()
    # Flatten the tensor
    W_reshaped = K.reshape(self.embeddings, [-1, W_shape[-1]])
    _u, _v = power_iteration(W_reshaped, self.u)
    # Calculate sigma
    sigma = K.dot(_v, W_reshaped)
    sigma = K.dot(sigma, K.transpose(_u))
    # Normalize it
    W_bar = W_reshaped / sigma
    # Reshape weight tensor
    if training in {0, False}:
        W_bar = K.reshape(W_bar, W_shape)
    else:
        with tf.control_dependencies([self.u.assign(_u)]):
            W_bar = K.reshape(W_bar, W_shape)
    self.embeddings = W_bar
    out = K.gather(self.embeddings, inputs)
    return out
def call(self, inputs, training=None):
    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    if self.spectral_normalization:
        W_shape = self.kernel.shape.as_list()
        # Flatten the tensor
        W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
        _u, _v = power_iteration(W_reshaped, self.u)
        # Calculate sigma
        sigma = K.dot(_v, W_reshaped)
        sigma = K.dot(sigma, K.transpose(_u))
        # Normalize it
        W_bar = W_reshaped / sigma
        # Reshape weight tensor
        if training in {0, False}:
            W_bar = K.reshape(W_bar, W_shape)
        else:
            with tf.control_dependencies([self.u.assign(_u)]):
                W_bar = K.reshape(W_bar, W_shape)
        # Update the weight
        self.kernel = W_bar

    if self.rank == 1:
        outputs = K.conv1d(inputs,
                           self.kernel,
                           strides=self.strides[0],
                           padding=self.padding,
                           data_format=self.data_format,
                           dilation_rate=self.dilation_rate[0])
    if self.rank == 2:
        outputs = K.conv2d(inputs,
                           self.kernel,
                           strides=self.strides,
                           padding=self.padding,
                           data_format=self.data_format,
                           dilation_rate=self.dilation_rate)
    if self.rank == 3:
        outputs = K.conv3d(inputs,
                           self.kernel,
                           strides=self.strides,
                           padding=self.padding,
                           data_format=self.data_format,
                           dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
    if self.activation is not None:
        return self.activation(outputs)
    return outputs
def power_iteration(self, u, W):
    '''
    According to the paper, we only need to run power iteration one time.
    '''
    v = self._l2normalize(K.dot(u, K.transpose(W)))
    u = self._l2normalize(K.dot(v, W))
    return u, v
def call(self, inputs): """ Args: (query, context) -> query: a Tensor with shape [batch_size, query_length, channels] context: a Tensor with shape [batch_size, context_length, channels] Returns: similarity: a Tensor with shape [batch_size, context_length, query_length] """ query, context = inputs if self.dropout: query = self.dropout(query) context = self.dropout(context) # context_weighted -> Tensor with shape [batch_size, context_length, 1] context_weighted = K.dot(context, self.context_weights) # query_weighted -> Tensor with shape [batch_size, 1, query_length] query_weighted = tf.transpose( K.dot(query, self.query_weights), (0, 2, 1)) # weighted_context_query -> Tensor with shape [batch_size, context_length, query_length] weighted_context_query = tf.matmul( K.dot(context, self.dot_weights), query, transpose_b=True) similarity = weighted_context_query + context_weighted + query_weighted return similarity
def call(self, inputs, **kwargs):
    if K.ndim(inputs[0]) != 3:
        raise ValueError(
            "Unexpected inputs dimensions %d, expect to be 3 dimensions"
            % (K.ndim(inputs[0])))
    embeds_vec_list = inputs
    row = []
    col = []
    num_inputs = len(embeds_vec_list)
    # enumerate all feature pairs (i, j) with i < j
    for i in range(num_inputs - 1):
        for j in range(i + 1, num_inputs):
            row.append(i)
            col.append(j)
    p = concatenate([embeds_vec_list[idx] for idx in row], axis=1)  # batch, num_pairs, k
    q = concatenate([embeds_vec_list[idx] for idx in col], axis=1)  # Reshape([num_pairs, self.embedding_size])
    inner_product = p * q
    bi_interaction = inner_product
    attention_temp = Dense(self.attention_factor, 'relu',
                           kernel_regularizer=l2(self.l2_reg_w))(bi_interaction)
    attention_weight = softmax(K.dot(attention_temp, self.projection_h), axis=1)
    attention_output = K.sum(attention_weight * bi_interaction, axis=1)
    attention_output = tf.nn.dropout(attention_output, self.keep_prob, seed=1024)
    # Dropout(1-self.keep_prob)(attention_output)
    afm_out = K.dot(attention_output, self.projection_p)
    return afm_out
def step_gru(cell_inputs, cell_state, kernel, recurrent_kernel, input_bias,
             recurrent_bias):
    """Step function that will be used by Keras RNN backend."""
    h_tm1 = cell_state

    # inputs projected by all gate matrices at once
    matrix_x = K.dot(cell_inputs, kernel)
    matrix_x = K.bias_add(matrix_x, input_bias)
    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)

    # hidden state projected by all gate matrices at once
    matrix_inner = K.dot(h_tm1, recurrent_kernel)
    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3, axis=1)

    z = nn.sigmoid(x_z + recurrent_z)
    r = nn.sigmoid(x_r + recurrent_r)
    hh = nn.tanh(x_h + r * recurrent_h)

    # previous and candidate state mixed by update gate
    h = z * h_tm1 + (1 - z) * hh
    return h, [h]
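# A standalone NumPy sketch (illustrative shapes, biases omitted) of the fused
# GRU step above: both projections pack the z, r and h gates side by side, so
# one matmul plus a split replaces three separate gate projections.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

units, batch, in_dim = 4, 2, 3
x = np.random.randn(batch, in_dim)
h_tm1 = np.random.randn(batch, units)
kernel = np.random.randn(in_dim, 3 * units)           # packed [z | r | h]
recurrent_kernel = np.random.randn(units, 3 * units)  # packed [z | r | h]

x_z, x_r, x_h = np.split(x @ kernel, 3, axis=1)
r_z, r_r, r_h = np.split(h_tm1 @ recurrent_kernel, 3, axis=1)
z = sigmoid(x_z + r_z)                                # update gate
r = sigmoid(x_r + r_r)                                # reset gate
hh = np.tanh(x_h + r * r_h)                           # candidate state
h = z * h_tm1 + (1 - z) * hh                          # matches the step above
print(h.shape)                                        # (2, 4)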
def call(self, inputs, states):
    prev_output = states[0]
    h = K.dot(inputs, self.kernel)
    output = h + K.dot(prev_output, self.recurrent_kernel)
    activation = activations.get(self.activation)
    output = activation(output)
    return output, [output]
def call(self, inputs, mask=None):
    # output = softmax(score)
    k, q = inputs
    if len(q.shape) == 2:
        q = K.expand_dims(q, axis=1)
    # k: (?, K_LEN, EMBED_DIM,)
    # q: (?, Q_LEN, EMBED_DIM,)
    # score: (?, Q_LEN, K_LEN,)
    if self.score_function == 'scaled_dot_product':
        kt = K.permute_dimensions(k, (0, 2, 1))
        qkt = K.batch_dot(q, kt)
        score = qkt / self.EMBED_DIM
    elif self.score_function == 'mlp':
        kq = K.concatenate([k, q], axis=1)
        kqw2 = K.tanh(K.dot(kq, self.W2))
        score = K.permute_dimensions(K.dot(self.W1, kqw2), (1, 0, 2))
    elif self.score_function == 'bi_linear':
        qw = K.dot(q, self.W)
        kt = K.permute_dimensions(k, (0, 2, 1))
        score = K.batch_dot(qw, kt)
    else:
        raise RuntimeError('invalid score_function')
    score = K.softmax(score)
    # if mask is not None:
    #     score *= K.cast(mask[0], K.floatx())
    # output: (?, Q_LEN, EMBED_DIM,)
    output = K.batch_dot(score, k)
    return output
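# A minimal NumPy sketch (illustrative sizes only) of the 'scaled_dot_product'
# branch above: score = softmax(Q.K^T / EMBED_DIM), output = score.K.
# Note that, as in the layer, the keys double as the values.
import numpy as np

batch, k_len, q_len, embed_dim = 2, 6, 3, 8
k = np.random.randn(batch, k_len, embed_dim)
q = np.random.randn(batch, q_len, embed_dim)

qkt = q @ np.transpose(k, (0, 2, 1)) / embed_dim               # (batch, q_len, k_len)
score = np.exp(qkt) / np.exp(qkt).sum(axis=-1, keepdims=True)  # softmax over keys
output = score @ k                                             # (batch, q_len, embed_dim)
print(output.shape)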
def call(self, inputs):
    X = inputs[0]  # Node features (N x F)
    A = inputs[1]  # Adjacency matrix (N x N)
    outputs = []
    for head in range(self.attn_heads):
        kernel = self.kernels[head]  # W in the paper (F x F')
        attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (N x F')

        # Compute feature combinations
        # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
        attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

        # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
        dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting

        # Add nonlinearity
        dense = LeakyReLU(alpha=0.2)(dense)

        # Mask values before activation (Vaswani et al., 2017)
        mask = -10e9 * (1.0 - A)
        dense += mask

        # Apply softmax to get attention coefficients
        dense = K.softmax(dense)  # (N x N)

        # Apply dropout to features and attention coefficients
        dropout_attn = Dropout(self.dropout_rate)(dense)     # (N x N)
        dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

        # Linear combination with neighbors' features
        node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        if self.attn_heads_reduction == "concat":
            # If "concat", compute the activation here (Eq. 5)
            node_features = self.activation(node_features)

        # Add output of attention head to final output
        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        output = K.concatenate(outputs)  # (N x KF')
    else:
        output = K.mean(K.stack(outputs), axis=0)  # (N x F')
        output = self.activation(output)
    return output
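# A compact NumPy sketch (single head, toy sizes, random weights) of the GAT
# attention above: e_ij = LeakyReLU(a_1.Wh_i + a_2.Wh_j), masked by the
# adjacency matrix, softmax-normalized, then used to mix neighbours' features.
import numpy as np

N, F, F_out = 4, 3, 2
X = np.random.randn(N, F)
A = np.array([[1, 1, 0, 0],
              [1, 1, 1, 0],
              [0, 1, 1, 1],
              [0, 0, 1, 1]], dtype=float)          # adjacency with self-loops
W = np.random.randn(F, F_out)
a1 = np.random.randn(F_out, 1)
a2 = np.random.randn(F_out, 1)

Wh = X @ W                                         # (N, F_out)
dense = Wh @ a1 + (Wh @ a2).T                      # (N, N) via broadcasting
dense = np.where(dense > 0, dense, 0.2 * dense)    # LeakyReLU(alpha=0.2)
dense += -10e9 * (1.0 - A)                         # mask non-edges
attn = np.exp(dense) / np.exp(dense).sum(axis=1, keepdims=True)
out = attn @ Wh                                    # (N, F_out)
print(out.shape)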
def call(self, inputs):
    X = inputs[0]  # Node features (B x N x F)
    A = inputs[1]  # Adjacency matrix (B x N x N)
    X_dims = X.get_shape().as_list()
    B, N, F = X_dims
    outputs = []
    attentions = []
    for head in range(self.attn_heads):
        # W in the paper (F x F')
        kernel = self.kernels[head]

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (B x N x F')
        dropout_feat = Dropout(self.dropout_rate)(features)  # (B x N x F')

        neighbor_kernel = self.neighbor_kernels[head]
        attn_kernel = self.attn_kernels[head]

        neighbor_features = K.dot(X, neighbor_kernel)
        dropout_neighbor = Dropout(self.dropout_rate)(neighbor_features)

        merged = tf.matmul(K.dot(dropout_feat, attn_kernel),
                           tf.transpose(dropout_neighbor, (0, 2, 1)))
        attention = tf.nn.tanh(merged)
        attention = K.reshape(attention, (-1, N, N))

        mask = -10e9 * (1.0 - A)
        attention += mask
        attention = tf.nn.softmax(attention)
        dropout_attn = Dropout(self.dropout_rate)(attention)

        node_features = tf.matmul(dropout_attn, dropout_feat)

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        if self.return_attention:
            attentions.append(attention)
        # Add output of attention head to final output
        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        output = K.concatenate(outputs, axis=-1)  # (B x N x KF')
    else:
        output = K.mean(K.stack(outputs), axis=0)  # (B x N x F')
        # If "average", compute the activation here (Eq. 6)
        output = self.activation(output)

    if self.return_attention:
        attentions = K.stack(attentions, axis=1)
        return (output, attentions)
    else:
        return output
def call(self, inputs, mask=None, training=None):
    inputs, relatives, memories, bias_context, bias_relative = inputs
    full = K.concatenate([memories, inputs], axis=1)  # (batch, prev_len + seq_len, units)
    w_q = K.dot(inputs, self.kernel_q)                # (batch, seq_len, units)
    w_kv = K.dot(full, self.kernel_kv)                # (batch, prev_len + seq_len, units * 2)
    w_r = K.dot(relatives, self.kernel_r)             # (batch, prev_len + seq_len, units)
    if self.use_bias:
        w_q = K.bias_add(w_q, self.bias_q)
        w_kv = K.bias_add(w_kv, self.bias_kv)
        w_r = K.bias_add(w_r, self.bias_r)
    if self.activation is not None:
        w_q = self.activation(w_q)
        w_kv = self.activation(w_kv)
        w_r = self.activation(w_r)
    w_k = w_kv[:, :, :self.units]                     # (batch, prev_len + seq_len, units)
    w_v = w_kv[:, :, self.units:]                     # (batch, prev_len + seq_len, units)

    w_qc = K.bias_add(w_q, bias_context)
    w_qc = self._reshape_to_batches(w_qc)             # (batch * n_head, seq_len, units_head)
    w_k = self._reshape_to_batches(w_k)               # (batch * n_head, prev_len + seq_len, units_head)
    a_context = K.batch_dot(w_qc, w_k, axes=2)        # (batch * n_head, seq_len, prev_len + seq_len)

    w_qr = K.bias_add(w_q, bias_relative)
    w_qr = self._reshape_to_batches(w_qr)             # (batch * n_head, seq_len, units_head)
    w_r = self._reshape_to_batches(w_r)               # (batch * n_head, prev_len + seq_len, units_head)
    a_relative = K.batch_dot(w_qr, w_r, axes=2)       # (batch * n_head, seq_len, prev_len + seq_len)
    a_relative = self._relative_shift(a_relative)     # (batch * n_head, seq_len, prev_len + seq_len)

    att = (a_context + a_relative) / K.sqrt(K.constant(self.units_head, dtype=K.floatx()))
    exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

    q_len, k_len = K.shape(w_q)[1], K.shape(w_k)[1]
    indices = K.expand_dims(K.arange(0, k_len), axis=0)
    upper = K.expand_dims(K.arange(k_len - q_len, k_len), axis=-1)
    exp *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)

    if mask is not None and mask[0] is not None:
        mask = K.cast(mask[0], K.floatx())
        mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask], axis=1)
        exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

    att = exp / K.sum(exp, axis=-1, keepdims=True)
    if self.att_drop_layer is not None:
        att = self.att_drop_layer(att, training=training)
    w_v = self._reshape_to_batches(w_v)               # (batch * n_head, prev_len + seq_len, units_head)
    w_o = K.batch_dot(att, w_v)                       # (batch * n_head, seq_len, units_head)

    w_o = self._reshape_from_batches(w_o)             # (batch, seq_len, units)
    w_o = K.dot(w_o, self.kernel_o)                   # (batch, seq_len, units)
    if self.use_bias:
        w_o = K.bias_add(w_o, self.bias_o)
    if self.activation is not None:
        w_o = self.activation(w_o)

    # Add shape information to tensor when using `tf.keras`
    input_shape = K.int_shape(inputs)
    if input_shape[1] is not None:
        w_o = K.reshape(w_o, (-1,) + input_shape[1:])
    return w_o
def call(self, x, mask=None):
    energy = self.activation(K.dot(x, self.W0) + self.b0)
    # energy = self.activation(K.dot(energy, self.W) + self.b)
    energy = K.dot(energy, self.W) + self.b
    energy = K.reshape(energy, (-1, self.input_length))
    energy = K.softmax(energy)
    xx = K.batch_dot(energy, x, axes=(1, 1))
    output = K.concatenate([xx, energy])
    return output
def power_iteration(W, u, rounds=1):
    '''
    According to the paper, we only need to run power iteration one time.
    '''
    _u = u
    for i in range(rounds):
        _v = _l2normalizer(K.dot(_u, W))
        _u = _l2normalizer(K.dot(_v, K.transpose(W)))
    W_sn = K.sum(K.dot(_u, W) * _v)
    return W_sn, _u, _v
def call(self, inputs, **kwargs):
    W = K.tanh(self.W_hat) * K.sigmoid(self.M_hat)
    a = K.dot(inputs, W)
    if self.nac_only:
        outputs = a
    else:
        m = K.exp(K.dot(K.log(K.abs(inputs) + self.epsilon), W))
        g = K.sigmoid(K.dot(inputs, self.G))
        outputs = g * a + (1. - g) * m
    return outputs
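# A small NumPy sketch (toy hand-picked weights, not a trained layer) of the
# NAC/NALU math above: the additive path a = x.W handles addition/subtraction,
# the log-space path m = exp(log(|x| + eps).W) handles multiplication, and a
# sigmoid gate g mixes the two.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.array([[3.0, 4.0]])
W_hat = np.full((2, 1), 10.0)    # tanh(10) ~ 1 and sigmoid(10) ~ 1, so W ~ [[1], [1]]
M_hat = np.full((2, 1), 10.0)
G = np.zeros((2, 1))             # sigmoid(0) = 0.5: equal mix of both paths
eps = 1e-7

W = np.tanh(W_hat) * sigmoid(M_hat)
a = x @ W                                        # ~ 3 + 4 = 7
m = np.exp(np.log(np.abs(x) + eps) @ W)          # ~ 3 * 4 = 12
g = sigmoid(x @ G)
print(a, m, g * a + (1 - g) * m)                 # ~7, ~12, ~9.5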
def step(cell_inputs, cell_states): """Step function that will be used by Keras RNN backend.""" h_tm1 = cell_states[0] # previous memory state c_tm1 = cell_states[1] # previous carry state z = K.dot(cell_inputs, kernel) z += K.dot(h_tm1, recurrent_kernel) z = K.bias_add(z, bias) z0, z1, z2, z3 = array_ops.split(z, 4, axis=1) i = recurrent_activation(z0) f = recurrent_activation(z1) c = f * c_tm1 + i * activation(z2) o = recurrent_activation(z3) h = o * activation(c) return h, [h, c]
def step(cell_inputs, cell_states):
    h_tm1 = cell_states[0]  # previous memory state
    c_tm1 = cell_states[1]  # previous carry state

    # Only use the second half of the bias weights.
    _, real_bias = array_ops.split(bias, 2)

    z = K.dot(cell_inputs, kernel)
    z += K.dot(h_tm1, recurrent_kernel)
    z = K.bias_add(z, real_bias)

    z0 = z[:, :units]
    z1 = z[:, units:2 * units]
    z2 = z[:, 2 * units:3 * units]
    z3 = z[:, 3 * units:]

    i = recurrent_activation(z0)
    f = recurrent_activation(z1)
    c = f * c_tm1 + i * activation(z2)
    o = recurrent_activation(z3)

    h = o * activation(c)
    return h, [h, c]
def step(cell_inputs, cell_states): """Step function that will be used by Keras RNN backend.""" h_tm1 = cell_states[0] # inputs projected by all gate matrices at once matrix_x = K.dot(cell_inputs, kernel) matrix_x = K.bias_add(matrix_x, input_bias) x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1) # hidden state projected by all gate matrices at once matrix_inner = K.dot(h_tm1, recurrent_kernel) matrix_inner = K.bias_add(matrix_inner, recurrent_bias) recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3, axis=1) z = recurrent_activation(x_z + recurrent_z) r = recurrent_activation(x_r + recurrent_r) hh = activation(x_h + r * recurrent_h) # previous and candidate state mixed by update gate h = z * h_tm1 + (1 - z) * hh return h, [h]