def normalize_func(mean_batch, variance_batch):
    mean_batch = K.reshape(mean_batch, broadcast_shape)
    variance_batch = K.reshape(variance_batch, broadcast_shape)

    mean_weights = K.softmax(self.mean_weights, axis=0)
    variance_weights = K.softmax(self.variance_weights, axis=0)

    mean = (mean_weights[0] * mean_instance +
            mean_weights[1] * mean_layer +
            mean_weights[2] * mean_batch)

    variance = (variance_weights[0] * variance_instance +
                variance_weights[1] * variance_layer +
                variance_weights[2] * variance_batch)

    outputs = (inputs - mean) / (K.sqrt(variance + self.epsilon))

    if self.scale:
        broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
        outputs = outputs * broadcast_gamma

    if self.center:
        broadcast_beta = K.reshape(self.beta, broadcast_shape)
        outputs = outputs + broadcast_beta

    return outputs
def call(self, x, mask=None):
    features_dim = self.features_dim
    step_dim = self.step_dim

    #xw = K.reshape(K.dot(x[0], K.reshape(self.W, (features_dim, features_dim))), (-1, features_dim))
    #yavg=K.reshape(K.mean(K.mean(x[1], axis=1, keepdims=True),axis=0, keepdims=True), (features_dim,-1))

    xw1 = K.dot(x[0], K.reshape(self.W1, (features_dim, features_dim)))
    xw2 = K.dot(x[1], K.reshape(self.W2, (features_dim, features_dim)))
    xw1t = K.permute_dimensions(xw1, [0, 2, 1])
    xw2t = K.permute_dimensions(xw2, [0, 2, 1])

    xw11 = K.batch_dot(xw1, xw1t) / (step_dim ** 0.5)
    xw12 = K.batch_dot(xw1, xw2t) / (step_dim ** 0.5)

    s11 = self.ll * K.softmax(xw11)
    s12 = (1 - self.ll) * K.softmax(xw12)
    eij = s11 + s12
    print(eij.get_shape())

    V = x[0] * K.mean(eij, axis=2, keepdims=True)

    if self.get_alpha:
        return eij
    else:
        if self.get_sequence:
            return V
        else:
            return K.sum(V, axis=1)
def modified_kd_targets_from_logits(train_logits, test_logits, temp=1):
    # create soft targets from loaded logits
    if temp <= 0:
        temp = 1
    train_logits_t = train_logits / temp
    test_logits_t = test_logits / temp
    Y_train_soft = K.softmax(train_logits_t)
    Y_test_soft = K.softmax(test_logits_t)
    sess = K.get_session()
    Y_train_soft = sess.run(Y_train_soft)
    Y_test_soft = sess.run(Y_test_soft)
    return Y_train_soft, Y_test_soft
def modified_kd_targets_from_logits(Y_train, Y_test, train_logits, test_logits, temp):
    # create soft targets from loaded logits
    train_logits_t = train_logits / temp
    test_logits_t = test_logits / temp
    Y_train_soft = K.softmax(train_logits_t)
    Y_test_soft = K.softmax(test_logits_t)
    sess = K.get_session()
    Y_train_soft = sess.run(Y_train_soft)
    Y_test_soft = sess.run(Y_test_soft)
    # concatenate hard and soft targets to create the knowledge distillation targets
    Y_train_new = np.concatenate([Y_train, Y_train_soft], axis=1)
    Y_test_new = np.concatenate([Y_test, Y_test_soft], axis=1)
    return Y_train_new, Y_test_new
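# The two knowledge-distillation helpers above build temperature-scaled soft targets from
# teacher logits via a TF1-style session. Below is a minimal NumPy sketch of the same
# computation; the teacher_logits values are made up purely for illustration.
import numpy as np

def soft_targets(logits, temp=2.0):
    """Temperature-scaled softmax over the last axis (NumPy sketch)."""
    z = logits / temp
    z = z - z.max(axis=-1, keepdims=True)  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

# Hypothetical teacher logits: 3 samples, 4 classes.
teacher_logits = np.array([[4.0, 1.0, 0.5, 0.2],
                           [0.1, 3.5, 0.3, 0.1],
                           [0.2, 0.4, 2.0, 1.9]])
print(soft_targets(teacher_logits, temp=2.0))  # each row sums to 1 and is flatter than at temp=1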
def call(self, inputs, mask=None):
    if mask is not None:
        adder = (math_ops.cast(mask, inputs.dtype)) * (
            _large_compatible_negative(inputs.dtype))
        inputs += adder
    if isinstance(self.axis, (tuple, list)):
        if len(self.axis) > 1:
            return math_ops.exp(inputs - math_ops.reduce_logsumexp(
                inputs, axis=self.axis, keepdims=True))
        else:
            return K.softmax(inputs, axis=self.axis[0])
    return K.softmax(inputs, axis=self.axis)
def energy_step(inputs, states):
    assert_msg = "States must be a list. However states {} is of type {}".format(
        states, type(states))
    assert isinstance(states, list) or isinstance(states, tuple), assert_msg

    en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
    de_hidden = inputs.shape[-1]

    reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
    W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                          (-1, en_seq_len, en_hidden))
    if verbose:
        print('wa.s > ', W_a_dot_s.shape)

    U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # (batch_size, 1, latent_dim)
    if verbose:
        print('Ua.h > ', U_a_dot_h.shape)

    reshaped_Ws_plus_Uh = K.tanh(
        K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
    if verbose:
        print('Ws+Uh > ', reshaped_Ws_plus_Uh.shape)

    e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
    e_i = K.softmax(e_i)
    if verbose:
        print('ei > ', e_i.shape)

    return e_i, [e_i]
def energy_step(inputs, states): """ Step function for computing energy for a single decoder state inputs: (batchsize * 1 * de_in_dim) states: (batchsize * 1 * de_latent_dim) """ """ Some parameters required for shaping tensors""" en_seq_len, en_hidden = encoder_out_seq.shape[ 1], encoder_out_seq.shape[2] de_hidden = inputs.shape[-1] """ Computing S.Wa where S=[s0, s1, ..., si]""" # <= batch size * en_seq_len * latent_dim W_a_dot_s = K.dot(encoder_out_seq, self.W_a) """ Computing hj.Ua """ U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1) # <= batch_size, 1, latent_dim """ tanh(S.Wa + hj.Ua) """ # <= batch_size*en_seq_len, latent_dim Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h) """ softmax(va.tanh(S.Wa + hj.Ua)) """ # <= batch_size, en_seq_len e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1) # <= batch_size, en_seq_len e_i = K.softmax(e_i) return e_i, [e_i]
def simple_context(X, mask): """ Simple context calculation layer logic X = (batch_size, time_steps, units) time_steps are nothing but number of words in our case. """ # segregrate heading and desc desc, head = X[:, :parameters.max_len_desc, :], X[:, parameters.max_len_desc:, :] # segregrate activation and context part head_activations, head_words = head[:, :, :parameters.activation_rnn_size], head[:, :, parameters.activation_rnn_size:] desc_activations, desc_words = desc[:, :, :parameters.activation_rnn_size], desc[:, :, parameters.activation_rnn_size:] # p=(bacth_size, length_desc_words, rnn_units) # q=(bacth_size, length_headline_words, rnn_units) # K.dot(p,q) = (bacth_size, length_desc_words,length_headline_words) activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2)) # make sure we dont use description words that are masked out activation_energies = activation_energies + -1e20 * K.expand_dims(1. - K.cast(mask[:, :parameters.max_len_desc], 'float32'), 1) # for every head word compute weights for every desc word activation_energies = K.reshape(activation_energies, (-1, parameters.max_len_desc)) activation_weights = K.softmax(activation_energies) activation_weights = K.reshape(activation_weights, (-1, parameters.max_len_head, parameters.max_len_desc)) # for every head word compute weighted average of desc words desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1)) return K.concatenate((desc_avg_word, head_words))
def energy_step(decode_outs, states):
    # decode_outs: (batch, dim)
    # decoder_seq: [N, 30, 512], where 30 is the string length
    en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]  # 30, 512
    de_hidden = decode_outs.shape[-1]

    # W * h_j
    reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))  # [b,64,512] => [b*64,512]
    # W_a[512x512], reshaped_enc_outputs[b*64,512] => [b*64,512] => [b,64,512]
    W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                          (-1, en_seq_len, en_hidden))

    # U * S_t-1, decode_outs[b,512], U_a[512,512] => [b,512] => [b,1,512]
    U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a), axis=1)  # <= batch_size, 1, latent_dim

    # The subtle part: the decoder output is effectively replicated across the 64 time steps
    # and added to the encoder outputs [64, 512].
    # tanh(W * h_j + U * S_t-1 + b), [b,64,512] => [b*64,512]
    reshaped_Ws_plus_Uh = K.tanh(
        K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))

    # V * tanh(W * h_j + U * S_t-1 + b), [b*64,512] * [512,1] => [b*64,1] => [b,64]
    e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
    e_i = K.softmax(e_i)

    return e_i, [e_i]
def call(self, inputs, mask=None):
    # output = softmax(score)
    k, q = inputs
    if len(q.shape) == 2:
        q = K.expand_dims(q, axis=1)
    # k: (?, K_LEN, EMBED_DIM,)
    # q: (?, Q_LEN, EMBED_DIM,)
    # score: (?, Q_LEN, K_LEN,)
    if self.score_function == 'scaled_dot_product':
        kt = K.permute_dimensions(k, (0, 2, 1))
        qkt = K.batch_dot(q, kt)
        score = qkt / self.EMBED_DIM
    elif self.score_function == 'mlp':
        kq = K.concatenate([k, q], axis=1)
        kqw2 = K.tanh(K.dot(kq, self.W2))
        score = K.permute_dimensions(K.dot(self.W1, kqw2), (1, 0, 2))
    elif self.score_function == 'bi_linear':
        qw = K.dot(q, self.W)
        kt = K.permute_dimensions(k, (0, 2, 1))
        score = K.batch_dot(qw, kt)
    else:
        raise RuntimeError('invalid score_function')
    score = K.softmax(score)
    # if mask is not None:
    #     score *= K.cast(mask[0], K.floatx())
    # output: (?, Q_LEN, EMBED_DIM,)
    output = K.batch_dot(score, k)
    return output
def call(self, inputs, **kwargs):
    inputs = inputs if isinstance(inputs, list) else [inputs]
    if len(inputs) < 1 or len(inputs) > 2:
        raise ValueError("AttentionLayer expect one or two inputs.")

    actual_input = inputs[0]
    mask = inputs[1] if len(inputs) > 1 else None
    if mask is not None and not (
            ((len(mask.shape) == 3 and mask.shape[2] == 1) or len(mask.shape) == 2)
            and mask.shape[1] == self.input_length):
        raise ValueError(
            "`mask` should be of shape (batch, input_length) or (batch, input_length, 1) "
            "when calling an AttentionLayer.")

    assert actual_input.shape[-1] == self.attention_param.shape[0]

    # (batch, input_length, input_dim) * (input_dim, 1) ==> (batch, input_length, 1)
    attention_weights = K.dot(actual_input, self.attention_param)

    if mask is not None:
        if len(mask.shape) == 2:
            mask = K.expand_dims(mask, axis=2)  # (batch, input_length, 1)
        mask = K.log(mask)
        attention_weights += mask

    attention_weights = K.softmax(attention_weights, axis=1)  # (batch, input_length, 1)
    result = K.sum(actual_input * attention_weights,
                   axis=1)  # (batch, input_length) [multiplication uses broadcast]
    return result, attention_weights
def energy_step(inputs, states): """ Step function for computing energy for a single decoder state """ # input: (batch_size, latent_dim) assert_msg = "States must be a list. However states {} is of type {}".format( states, type(states)) assert isinstance(states, list) or isinstance(states, tuple), assert_msg """ Computing sj.Ua """ # (batch_size, 1, d3) U_a_dot_s = K.expand_dims(K.dot(inputs, self.U_a), 1) if verbose: print('Ua.h>', K.int_shape(U_a_dot_s)) """ tanh(h.Wa + s.Ua) """ # (batch_size, h1*h2*...*hn, d3) = (batch_size, h1*h2*...*hn, d3) + (batch_size, 1, d3) Wh_plus_Us = K.tanh(W_hi + U_a_dot_s) # (batch_size, d3, h1*h2*...*hn) Wh_plus_Us = K.permute_dimensions(Wh_plus_Us, (0, 2, 1)) if verbose: print('Wh+Us>', K.int_shape(Wh_plus_Us)) """ softmax(va.tanh(S.Wa + hj.Ua)) """ # (1, batch_size, h1*h2*...*hn) = (1, d3) . (batch_size, d3, h1*h2*...*hn) Wh_plus_Us_dot_Va = K.dot(self.V_a, Wh_plus_Us) # (batch_size, h1*h2*...*hn) e_i = K.squeeze(Wh_plus_Us_dot_Va, 0) e_i = K.softmax(e_i) if verbose: print('ei>', K.int_shape(e_i)) # (batch_size, h1*h2*...*hn) return e_i, states
def simple_context(X, mask):
    desc, head = X[:, :parameters.max_len_desc, :], X[:, parameters.max_len_desc:, :]
    head_activations, head_words = head[:, :, :parameters.activation_rnn_size], head[:, :, parameters.activation_rnn_size:]
    desc_activations, desc_words = desc[:, :, :parameters.activation_rnn_size], desc[:, :, parameters.activation_rnn_size:]

    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))
    activation_energies = activation_energies + -1e20 * K.expand_dims(
        1. - K.cast(mask[:, :parameters.max_len_desc], 'float32'), 1)

    activation_energies = K.reshape(activation_energies, (-1, parameters.max_len_desc))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights,
                                   (-1, parameters.max_len_head, parameters.max_len_desc))

    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
    return K.concatenate((desc_avg_word, head_words))
def call(self, inputs):
    X = inputs[0]  # Node features (N x F)
    A = inputs[1]  # Adjacency matrix (N x N)

    outputs = []
    for head in range(self.attn_heads):
        kernel = self.kernels[head]  # W in the paper (F x F')
        attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (N x F')

        # Compute feature combinations
        # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_2]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
        attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

        # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
        dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting

        # Add nonlinearity
        dense = LeakyReLU(alpha=0.2)(dense)

        # Mask values before activation (Vaswani et al., 2017)
        mask = -10e9 * (1.0 - A)
        dense += mask

        # Apply softmax to get attention coefficients
        dense = K.softmax(dense)  # (N x N)

        # Apply dropout to features and attention coefficients
        dropout_attn = Dropout(self.dropout_rate)(dense)     # (N x N)
        dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

        # Linear combination with neighbors' features
        node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        if self.attn_heads_reduction == "concat":
            # If "concat", compute the activation here (Eq. 5)
            node_features = self.activation(node_features)

        # Add output of attention head to final output
        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        output = K.concatenate(outputs)  # (N x KF')
    else:
        output = K.mean(K.stack(outputs), axis=0)  # (N x F')
        output = self.activation(output)

    return output
def call(self, x, mask=None):
    energy = self.activation(K.dot(x, self.W0) + self.b0)
    #energy=self.activation(K.dot(energy, self.W) + self.b)
    energy = K.dot(energy, self.W) + self.b
    energy = K.reshape(energy, (-1, self.input_length))
    energy = K.softmax(energy)
    xx = K.batch_dot(energy, x, axes=(1, 1))
    all = K.concatenate([xx, energy])
    return all
def call(self, inputs, mask=None):
    x = K.permute_dimensions(inputs, (0, 2, 1))
    a = K.softmax(K.tanh(K.dot(x, self.W)))
    a = K.permute_dimensions(a, (0, 2, 1))
    outputs = a * inputs
    outputs = K.sum(outputs, axis=1)
    return outputs
def call(self, inputs, mask=None):
    if mask is not None:
        # Since mask is 1.0 for positions we want to keep and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -1e9 for masked positions.
        adder = (1.0 - math_ops.cast(mask, inputs.dtype)) * (
            _large_compatible_negative(inputs.dtype))
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        inputs += adder
    if isinstance(self.axis, (tuple, list)):
        if len(self.axis) > 1:
            return math_ops.exp(inputs - math_ops.reduce_logsumexp(
                inputs, axis=self.axis, keepdims=True))
        else:
            return K.softmax(inputs, axis=self.axis[0])
    return K.softmax(inputs, axis=self.axis)
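# A standalone sketch of the masking trick used above, written with plain TensorFlow ops so it
# runs outside the layer; the score and mask values are illustrative only.
import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5, -1.0]])
mask = tf.constant([[1.0, 1.0, 0.0, 0.0]])  # 1 = attend, 0 = masked

# Add a large negative number at masked positions so the softmax assigns them ~0 weight.
adder = (1.0 - mask) * -1e9
weights = tf.nn.softmax(scores + adder, axis=-1)
print(weights)  # masked positions receive (near-)zero probability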
def softmax(x, axis=1):
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
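# A quick usage sketch for the helper above (assuming the tf.keras backend is imported as K),
# applying it along axis 1 of a 3-D tensor and checking that the weights sum to one:
import numpy as np
from tensorflow.keras import backend as K

x = K.constant(np.random.randn(2, 5, 3))  # (batch, steps, features)
attn = softmax(x, axis=1)                 # normalize across the 5 steps
print(K.eval(K.sum(attn, axis=1)))        # every entry is ~1.0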
def call(self, inputs, training=None, mask=None):
    q = inputs[0]
    k = inputs[1]
    v = inputs[2]
    qkTensor = K.math_ops.matmul(q, k, transpose_b=True)
    scaleTensor = K.math_ops.multiply(
        K.stop_gradient(1. / K.math_ops.sqrt(self.dk)), qkTensor)
    softMaxTensor = K.softmax(scaleTensor)
    drT = self.dropout(softMaxTensor, training=training)
    vTensor = K.math_ops.matmul(drT, v)
    return vTensor
def loss(y_true, y_pred):
    loss_val = -1 * K.sum(
        K.log(K.softmax(y_pred[:, :-1])) * y_true[:, :-1], axis=-1)
    return K.mean(
        K.switch(
            K.equal(task, 1005),
            loss_weights[task] * loss_val,
            K.switch(K.equal(y_true[:, -1], task), loss_val,
                     loss_weights[task] * loss_val)))
def call(self, inputs, mask=None):
    # inputs.shape = (batch_size, time_steps, seq_len)
    x = K.permute_dimensions(inputs, (0, 2, 1))
    # x.shape = (batch_size, seq_len, time_steps)
    # general
    a = K.softmax(K.tanh(K.dot(x, self.W)))
    a = K.permute_dimensions(a, (0, 2, 1))
    outputs = a * inputs
    outputs = K.sum(outputs, axis=1)
    return outputs
def call(self, inputs, **kwargs):
    query, values, keys = inputs
    hidden_with_time_axis = K.expand_dims(query, 1)
    score = self.attention_variable(
        K.tanh(keys + self.query_layer(hidden_with_time_axis)))
    # TODO Mask option for score with infinity
    alignment = K.softmax(score, axis=1)
    attention = alignment * values
    alignment = K.squeeze(alignment, axis=2)
    attention = K.sum(attention, axis=1)
    return attention, alignment
def convert_logits_to_soft_targets(temp, teacher_train_logits, teacher_test_logits, Y_train, Y_test):
    # softmax at raised temperature
    train_logits_T = teacher_train_logits / temp
    test_logits_T = teacher_test_logits / temp
    Y_train_soft = K.softmax(train_logits_T)
    Y_test_soft = K.softmax(test_logits_T)
    sess = K.get_session()
    Y_train_soft = sess.run(Y_train_soft)
    Y_test_soft = sess.run(Y_test_soft)

    # # TODO remove if negative test feedback!
    # Y_train_soft, Y_test_soft = normalizeStudentSoftTargets(Y_train_soft, Y_test_soft)
    # for i in range(0, len(Y_train_soft)):
    #     Y_train_soft[i] = (1 / find_largest_value(Y_train_soft[i])) * Y_train_soft[i]
    # for i in range(0, len(Y_test_soft)):
    #     Y_test_soft[i] = (1 / find_largest_value(Y_test_soft[i])) * Y_test_soft[i]

    # Concatenate so that this becomes a (num_classes + num_classes) dimensional vector
    Y_train_new = np.concatenate([Y_train, Y_train_soft], axis=1)
    Y_test_new = np.concatenate([Y_test, Y_test_soft], axis=1)
    return Y_train_new, Y_test_new
def FCN(input_shape):
    vgg16_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)
    #Sq_net = squeezenet(float(input_shape))

    fire8 = extract_layer_from_model(vgg16_model, layer_name='block4_pool')
    pool8 = MaxPooling2D((3, 3), strides=(2, 2), name='pool8')(fire8.output)

    fc1 = Conv2D(64, (6, 6), strides=(1, 1), padding='same', name='fc1')(pool8)
    fc1 = Dropout(rate=0.5)(fc1)

    if SEPERATE_CONFIDENCE:
        fc2 = Conv2D(4, (1, 1), strides=(1, 1), padding='same', activation='relu', name='fc2')(fc1)
        rgb = K.l2_normalize(fc2[:, :, :, 0:3], axis=3)
        w, h = map(int, fc2.get_shape()[1:3])
        confidence = fc2[:, :, :, 3:4]
        # reshape with the backend (not np.reshape), since confidence is a symbolic tensor
        confidence = K.reshape(confidence, [-1, w * h])
        confidence = K.softmax(confidence)
        confidence = K.reshape(confidence, [-1, w, h, 1])
        fc2 = rgb * confidence
    else:
        fc2 = Conv2D(3, (1, 1), strides=(1, 1), padding='same', name='fc2')(fc1)
        fc2 = Activation('relu')(fc2)

    fc2 = Conv2D(3, (15, 15), padding='valid', name='fc_pooling')(fc2)

    def norm(fc2):
        fc2_norm = K.l2_normalize(fc2, axis=3)
        illum_est = K.tf.reduce_sum(fc2_norm, axis=(1, 2))
        illum_est = K.l2_normalize(illum_est)
        return illum_est

    #illum_est = Dense(3)(fc2)
    illum_est = Lambda(norm)(fc2)

    FCN_model = Model(inputs=vgg16_model.input, outputs=illum_est, name='FC4')
    return FCN_model
def selfattoptions(args):
    q = args[0]
    k = args[1]
    v = args[2]

    q = tf.expand_dims(q, -1)
    k = tf.expand_dims(k, -1)
    v = tf.expand_dims(v, -1)

    QK = K.batch_dot(q, K.permute_dimensions(k, [0, 2, 1]))
    QK = QK / (20 ** 0.5)
    QK = K.softmax(QK)

    MV = K.batch_dot(QK, v)
    MV = tf.squeeze(MV, -1)
    return MV
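# A hedged sketch of wiring selfattoptions into a Lambda layer; the 20-unit width is chosen
# to match the hard-coded 20 ** 0.5 scale above, and tf / K are assumed to be in scope as in
# the snippet.
import numpy as np
from tensorflow.keras import layers, Model

dim = 20
q_in = layers.Input(shape=(dim,))
k_in = layers.Input(shape=(dim,))
v_in = layers.Input(shape=(dim,))
out = layers.Lambda(selfattoptions)([q_in, k_in, v_in])
model = Model([q_in, k_in, v_in], out)

q = np.random.rand(4, dim).astype("float32")
print(model.predict([q, q, q]).shape)  # (4, 20)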
def energy_step(inputs, states): """ Step function for computing energy for a single decoder state """ assert_msg = "States must be a list. However states {} is of type {}".format( states, type(states)) assert isinstance(states, list) or isinstance(states, tuple), assert_msg """ Some parameters required for shaping tensors""" en_seq_len, en_hidden = encoder_out_seq.shape[ 1], encoder_out_seq.shape[2] de_hidden = inputs.shape[-1] """ Computing S.Wa where S=[s0, s1, ..., si]""" # <= batch_size*en_seq_len, latent_dim reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden)) # <= batch_size*en_seq_len, latent_dim W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden)) if verbose: print('wa.s>', W_a_dot_s.shape) """ Computing hj.Ua """ U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1) # <= batch_size, 1, latent_dim if verbose: print('Ua.h >', U_a_dot_h.shape) print('U_a >', self.U_a.shape) print('inputs.shape >', inputs.shape) """ tanh(S.Wa + hj.Ua) """ # <= batch_size*en_seq_len, latent_dim reshaped_Ws_plus_Uh = K.tanh( K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden))) if verbose: print('Ws+Uh>', reshaped_Ws_plus_Uh.shape) """ softmax(va.tanh(S.Wa + hj.Ua)) """ # <= batch_size, en_seq_len e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, K.tanh(self.V_a)), (-1, en_seq_len)) # <= batch_size, en_seq_len e_i = K.softmax(e_i) if verbose: print('ei>', e_i.shape) K.print_tensor(reshaped_Ws_plus_Uh, message='reshaped_Ws_plus_Uh') K.print_tensor(self.V_a, message='V_a') K.print_tensor(e_i, message='e_i') return e_i, [e_i]
def _body(i, logits, activations):
    """Routing while loop."""
    # route: [batch, input_dim, output_dim, ...]
    # route = tf.nn.softmax(logits, dim=-1)
    route = K.softmax(logits)
    preactivate_unrolled = route * votes_trans
    preact_trans = tf.transpose(preactivate_unrolled, r_t_shape)
    preactivate = tf.reduce_sum(preact_trans, axis=1) + biases
    activation = _squash(preactivate)
    activations = activations.write(i, activation)
    act_3d = K.expand_dims(activation, 1)
    tile_shape = np.ones(num_dims, dtype=np.int32).tolist()
    tile_shape[1] = input_dim
    act_replicated = tf.tile(act_3d, tile_shape)
    distances = tf.reduce_sum(votes * act_replicated, axis=-1)
    logits += distances
    return (i + 1, logits, activations)
def call(self, x):
    row = []
    col = []
    # build pairwise combinations of the feature embeddings
    for r, c in combinations(x, 2):  # field * (field - 1) / 2 pairs
        row.append(r)
        col.append(c)

    p = K.concatenate(row, axis=1)  # [batch_size, field * (field - 1) / 2, embedding_size]
    q = K.concatenate(col, axis=1)
    inner_product = p * q  # element-wise product

    # add non-linearity via the activation
    # [batch_size, field * (field - 1) / 2, embedding_size] * [embedding_size, attention_units]
    #   => [batch_size, field * (field - 1) / 2, attention_units]
    attention_tmp = K.relu(
        K.bias_add(K.dot(inner_product, self.attention_W), self.attention_b))

    # context vector
    attention_tmp_dot = K.dot(attention_tmp, self.projection_h)  # [batch_size, field * (field - 1) / 2, 1]

    # softmax is taken per sample over all of its feature interactions, equivalent to
    # K.exp(attention_tmp_dot) / K.sum(K.exp(attention_tmp_dot), axis=1, keepdims=True)
    attention_weight = K.softmax(attention_tmp_dot, axis=1)  # [batch_size, field * (field - 1) / 2, 1]

    # weight the inner products by the attention scores
    attention_output = K.sum(inner_product * attention_weight, axis=1)  # [batch_size, embedding_size]

    # apply dropout
    attention_output = K.dropout(attention_output, self.dropout_rate)  # [batch_size, embedding_size]

    # equivalent to a Dense layer
    afm_out = K.dot(attention_output, self.projection_p)  # [batch_size, 1]
    return afm_out
def call(self, inputs, mask=None):
    '''
    :param inputs: a list of tensor of length not larger than 2, or a memory tensor of size BxTXD1.
        If a list, the first entry is memory, and the second one is query tensor of size BxD2 if any
    :param mask: the masking entry will be directly discarded
    :return: a tensor of size BxD1, weighted summing along the sequence dimension
    '''
    if isinstance(inputs, list) and len(inputs) == 2:
        memory, query = inputs
        if self.method is None:
            return memory[:, -1, :]
        elif self.method == 'cba':
            hidden = K.dot(memory, self.Wh) + K.expand_dims(K.dot(query, self.Wq), 1)
            hidden = K.tanh(hidden)
            s = K.squeeze(K.dot(hidden, self.v), -1)
        elif self.method == 'ga':
            s = K.sum(K.expand_dims(K.dot(query, self.Wq), 1) * memory, axis=-1)
        else:
            s = K.squeeze(K.dot(memory, self.v), -1)
        if mask is not None:
            mask = mask[0]
    else:
        if isinstance(inputs, list):
            if len(inputs) != 1:
                raise ValueError('inputs length should not be larger than 2')
            memory = inputs[0]
        else:
            memory = inputs
        if self.method is None:
            return memory[:, -1, :]
        elif self.method == 'cba':
            hidden = K.dot(memory, self.Wh)
            hidden = K.tanh(hidden)
            s = K.squeeze(K.dot(hidden, self.v), -1)
        elif self.method == 'ga':
            raise ValueError('general attention needs the second input')
        else:
            s = K.squeeze(K.dot(memory, self.v), -1)

    s = K.softmax(s)
    if mask is not None:
        s *= K.cast(mask, dtype='float32')
        sum_by_time = K.sum(s, axis=-1, keepdims=True)
        s = s / (sum_by_time + K.epsilon())
    return K.sum(memory * K.expand_dims(s), axis=1)
def call(self, x, mask=None):
    '''
    i_emb:    [Batch_size, Hidden_units]
    hist_emb: [Batch_size, max_len, Hidden_units]
    hist_len: [Batch_size]
    '''
    assert len(x) == 3

    i_emb, hist_emb, hist_len = x[0], x[1], x[2]
    hidden_units = K.int_shape(hist_emb)[-1]
    max_len = tf.shape(hist_emb)[1]

    i_emb = tf.tile(i_emb, [1, max_len])  # (batch_size, max_len * hidden_units)
    i_emb = tf.reshape(i_emb, [-1, max_len, hidden_units])  # (batch_size, max_len, hidden_units)

    concat = K.concatenate([i_emb, hist_emb, i_emb - hist_emb, i_emb * hist_emb],
                           axis=2)  # (batch_size, max_len, hidden_units * 4)

    for i in range(len(self.attention_hidden_units)):
        activation = None if i == 2 else self.attention_activation
        outputs = keras.layers.Dense(self.attention_hidden_units[i],
                                     activation=activation)(concat)
        concat = outputs

    outputs = tf.reshape(outputs, [-1, 1, max_len])  # (batch_size, 1, max_len)

    if self.supports_masking:
        mask = tf.sequence_mask(hist_len, max_len)  # (batch_size, 1, max_len)
        padding = tf.ones_like(outputs) * (-1e12)
        outputs = tf.where(mask, outputs, padding)

    # scale the attention scores before the softmax
    outputs = outputs / (hidden_units ** 0.5)
    outputs = K.softmax(outputs)

    outputs = tf.matmul(outputs, hist_emb)  # (batch_size, 1, hidden_units)
    outputs = tf.squeeze(outputs)  # (batch_size, hidden_units)

    return outputs
def call(self, inputs):
    return K.softmax(inputs, axis=self.axis)
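# The call above simply defers to K.softmax along a configurable axis; the built-in
# tf.keras.layers.Softmax layer behaves the same way. A minimal sketch:
import numpy as np
from tensorflow.keras.layers import Softmax

layer = Softmax(axis=-1)
x = np.array([[1.0, 2.0, 3.0]], dtype="float32")
print(layer(x).numpy())  # approx. [[0.09, 0.245, 0.665]]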