def loss(y_true, y_pred):
    # `old_logits`, `new_logits`, `old_classes`, `temp`, `L`, `sparselogloss`
    # and `logloss` are captured from the enclosing scope.
    y_soft = K.softmax(old_logits / temp)
    logits_pred = new_logits[:, :old_classes]
    y_pred_soft = K.softmax(logits_pred / temp)
    return sparselogloss(y_true, y_pred) + L * logloss(y_soft, y_pred_soft)
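# The loss above combines a hard-label term with a distillation term on the old classes.
# Below is a minimal, hypothetical sketch of how such a loss might be built as a closure
# (assumed names: `old_logits`, `old_classes`, `temp`, `distill_weight`; the original's
# `sparselogloss`/`logloss` helpers are replaced here with Keras backend losses):
def make_distillation_loss(old_logits, old_classes, temp=2.0, distill_weight=1.0):
    y_soft = K.softmax(old_logits / temp)  # softened targets from the old model
    def loss(y_true, new_logits):
        y_pred_soft = K.softmax(new_logits[:, :old_classes] / temp)
        hard = K.sparse_categorical_crossentropy(y_true, new_logits, from_logits=True)
        soft = K.categorical_crossentropy(y_soft, y_pred_soft)
        return hard + distill_weight * soft
    return loss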
def call(self, x):
    f = K.conv2d(x, kernel=self.kernel_f, strides=(1, 1), padding='same')  # [bs, h, w, c']
    g = K.conv2d(x, kernel=self.kernel_g, strides=(1, 1), padding='same')  # [bs, h, w, c']
    h = K.conv2d(x, kernel=self.kernel_h, strides=(1, 1), padding='same')  # [bs, h, w, c']

    f_ = K.permute_dimensions(self._hw_flatten(f), (0, 2, 1))  # [bs, 3c', N]
    s = K.batch_dot(self._hw_flatten(g), f_)  # [bs, N, N]
    beta = K.softmax(s, axis=-1)  # attention map

    double_attn = K.batch_dot(f_, self._hw_flatten(x))  # [bs, 3c', 3c]
    double_attn = K.softmax(double_attn, axis=1)

    h_tmp, shape_tmp = self._hw_flatten(h, return_shape=True)  # [bs, N, 3c']
    o_tmp = K.batch_dot(beta, h_tmp)  # [bs, N, 3c']
    o = K.batch_dot(o_tmp, double_attn)  # [bs, N, 3c]
    o = self._hw_recover(o, shape_tmp)  # [bs, h, w, C]

    x = self.gamma * o + x
    return x
def __call__(self, x):
    regularization = 0.
    if self.l1:
        regularization += self.l1 * K.sum(K.softmax(x))
    if self.l2:
        regularization += self.l2 * K.sum(K.square(K.softmax(x)))
    return regularization
def normalize_func(mean_batch, variance_batch):
    mean_batch = K.reshape(mean_batch, broadcast_shape)
    variance_batch = K.reshape(variance_batch, broadcast_shape)

    mean_weights = K.softmax(self.mean_weights, axis=0)
    variance_weights = K.softmax(self.variance_weights, axis=0)

    mean = (mean_weights[0] * mean_instance +
            mean_weights[1] * mean_layer +
            mean_weights[2] * mean_batch)
    variance = (variance_weights[0] * variance_instance +
                variance_weights[1] * variance_layer +
                variance_weights[2] * variance_batch)

    outputs = (inputs - mean) / (K.sqrt(variance + self.epsilon))

    if self.scale:
        broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
        outputs = outputs * broadcast_gamma
    if self.center:
        broadcast_beta = K.reshape(self.beta, broadcast_shape)
        outputs = outputs + broadcast_beta
    return outputs
def call(self, x):
    # Input is a 3-D or 4-D Tensor
    ndim = K.ndim(x)
    if ndim == 4:
        dims = K.int_shape(x)
        x = K.reshape(x, (-1, dims[1] * dims[2], 1, self.D))
    elif ndim != 3:
        raise ValueError('Encoding input should have shape BxNxD or BxHxWxD')

    # Residual vectors
    R = x - self.codes
    '''
    OLD WAY
    _x_i = K.repeat_elements(x, self.K, 1)
    _c_k = K.tile(self.codes, (n, 1))
    R = K.reshape(_x_i - _c_k, (-1, n, self.K, self.D))
    '''

    # Assignment weights, optional dropout
    if self.dropout_rate is not None:
        W_ik = K.softmax(scaledL2(R, K.dropout(self.scale, self.dropout_rate)))
    else:
        W_ik = K.softmax(scaledL2(R, self.scale))

    # Aggregation
    E = tf.einsum('bik,bikd->bkd', W_ik, R)

    # Normalize encoding vectors
    if self.l2_normalize:
        E = tf.nn.l2_normalize(E, axis=-1)

    E = tf.layers.Flatten()(E)
    return E
def fit(self, X, Y=None, val_X=None, val_Y=None, num_epochs=300, batch_size=None,
        start_temp=10.0, min_temp=0.1, tryout_limit=1, class_weight=None):
    if Y is None:
        Y = X
    assert len(X) == len(Y)

    validation_data = None
    if val_X is not None and val_Y is not None:
        assert len(val_X) == len(val_Y)
        validation_data = (val_X, val_Y)

    if batch_size is None:
        batch_size = max(len(X) // 256, 16)
    steps_per_epoch = (len(X) + batch_size - 1) // batch_size

    for i in range(tryout_limit):
        K.set_learning_phase(1)

        inputs = layers.Input(shape=X.shape[1:])

        alpha = np.exp(np.log(min_temp / start_temp) / (num_epochs * steps_per_epoch))
        self.concrete_select = ConcreteSelect(self.K, start_temp, min_temp, alpha,
                                              name='concrete_select')
        selected_features = self.concrete_select(inputs)
        outputs = self.output_function(selected_features)

        self.model = models.Model(inputs, outputs)
        # `loss_function`, `optimizer_class` and `initial_lr` are assumed to be
        # defined in the enclosing scope.
        self.model.compile(
            loss=LinearSVC.loss_function(loss_function, class_weight),
            optimizer=optimizer_class(lr=initial_lr),
            metrics=[LinearSVC.accuracy]
        )
        print(self.model.summary())

        stopper_callback = StopperCallback()
        hist = self.model.fit(X, Y, batch_size, num_epochs, verbose=0,
                              callbacks=[stopper_callback],
                              validation_data=validation_data)  # , validation_freq=10

        if K.get_value(K.mean(K.max(K.softmax(self.concrete_select.logits, axis=-1)))) >= stopper_callback.mean_max_target:
            break

        num_epochs *= 2

    self.probabilities = K.get_value(K.softmax(self.model.get_layer('concrete_select').logits))
    self.indices = K.get_value(K.argmax(self.model.get_layer('concrete_select').logits))

    return self
def gumbel_softmax(x, tau, from_logits=False, straight_through=False):
    # ref: https://arxiv.org/abs/1611.01144
    eps = 1e-20
    u = K.random_uniform(K.shape(x), eps, 1 - eps)
    if not from_logits:
        x = K.log(K.maximum(eps, x))
    y = x - K.log(-K.log(u))
    if tau > 0:
        if straight_through:
            return combine_value_gradient(hardmax(y), K.softmax(y / tau, axis=-1))
        else:
            return K.softmax(y / tau, axis=-1)
    else:
        return hardmax(y)
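# A small, hypothetical usage sketch of `gumbel_softmax` above (assuming the Keras
# backend `K` and the `hardmax`/`combine_value_gradient` helpers are in scope):
# draw a differentiable sample from 4-way categorical logits at temperature 0.5,
# and a hard straight-through sample whose gradient flows through the soft relaxation.
logits = K.constant([[2.0, 0.5, -1.0, 0.1]])
soft_sample = gumbel_softmax(logits, tau=0.5, from_logits=True)
hard_sample = gumbel_softmax(logits, tau=0.5, from_logits=True, straight_through=True)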
def MultiHeadAttention(l=8 * 8, d=512, dv=64, dim_out=512, nv=8):
    """
    Args:
        l: number of blocks in the feature map
        d: dimension of each block
        dv: dimension of the linear subspace each block is projected into
        dim_out: output dimension
        nv: number of projections (heads) per block
    """
    value_vector_1 = Input(shape=(l, d))
    query_vector_1 = Input(shape=(l, d))
    key_vector_1 = Input(shape=(l, d))

    value_vector_2 = Dense(dv * nv, activation="relu")(value_vector_1)
    query_vector_2 = Dense(dv * nv, activation="relu")(query_vector_1)
    key_vector_2 = Dense(dv * nv, activation="relu")(key_vector_1)

    value = Reshape([l, nv, dv])(value_vector_2)
    query = Reshape([l, nv, dv])(query_vector_2)
    key = Reshape([l, nv, dv])(key_vector_2)

    attention = tf.einsum('baik,baij->bakj', query, key) / np.sqrt(dv)
    attention = Lambda(lambda x: K.softmax(x), output_shape=(l, nv, nv))(attention)
    output = tf.einsum('bajk,baik->baji', attention, value)
    output = Reshape([l, d])(output)
    output = Add()([output, query_vector_1])
    output = Dense(dim_out, activation='relu')(output)

    return Model(inputs=[query_vector_1, key_vector_1, value_vector_1], outputs=output)
def test_step(inp, tar):
    outputs = 0
    # Average softmax predictions over multiple stochastic forward passes.
    for j in range(args.eva_iter):
        current_batch = net(inp)
        outputs = outputs + K.softmax(current_batch, axis=1)
    outputs = outputs / args.eva_iter
    test_accuracy(tar, outputs)
def call(self, inputs, **kwargs):
    inputs = inputs if isinstance(inputs, list) else [inputs]
    if len(inputs) < 1 or len(inputs) > 2:
        raise ValueError("AttentionLayerWithBatchNormalization expects one or two inputs.")

    actual_input = inputs[0]
    mask = inputs[1] if len(inputs) > 1 else None
    if mask is not None and not (((len(mask.shape) == 3 and mask.shape[2] == 1)
                                  or len(mask.shape) == 2)
                                 and mask.shape[1] == self.input_length):
        raise ValueError("`mask` should be of shape (batch, input_length) or (batch, input_length, 1) "
                         "when calling an AttentionLayerWithBatchNormalization.")

    assert actual_input.shape[-1] == self.attention_param.shape[0]

    # (batch, input_length, input_dim) * (input_dim, 1) ==> (batch, input_length, 1)
    attention_weights = K.dot(actual_input, self.attention_param)

    if mask is not None:
        if len(mask.shape) == 2:
            mask = K.expand_dims(mask, axis=2)  # (batch, input_length, 1)
        mask = K.log(mask)
        attention_weights += mask

    # batch normalization
    attention_weights = BatchNormalization()(attention_weights)
    attention_weights = K.softmax(attention_weights, axis=1)  # (batch, input_length, 1)

    result = K.sum(actual_input * attention_weights, axis=1)  # (batch, input_dim) [multiplication uses broadcast]
    return result
def step(self, x, states):
    # x.shape = (1, 512, 30, 40)
    # states: list of tensors with shape (1, 512, 30, 40)
    h_tm1 = states[0]
    c_tm1 = states[1]

    # Attention energies, e.shape = (1, 1, 30, 40)
    e = self.V_a(K.tanh(self.W_a(h_tm1) + self.U_a(x)))

    # New version, a.shape = (1, 1, 30, 40)
    a = K.reshape(K.softmax(K.batch_flatten(e)), (x.shape[0], 1, x.shape[2], x.shape[3]))
    # a = K.reshape(K.softmax(K.batch_flatten(e)), (x_shape[0], 1, x_shape[2], x_shape[3]))

    # New version, x_tilde.shape = (1, 512, 30, 40)
    x_tilde = x * K.repeat_elements(a, x.shape[1], 1)
    # x_tilde = x * K.repeat_elements(a, x_shape[1], 1)

    x_i = self.W_i(x_tilde)
    x_f = self.W_f(x_tilde)
    x_c = self.W_c(x_tilde)
    x_o = self.W_o(x_tilde)

    # Standard LSTM gate equations on the attention-weighted input
    i = self.inner_activation(x_i + self.U_i(h_tm1))
    f = self.inner_activation(x_f + self.U_f(h_tm1))
    c = f * c_tm1 + i * self.activation(x_c + self.U_c(h_tm1))
    o = self.inner_activation(x_o + self.U_o(h_tm1))
    h = o * self.activation(c)

    return h, [h, c]
def kl_divergence(self, other):
    self._check_other(other)
    p_self = K.softmax(self.logits)
    logp_self = log_softmax_tf(self.logits)
    logp_other = log_softmax_tf(other.logits)
    kl_div = tf.einsum('ij,ij->i', p_self, logp_self - logp_other)
    return self._rename(kl_div, 'kl_divergence')
def step(self, x, states):
    h = states[0]
    # states[1] necessary?

    # comes from the constants
    X_static = states[-2]
    # equals K.dot(static_x, self._W1) + self._b2 with X.shape = [bs, L, static_input_dim]
    total_x_static_prod = states[-1]

    # expand dims to add the vector which is only valid for this time step
    # to total_x_prod which is valid for all time steps
    hw = K.expand_dims(K.dot(h, self._W2), 1)
    additive_atn = total_x_static_prod + hw
    attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
    static_x_weighted = K.sum(attention * X_static, [1])

    x = K.dot(K.concatenate([x, static_x_weighted], 1), self._W3) + self._b3

    h, new_states = self.layer.cell.call(x, states[:-2])

    # append attention to the states to "smuggle" it out of the RNN wrapper
    attention = K.squeeze(attention, -1)
    h = K.concatenate([h, attention])
    return h, new_states
def get_monitor_value(self, logs):
    monitor_value = K.get_value(
        K.mean(
            K.max(
                K.softmax(self.model.get_layer('concrete_select').logits),
                axis=-1)))
    return monitor_value
def call(self, inputs):
    if self._masking:
        assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
        queries, keys, values, masks = inputs
    else:
        assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
        queries, keys, values = inputs

    if K.dtype(queries) != 'float32':
        queries = K.cast(queries, 'float32')
    if K.dtype(keys) != 'float32':
        keys = K.cast(keys, 'float32')
    if K.dtype(values) != 'float32':
        values = K.cast(values, 'float32')

    matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # MatMul
    scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5        # Scale
    if self._masking:
        scaled_matmul = self.mask(scaled_matmul, masks)           # Mask (opt.)
    if self._future:
        scaled_matmul = self.future_mask(scaled_matmul)

    softmax_out = K.softmax(scaled_matmul)  # SoftMax

    # Dropout
    out = K.dropout(softmax_out, self._dropout_rate)

    outputs = K.batch_dot(out, values)
    return outputs
def MultiHeadsAttModel(l=8 * 8, d=512, dv=64, dout=512, nv=8):
    v1 = Input(shape=(l, d))
    q1 = Input(shape=(l, d))
    k1 = Input(shape=(l, d))

    v2 = Dense(dv * nv, activation="relu")(v1)
    q2 = Dense(dv * nv, activation="relu")(q1)
    k2 = Dense(dv * nv, activation="relu")(k1)

    v = Reshape([l, nv, dv])(v2)
    q = Reshape([l, nv, dv])(q2)
    k = Reshape([l, nv, dv])(k2)

    att = tf.einsum('baik,baij->bakj', q, k) / np.sqrt(dv)
    # att = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[-1, -1]) / np.sqrt(dv),
    #              output_shape=(l, nv, nv))([q, k])  # l, nv, nv
    att = Lambda(lambda x: K.softmax(x), output_shape=(l, nv, nv))(att)

    out = tf.einsum('bajk,baik->baji', att, v)
    # out = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]), output_shape=(l, nv, dv))([att, v])
    out = Reshape([l, d])(out)
    out = Add()([out, q1])
    out = Dense(dout, activation="relu")(out)

    return Model(inputs=[q1, k1, v1], outputs=out)
def call(self, x):
    Q_seq, K_seq, V_seq = x
    Q_len, V_len = None, None
    print("build attention")

    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, "add")
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, "mul")
    return O_seq
def masked_softmax(vector, mask):
    """
    `K.softmax(vector)` does not work if some elements of `vector` should be
    masked. This performs a softmax on just the non-masked portions of
    `vector` (passing None in for the mask is also acceptable; you'll just get
    a regular softmax).

    We assume that both `vector` and `mask` (if given) have shape
    (batch_size, vector_dim).

    In the case that the input vector is completely masked, this function
    returns an array of ``0.0``. This behavior may cause ``NaN`` if this is
    used as the last layer of a model that uses categorical cross-entropy loss.
    """
    # We calculate masked softmax in a numerically stable fashion, as done in
    # https://github.com/rkadlec/asreader/blob/master/asreader/custombricks/softmax_mask_bricks.py
    if mask is not None:
        # Here we get normalized log probabilities for enhanced numerical stability.
        mask = K.cast(mask, "float32")
        input_masked = mask * vector
        shifted = mask * (input_masked - K.max(input_masked, axis=1, keepdims=True))
        # We add epsilon to avoid numerical instability when the sum in the log yields 0.
        normalization_constant = K.log(
            K.sum(mask * K.exp(shifted), axis=1, keepdims=True) + K.epsilon())
        normalized_log_probabilities = mask * (shifted - normalization_constant)
        unmasked_probabilities = K.exp(normalized_log_probabilities)
        return switch(mask, unmasked_probabilities, K.zeros_like(unmasked_probabilities))
    else:
        # There is no mask, so we use the provided ``K.softmax`` function.
        return K.softmax(vector)
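# A small sketch of how `masked_softmax` behaves (hypothetical values; assumes `K` and
# the `switch` helper used above are in scope): positions where the mask is 0 receive
# probability 0, and the unmasked entries renormalize to sum to 1.
scores = K.constant([[1.0, 2.0, 3.0, 4.0]])
mask = K.constant([[1.0, 1.0, 0.0, 0.0]])
probs = masked_softmax(scores, mask)  # approximately [[0.27, 0.73, 0.0, 0.0]]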
def yolo2_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert final layer features to bounding box parameters."""
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    grid_shape = K.shape(feats)[1:3]  # height, width
    grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
                    [1, grid_shape[1], 1, 1])
    grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
                    [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])

    # Adjust predictions to each spatial grid point and anchor size.
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
    # box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    if calc_loss:
        return grid, feats, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs
def call(self, x, mask=None):
    q, k, v = x
    d_k = q.shape.as_list()[2]

    # in pure tensorflow:
    # weights = tf.matmul(x_batch, tf.transpose(y_batch, perm=[0, 2, 1]))
    # normalized_weights = tf.nn.softmax(weights / scaling)
    # output = tf.matmul(normalized_weights, x_batch)

    weights = K.batch_dot(q, k, axes=[2, 2])

    if mask is not None:
        # add mask weights
        if isinstance(mask, (list, tuple)):
            if len(mask) > 1:
                raise ValueError(
                    "mask can only be a Tensor or a list of length 1 containing a tensor.")
            mask = mask[0]
        weights += -1e10 * (1 - mask)

    normalized_weights = K.softmax(weights / np.sqrt(d_k))
    output = K.batch_dot(normalized_weights, v)

    if self._return_attention:
        return [output, normalized_weights]
    else:
        return output
def call(self, inputs, **kwargs): """Following the routing algorithm from Hinton's paper, but replace b = b + <u,v> with b = <u,v>. This change can improve the feature representation of the capsule. However, you can replace b = K.batch_dot(outputs, hat_inputs, [2, 3]) with b += K.batch_dot(outputs, hat_inputs, [2, 3]) to get standard routing. """ if self.share_weights: hat_inputs = K.conv1d(inputs, self.kernel) else: hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1]) batch_size = K.shape(inputs)[0] input_num_capsule = K.shape(inputs)[1] hat_inputs = K.reshape(hat_inputs, (batch_size, input_num_capsule, self.num_capsule, self.dim_capsule)) hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3)) b = K.zeros_like(hat_inputs[:, :, :, 0]) print(self.routings) for i in range(self.routings): c = K.softmax(b, 1) o = self.activation(K.batch_dot(c, hat_inputs, [2, 2])) if i < self.routings - 1: b = K.batch_dot(o, hat_inputs, [2, 3]) if K.backend() == 'theano': o = K.sum(o, axis=1) return o
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
def accuracy_mod(y_true, y_pred):
    # Squeeze the shape to (None,) from (None, 1) so we can apply operations directly on y_true
    if K.ndim(y_true) == K.ndim(y_pred):
        y_true = K.squeeze(y_true, -1)

    # Normalize the y_pred values first, then take the argmax as the predicted label
    y_pred = K.softmax(y_pred, axis=-1)
    y_pred = K.argmax(y_pred, axis=-1)

    # The ground-truth labels can contain -1s, which should not count towards
    # accuracy, so filter them out
    defa = K.constant([0], dtype=tf.float32)
    # Boolean tensor marking labels greater than or equal to 0
    is_valid = K.greater_equal(y_true, defa)
    # Get the corresponding indices
    indices = tf.where(is_valid)

    # Gather y_true and y_pred at the indices calculated above
    fil_y_true = K.gather(y_true, K.reshape(indices, [-1]))
    fil_y_pred = K.gather(y_pred, K.reshape(indices, [-1]))

    fil_y_true = K.cast(fil_y_true, K.floatx())
    fil_y_pred = K.cast(fil_y_pred, K.floatx())

    return K.cast(K.equal(fil_y_true, fil_y_pred), K.floatx())
def call(self, inputs, **kwargs):
    assert isinstance(inputs, list) and len(inputs) == 3
    first, second, features = inputs[0], inputs[1], inputs[2]
    if not self.from_logits:
        first = K.clip(first, 1e-10, 1.0)
        second = K.clip(second, 1e-10, 1.0)
        first_, second_ = K.log(first), K.log(second)
    else:
        first_, second_ = first, second

    # embedded_features.shape = (M, T, 1)
    if self.use_intermediate_layer:
        features = K.dot(features, self.first_kernel)
        features = K.bias_add(features, self.first_bias, data_format="channels_last")
        features = self.intermediate_activation(features)
    embedded_features = K.dot(features, self.features_kernel)
    embedded_features = K.bias_add(embedded_features, self.features_bias,
                                   data_format="channels_last")
    if self.use_dimension_bias:
        tiling_shape = [1] * (K.ndim(first) - 1) + [K.shape(first)[-1]]
        embedded_features = K.tile(embedded_features, tiling_shape)
        embedded_features = K.bias_add(embedded_features, self.dimensions_bias,
                                       data_format="channels_last")
    sigma = K.sigmoid(embedded_features)

    result = weighted_sum(first_, second_, sigma,
                          self.first_threshold, self.second_threshold)
    probs = K.softmax(result)
    if self.return_logits:
        return [probs, result]
    return probs
def attention(x_inner, x_outer, n_factor, dropout):
    # Note: the `dropout` argument is currently unused.
    x_Q = L.Conv1D(n_factor, 1, activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_inner)
    x_K = L.Conv1D(n_factor, 1, activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_outer)
    x_V = L.Conv1D(n_factor, 1, activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_outer)
    x_KT = L.Permute((2, 1))(x_K)
    res = L.Lambda(lambda c: K.batch_dot(c[0], c[1]) / np.sqrt(n_factor))([x_Q, x_KT])
    # res = tf.expand_dims(res, axis=3)
    # res = L.Conv2D(16, 3, 1, padding="same", activation="relu")(res)
    # res = L.Conv2D(1, 3, 1, padding="same", activation="relu")(res)
    # res = tf.squeeze(res, axis=3)
    att = L.Lambda(lambda c: K.softmax(c, axis=-1))(res)
    att = L.Lambda(lambda c: K.batch_dot(c[0], c[1]))([att, x_V])
    return att
def call(self, x):
    # If only Q_seq, K_seq, V_seq are passed in, no masking is applied.
    # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padding
    # beyond those lengths is masked.
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x

    # Linear projections of Q, K and V
    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.head_dim))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.head_dim))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.head_dim))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Scaled dot product, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.head_dim ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Compute the output and apply the query mask
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq
def energy_step(decode_outs, states):
    # decode_outs: (batch, dim)
    decode_outs = _p(decode_outs, "energy_step: decode_outs, computing the energy function")  # decode_outs: [1, 20]
    # decoder_seq [N, 30, 512]; 30 is the string length
    en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]  # 30, 512
    de_hidden = decode_outs.shape[-1]

    # W * h_j
    reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))  # [b, 64, 512] => [b*64, 512]
    _p(reshaped_enc_outputs, "reshaped_enc_outputs")
    # W_a [512, 512], reshaped_enc_outputs [b*64, 512] => [b*64, 512] => [b, 64, 512]
    W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))

    # U * S_{t-1}: decode_outs [b, 512], U_a [512, 512] => [b, 512] => [b, 1, 512]
    U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a), axis=1)  # <= batch_size, 1, latent_dim

    # The tricky detail here: the decoder output is effectively replicated across the
    # 64 encoder time steps and added to the encoder outputs [64, 512].
    # tanh(W * h_j + U * S_{t-1} + b): [b, 64, 512] => [b*64, 512]
    reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))

    # V * tanh(W * h_j + U * S_{t-1} + b): [b*64, 512] x [512, 1] => [b*64, 1] => [b, 64]
    e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))

    # softmax(e_tj)
    e_i = K.softmax(e_i)
    e_i = _p(e_i, "energy_step: e_i")
    return e_i, [e_i]
def call(self, x):
    # Parse the inputs Q_seq, K_seq, V_seq; Q_len and V_len are the mask lengths
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x
    print("Q_seq------------------", Q_seq)

    # Linear transformations of Q, K and V: nb_head projections,
    # each into size_per_head dimensions
    Q_seq = K.dot(Q_seq, self.WQ)  # queries
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))  # transpose-like reordering of the axes, shape=(4,)
    K_seq = K.dot(K_seq, self.WK)  # keys
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))  # shape=(4,)
    V_seq = K.dot(V_seq, self.WV)  # values
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Dot product, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5  # "attention_11/Shape_12:0", shape=(5,)
    # The line above fails in TF2 with:
    #   ValueError: Dimension must be 5 but is 4 for 'attention_11/transpose_7'
    # In TF1, A has rank 4; in TF2 it comes out with rank 5.
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Compute the output and apply the query mask
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq
def call(self, x):
    # soft-assignment
    s = K.conv2d(x, self.kernel, padding='same') + self.bias
    print('s.shape=', s.shape)
    a = K.softmax(s)
    self.amap = K.argmax(a, -1)

    # Dims used hereafter: batch, H, W, desc_coeff, cluster
    a = K.expand_dims(a, -2)

    # Core
    v = K.expand_dims(x, -1) + self.C
    v = a * v
    v = K.sum(v, axis=[1, 2])
    v = K.permute_dimensions(v, pattern=[0, 2, 1])
    # v.shape = None x K x D

    # Normalize v (intra-normalization)
    v = K.l2_normalize(v, axis=-1)
    v = K.batch_flatten(v)
    v = K.l2_normalize(v, axis=-1)

    # return [v, self.amap]
    return v
def rpn_loss_regr_fixed_num(y_true, y_pred):
    shape = K.shape(y_true)
    true_reshaped = K.reshape(y_true, (C.BATCH_SIZE, 7, 7, 5, 25))
    pred_reshaped = K.reshape(y_pred, (C.BATCH_SIZE, 7, 7, 5, 25))

    mask = true_reshaped[:, :, :, :, 4]
    # class_mask = K.reshape(K.repeat_elements(mask, 20, 3), (C.BATCH_SIZE, 7, 7, 5, 20))
    # coord_mask = K.reshape(K.repeat_elements(mask, 4, 3), (C.BATCH_SIZE, 7, 7, 5, 4))
    # object_mask = mask
    # no_object_mask = 1 - mask

    class_loss = 10 * (1 - K.categorical_crossentropy(
        true_reshaped[:, :, :, :, 5:], K.softmax(pred_reshaped[:, :, :, :, 5:])))

    object_square = K.square(1 - K.sigmoid(pred_reshaped[:, :, :, :, 4]))
    object_loss = object_lambda * K.sum(object_square)

    no_object_square = K.square(0 - K.sigmoid(pred_reshaped[:, :, :, :, 4]))
    no_object_loss = object_lambda * K.sum(no_object_square)

    coord_square = K.square(true_reshaped[:, :, :, :, :4] - pred_reshaped[:, :, :, :, :4])
    coord_loss = coord_lambda * K.sum(coord_square)

    return (class_loss + object_loss + no_object_loss + coord_loss)