def evaluate(data):
    """Compute accuracy, recall, precision and F1 of ``model`` over ``data``.

    ``data`` is iterated as ``(x_true, y_true)`` batches. Predictions are the
    argmax over axis 1 of ``model.predict(x_true)``; labels are taken from
    column 0 of ``y_true``. Positive counts are obtained by clipping values
    to [0, 1], rounding and summing.
    NOTE(review): the positive counts look designed for binary 0/1 labels;
    confirm against the dataset.

    Returns:
        Tuple ``(accuracy, recall, precision, f1_score)``.
    """

    def _count_positives(values):
        # Clip to [0, 1], round, then sum and convert via ``.numpy()``.
        return K.sum(K.round(K.clip(values, 0, 1))).numpy()

    n_samples = n_correct = 0.
    hits = actual = predicted = 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        labels = y_true[:, 0]
        n_correct += (labels == y_pred).sum()
        hits += _count_positives(labels * y_pred)
        actual += _count_positives(labels)
        predicted += _count_positives(y_pred)
        n_samples += len(labels)

    accuracy = n_correct / n_samples
    # Epsilon keeps the ratios finite when a denominator is zero.
    eps = K.epsilon()
    recall = hits / (actual + eps)
    precision = hits / (predicted + eps)
    f1_score = 2 * ((precision * recall) / (precision + recall + eps))
    return accuracy, recall, precision, f1_score
def new_update(x, new_x):
    """Apply ``old_update`` after an optional layer-wise rescaling of the step.

    The rescaling is applied only when ``x`` is one of ``params`` and
    ``self._do_layer_adaptation(x)`` is truthy. In that case the step
    ``new_x - x`` is multiplied by ``norm(x) / norm(step / lr)``; the ratio
    falls back to 1 when ``norm(x)`` is not positive or the step norm does
    not exceed ``K.epsilon()``.
    """
    adapt = is_one_of(x, params) and self._do_layer_adaptation(x)
    if not adapt:
        return old_update(x, new_x)

    step = new_x - x
    # Clip the learning rate away from zero so the division below is safe.
    safe_lr = K.clip(self.learning_rate, K.epsilon(), 1e10)
    weight_norm = tf.norm(x)
    step_norm = tf.norm(step / safe_lr)
    inner_ratio = K.switch(
        step_norm > K.epsilon(), weight_norm / step_norm, 1.
    )
    trust_ratio = K.switch(weight_norm > 0., inner_ratio, 1.)
    return old_update(x, x + step * trust_ratio)
def new_update(x, new_x):
    """Apply ``old_update`` after an optional layer-wise rescaling of the step.

    The rescaling is applied only when ``x`` is the captured ``var`` and
    ``self._do_layer_adaptation(x)`` is truthy. In that case the step
    ``new_x - x`` is multiplied by ``norm(x) / norm(step / lr)``, where
    ``lr`` comes from ``self._decayed_lr``; the ratio falls back to 1 when
    either norm is not positive.
    """
    should_adapt = x is var and self._do_layer_adaptation(x)
    if not should_adapt:
        return old_update(x, new_x)

    delta = new_x - x
    decayed_lr = self._decayed_lr(x.dtype.base_dtype)
    # Keep the learning rate strictly positive before dividing by it.
    decayed_lr = K.clip(decayed_lr, K.epsilon(), K.infinity())
    param_norm = tf.norm(x)
    delta_norm = tf.norm(delta / decayed_lr)
    guarded_ratio = K.switch(delta_norm > 0.0, param_norm / delta_norm, 1.0)
    scale = K.switch(param_norm > 0.0, guarded_ratio, 1.0)
    return old_update(x, x + delta * scale)
def compute_position_ids(self, inputs):
    """Return clipped, non-negative relative position ids for ``(q, v)``.

    For every pair of positions along axis 1 of ``q`` and ``v`` the
    difference ``v_index - q_index`` is computed, clipped to
    ``[-max_position, max_position]`` with
    ``max_position = (self.input_dim - 1) // 2``, and shifted by
    ``max_position`` so the smallest id is 0.
    """
    q, v = inputs
    half_span = (self.input_dim - 1) // 2
    # Column of query indices and row of value indices; their difference
    # broadcasts to a (query_len, value_len) grid.
    q_column = K.expand_dims(K.arange(0, K.shape(q)[1], dtype='int32'), 1)
    v_row = K.expand_dims(K.arange(0, K.shape(v)[1], dtype='int32'), 0)
    offsets = K.clip(v_row - q_column, -half_span, half_span)
    return offsets + half_span
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
    """Implement multi-head attention.

    ``inputs`` is a list starting with ``q, k, v`` followed by the mask
    tensors whose flags are truthy, in the order q_mask, v_mask, a_mask.

    q_mask: flag for a mask on the query sequence; mainly used to zero the
        padding part of the output.
    v_mask: flag for a mask on the value sequence; mainly used to stop
        attention from reading padding.
    a_mask: flag for a mask on the attention matrix. When the flag is
        truthy but no tensor is supplied, a 'history_only' (lower
        triangular) mask is built.

    Raises:
        ValueError: if the number of inputs does not fit the mask flags.
    """
    # Work on a copy and put a ``None`` placeholder wherever a mask is
    # switched off, so the masks always sit at fixed positions 3, 4, 5.
    inputs = inputs[:]
    for slot, flag in enumerate((q_mask, v_mask, a_mask), start=3):
        if not flag:
            inputs.insert(slot, None)
    q, k, v, q_mask, v_mask = inputs[:5]
    n_inputs = len(inputs)
    if n_inputs == 5:
        a_mask = 'history_only'
    elif n_inputs == 6:
        a_mask = inputs[-1]
    else:
        raise ValueError('wrong inputs for MultiHeadAttention.')

    # Linear projections followed by a split into heads.
    qw, kw, vw = self.q_dense(q), self.k_dense(k), self.v_dense(v)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))

    # Raw attention scores.
    scores = tf.einsum('bjhd,bkhd->bhjk', qw, kw)

    # Relative position encoding.
    max_rel = self.max_relative_position
    use_relative = max_rel is not None
    if use_relative:
        q_column = K.expand_dims(
            K.arange(0, K.shape(q)[1], dtype='int32'), 1
        )
        v_row = K.expand_dims(K.arange(0, K.shape(v)[1], dtype='int32'), 0)
        rel_ids = K.clip(v_row - q_column, -max_rel, max_rel) + max_rel
        pos_embeddings = K.gather(self.relative_embeddings, rel_ids)
        scores = scores + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)

    # Scale, then mask.
    scores = scores / self.key_size**0.5
    scores = sequence_masking(scores, v_mask, 1, -1)
    if a_mask is None:
        pass
    elif is_string(a_mask):
        # Large penalty on everything outside the lower triangle.
        ones = K.ones_like(scores[:1, :1])
        penalty = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
        scores = scores - penalty
    else:
        scores = scores - (1 - a_mask) * 1e12
    weights = K.softmax(scores)

    # Weighted sum of values, merge heads, output projection.
    out = tf.einsum('bhjk,bkhd->bjhd', weights, vw)
    if use_relative:
        out = out + tf.einsum('bhjk,jkd->bjhd', weights, pos_embeddings)
    out = K.reshape(out, (-1, K.shape(out)[1], self.out_dim))
    out = self.o_dense(out)
    return sequence_masking(out, q_mask, 0)
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
    """Implement multi-head attention, with optional sequence pooling.

    ``inputs`` is a list starting with ``q, k, v``; ``inputs[3]``, if
    present, is the attention-mask tensor used when ``a_mask`` is truthy.

    q_mask: mask for the query sequence; mainly used to zero the padding
        part of the output. When not None it is passed to
        ``search_layer(q, q_mask)`` and that layer's ``output_mask`` is
        used (presumably a layer identifier; confirm against callers).
    v_mask: mask for the value sequence; mainly used to stop attention
        from reading padding. Resolved the same way as ``q_mask``.
    a_mask: flag for a mask on the attention matrix. Truthy with only
        three inputs selects a 'history_only' (lower-triangular) mask;
        truthy with a fourth input uses that tensor. Any falsy value
        (None, False, 0) means no attention mask.
    """
    q, k, v = inputs[:3]
    if a_mask:
        if len(inputs) == 3:
            a_mask = 'history_only'
        else:
            a_mask = inputs[3]
    else:
        # Normalise every falsy flag to None. Otherwise a_mask=False would
        # pass the ``is not None`` checks below and either be subscripted
        # in the pooling branch or subtract a constant 1e12 from all
        # logits.
        a_mask = None
    if q_mask is not None:
        # Cache the looked-up layer so the search runs only once.
        if not hasattr(self, 'q_mask_layer'):
            self.q_mask_layer = search_layer(q, q_mask)
        q_mask = self.q_mask_layer.output_mask
    if v_mask is not None:
        if not hasattr(self, 'v_mask_layer'):
            self.v_mask_layer = search_layer(v, v_mask)
        v_mask = self.v_mask_layer.output_mask
    # Pooling: shorten the sequences by average pooling before attention.
    if self.pool_size > 1:
        is_self_attention = (q is k is v)
        # Remember the original query length so it can be restored later.
        q_in_len = K.shape(q)[1]
        q = sequence_masking(q, q_mask, 0)
        q = divisible_temporal_padding(q, self.pool_size)
        q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
        if is_self_attention:
            k = v = q
        else:
            k = sequence_masking(k, v_mask, 0)
            k = divisible_temporal_padding(k, self.pool_size)
            k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
            v = sequence_masking(v, v_mask, 0)
            v = divisible_temporal_padding(v, self.pool_size)
            v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
        # Subsample the masks with the same stride as the pooling.
        if v_mask is not None:
            v_mask = v_mask[:, ::self.pool_size]
        if a_mask is not None and not is_string(a_mask):
            a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
    # Linear projections.
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Split into heads.
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention scores.
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encoding.
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(pos_ids, -self.max_relative_position,
                         self.max_relative_position)
        # Shift so the ids are non-negative indices into the table.
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued): scale, mask, normalise.
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            # Large penalty on everything outside the lower triangle.
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Produce the output.
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Restore the original length by repeating each pooled step.
    if self.pool_size > 1:
        o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
    # Return the result.
    o = sequence_masking(o, q_mask, 0)
    return o