def att_layer(self, inputs, W_name="W_att_"):
    net = inputs
    # Convolutions producing the f (query), g (key) and h (value) feature maps.
    f = ne.conv2d(net, filters=self.att_filters[W_name+"f_0"], biases=None,
                  strides=self.att_f_strides, padding=self.att_f_padding)  # [b, h, w, c]
    g = ne.conv2d(net, filters=self.att_filters[W_name+"g_0"], biases=None,
                  strides=self.att_g_strides, padding=self.att_g_padding)  # [b, h, w, c]
    h = ne.conv2d(net, filters=self.att_filters[W_name+"h_0"], biases=None,
                  strides=self.att_h_strides, padding=self.att_h_padding)  # [b, h, w, c]

    # Optionally downsample f and h to shrink the attention map.
    if self.attention_type == "GOOGLE":
        f = ne.max_pool_2x2(f)  # [b, h/2, w/2, c]
        h = ne.max_pool_2x2(h)  # [b, h/2, w/2, c]
    elif self.attention_type == "DUOCONV":
        f = ne.max_pool_2x2(ne.max_pool_2x2(f))  # [b, h/4, w/4, c]
        h = ne.max_pool_2x2(ne.max_pool_2x2(h))  # [b, h/4, w/4, c]

    # N = h * w
    s = tf.matmul(ne.hw_flatten(g), ne.hw_flatten(f), transpose_b=True)  # [b, N, N]
    beta = ne.softmax(s)  # attention map, [b, N, N]

    o = tf.matmul(beta, ne.hw_flatten(h))  # [b, N, C]
    o = tf.reshape(o, shape=[tf.shape(inputs)[0]] +
                            inputs.get_shape().as_list()[1:-1] +
                            [self.att_o_channel_size])  # [b, h, w, C]
    o = ne.conv2d(o, filters=self.att_filters[W_name+"o_0"], biases=None,
                  strides=self.att_o_strides, padding=self.att_o_padding)  # [b, h, w, c]

    # Residual connection scaled by att_gamma.
    net = self.att_gamma * o + net
    return net
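# The sketch below (not part of the original module) illustrates the attention-map
# math in att_layer using plain TensorFlow ops; ne.hw_flatten / ne.softmax are assumed
# to be thin wrappers over the equivalent tf ops, and tensorflow is assumed to be
# imported as `tf`, as elsewhere in this file. f and h must share a spatial size
# (they are pooled together above), while the output keeps g's spatial grid.
def _self_attention_map_sketch(f, g, h):
    def hw_flatten(x):  # [b, H, W, c] -> [b, H*W, c]
        return tf.reshape(x, [tf.shape(x)[0], -1, x.get_shape().as_list()[-1]])

    s = tf.matmul(hw_flatten(g), hw_flatten(f), transpose_b=True)  # [b, N_g, N_f]
    beta = tf.nn.softmax(s, axis=-1)                               # each row sums to 1
    o = tf.matmul(beta, hw_flatten(h))                             # [b, N_g, c_h]
    # Fold the flattened positions back onto g's spatial grid.
    return tf.reshape(o, [tf.shape(g)[0], tf.shape(g)[1], tf.shape(g)[2],
                          h.get_shape().as_list()[-1]])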
def _multihead_attention_layer(self, layer_idx, query, memory=None, mask=None):
    if memory is None:
        memory = query

    # Linear project to d_model dimension: [batch, q_size/k_size, d_model]
    Q = ne.fully_conn(query, self.att_weights["W_att_Q_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_Q_l{}_0".format(layer_idx)])
    Q = ne.leaky_relu(Q, self.leaky_ratio[layer_idx])
    K = ne.fully_conn(memory, self.att_weights["W_att_K_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_K_l{}_0".format(layer_idx)])
    K = ne.leaky_relu(K, self.leaky_ratio[layer_idx])
    V = ne.fully_conn(memory, self.att_weights["W_att_V_l{}_0".format(layer_idx)],
                      self.att_biases["b_att_V_l{}_0".format(layer_idx)])
    V = ne.leaky_relu(V, self.leaky_ratio[layer_idx])

    # Split the matrix into multiple heads and then concatenate along the batch axis
    # to get a larger batch: [h*batch, q_size/k_size, d_model/num_heads]
    Q_split = tf.concat(tf.split(Q, self.num_att_header, axis=2), axis=0)
    K_split = tf.concat(tf.split(K, self.num_att_header, axis=2), axis=0)
    V_split = tf.concat(tf.split(V, self.num_att_header, axis=2), axis=0)
    if mask is not None:
        mask = tf.tile(mask, [self.num_att_header, 1, 1])

    # Apply scaled dot-product attention
    d = self.feature_size // self.num_att_header
    assert d == Q_split.shape[-1] == K_split.shape[-1] == V_split.shape[-1]

    out = tf.matmul(Q_split, tf.transpose(K_split, [0, 2, 1]))  # [h*batch, q_size, k_size]
    out = out / tf.sqrt(tf.cast(d, tf.float32))  # scaled by sqrt(d_k)
    if mask is not None:
        # masking out (0.0) => setting to -inf.
        out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10)
    out = ne.softmax(out)  # [h*batch, q_size, k_size]
    out = ne.dropout(out, self.drop_rate[layer_idx], self.is_training)
    out = tf.matmul(out, V_split)  # [h*batch, q_size, d_model/num_heads]

    # Merge the multiple heads back to the original shape
    out = tf.concat(tf.split(out, self.num_att_header, axis=0), axis=2)  # [bs, q_size, d_model]

    return out
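# A small sketch (not in the original code) of the head split/merge trick used in
# _multihead_attention_layer: splitting along the feature axis and re-stacking along
# the batch axis lets a single batched matmul compute softmax(Q K^T / sqrt(d_k)) V
# for all heads at once, without an explicit loop over heads. The names num_heads
# and x here are illustrative only; tensorflow is assumed to be imported as `tf`.
def _split_merge_heads_sketch(x, num_heads):
    # x: [batch, seq_len, d_model], with d_model divisible by num_heads.
    split = tf.concat(tf.split(x, num_heads, axis=2), axis=0)       # [num_heads*batch, seq_len, d_model/num_heads]
    merged = tf.concat(tf.split(split, num_heads, axis=0), axis=2)  # back to [batch, seq_len, d_model]
    return split, merged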