# Example #1
    def att_layer(self, inputs, W_name="W_att_"):
        """Self-attention block over a conv feature map (SAGAN-style).

        Projects the input through three 1x1-style convolutions (f, g, h),
        forms an attention map from g·f^T, applies it to h, projects the
        result back with the "o" convolution, and adds it to the input
        scaled by ``self.att_gamma`` (residual connection).

        Args:
            inputs: 4-D feature map tensor, presumably [b, h, w, c] — the
                reshape below assumes static h/w dims; TODO confirm.
            W_name: prefix used to look up filters in ``self.att_filters``.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        residual = inputs
        proj_f = ne.conv2d(residual, filters=self.att_filters[W_name+"f_0"], biases=None,
                           strides=self.att_f_strides, padding=self.att_f_padding)
        proj_g = ne.conv2d(residual, filters=self.att_filters[W_name+"g_0"], biases=None,
                           strides=self.att_g_strides, padding=self.att_g_padding)
        proj_h = ne.conv2d(residual, filters=self.att_filters[W_name+"h_0"], biases=None,
                           strides=self.att_h_strides, padding=self.att_h_padding)

        # Downsample keys/values to shrink the N x N attention matrix:
        # GOOGLE pools once (h/2, w/2), DUOCONV pools twice (h/4, w/4).
        num_pools = {"GOOGLE": 1, "DUOCONV": 2}.get(self.attention_type, 0)
        for _ in range(num_pools):
            proj_f = ne.max_pool_2x2(proj_f)
            proj_h = ne.max_pool_2x2(proj_h)

        # scores: [b, N_q, N_k] where N = h * w of the (possibly pooled) maps
        scores = tf.matmul(ne.hw_flatten(proj_g), ne.hw_flatten(proj_f), transpose_b=True)
        attn_map = ne.softmax(scores)
        context = tf.matmul(attn_map, ne.hw_flatten(proj_h))  # [b, N, C]

        # Restore spatial layout: batch dim is dynamic, h/w come from the
        # static shape of the input, channels from the configured o-size.
        out_shape = ([tf.shape(inputs)[0]]
                     + inputs.get_shape().as_list()[1:-1]
                     + [self.att_o_channel_size])
        context = tf.reshape(context, shape=out_shape)
        context = ne.conv2d(context, filters=self.att_filters[W_name+"o_0"], biases=None,
                            strides=self.att_o_strides, padding=self.att_o_padding)

        # Learnable-gamma residual connection.
        return self.att_gamma * context + residual
# Example #2
    def _multihead_attention_layer(self,
                                   layer_idx,
                                   query,
                                   memory=None,
                                   mask=None):
        """Multi-head scaled dot-product attention (Transformer-style).

        Args:
            layer_idx: index used to select per-layer weights, biases,
                leaky-relu ratio and dropout rate.
            query: query tensor, presumably [batch, q_size, d_model] —
                TODO confirm against callers.
            memory: key/value source; defaults to ``query`` (self-attention).
            mask: optional [batch, q_size, k_size] tensor of 0/1 values;
                positions where mask is 0.0 are pushed to -1e10 before
                softmax so they receive ~zero attention.

        Returns:
            Attention output, [batch, q_size, d_model].
        """
        if memory is None:
            memory = query  # self-attention: keys/values come from the query

        # Linear project to d_model dimension: [batch, q_size/k_size, d_model]
        Q = ne.fully_conn(query,
                          self.att_weights["W_att_Q_l{}_0".format(layer_idx)],
                          self.att_biases["b_att_Q_l{}_0".format(layer_idx)])
        Q = ne.leaky_relu(Q, self.leaky_ratio[layer_idx])

        K = ne.fully_conn(memory,
                          self.att_weights["W_att_K_l{}_0".format(layer_idx)],
                          self.att_biases["b_att_K_l{}_0".format(layer_idx)])
        K = ne.leaky_relu(K, self.leaky_ratio[layer_idx])

        V = ne.fully_conn(memory,
                          self.att_weights["W_att_V_l{}_0".format(layer_idx)],
                          self.att_biases["b_att_V_l{}_0".format(layer_idx)])
        V = ne.leaky_relu(V, self.leaky_ratio[layer_idx])

        # Split the matrix to multiple heads and then concatenate to have a larger
        # batch size: [h*batch, q_size/k_size, d_model/num_heads]
        Q_split = tf.concat(tf.split(Q, self.num_att_header, axis=2), axis=0)
        K_split = tf.concat(tf.split(K, self.num_att_header, axis=2), axis=0)
        V_split = tf.concat(tf.split(V, self.num_att_header, axis=2), axis=0)
        # Fix: use `is not None` (identity check) instead of `!= None`;
        # `!=` on a tensor builds an elementwise comparison op rather than
        # testing for the Python None sentinel, and it was inconsistent
        # with the `is not None` check further below.
        if mask is not None:
            # Replicate the mask once per head to match the folded batch dim.
            mask = tf.tile(mask, [self.num_att_header, 1, 1])

        # Apply scaled dot product attention
        d = self.feature_size // self.num_att_header
        assert d == Q_split.shape[-1] == K_split.shape[-1] == V_split.shape[-1]

        out = tf.matmul(Q_split,
                        tf.transpose(K_split,
                                     [0, 2, 1]))  # [h*batch, q_size, k_size]
        out = out / tf.sqrt(tf.cast(d, tf.float32))  # scaled by sqrt(d_k)

        if mask is not None:
            # masking out (0.0) => setting to -inf.
            out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10)

        out = ne.softmax(out)  # [h * batch, q_size, k_size]
        out = ne.dropout(out, self.drop_rate[layer_idx], self.is_training)
        out = tf.matmul(out, V_split)  # [h * batch, q_size, d_model]

        # Merge the multi-head back to the original shape
        out = tf.concat(tf.split(out, self.num_att_header, axis=0),
                        axis=2)  # [bs, q_size, d_model]

        return out