Example #1
    def attention_layer(self,
                        x,
                        y,
                        hidden_size,
                        bias,
                        name,
                        is_train,
                        cache=None):
        """
        """
        # Query, key, and value projections
        q = dense_layer(x, hidden_size // 4, name='q')
        k = dense_layer(y, hidden_size // 4, name='k')
        v = dense_layer(y, hidden_size, name='v')

        if cache is not None:
            # Combine cached keys and values with new keys and values.
            k = tf.concat([cache["k"], k], axis=1)
            v = tf.concat([cache["v"], v], axis=1)

            # Update cache
            cache["k"] = k
            cache["v"] = v

        # Split heads (for multi-head attention)
        q = self.split_heads(q, hidden_size // 4)
        k = self.split_heads(k, hidden_size // 4)
        v = self.split_heads(v, hidden_size)

        # Scale q to prevent the dot product
        # between q and k from growing too large.
        depth = (hidden_size // self.num_heads)
        q *= depth**-0.5

        # Calculate dot product attention
        logits = tf.matmul(q, k, transpose_b=True)
        logits += bias
        w = tf.nn.softmax(logits, name="attention_weights")

        if is_train:
            w = tf.nn.dropout(w, self.dropout_rate)

        attention_output = tf.matmul(w, v)

        # Recombine heads --> [batch_size, length, hidden_size]
        attention_output = self.combine_heads(attention_output, hidden_size)

        # Run the combined outputs through another linear projection layer.
        attention_output = dense_layer(attention_output,
                                       hidden_size,
                                       name='att_out')

        return attention_output, w
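
The helpers split_heads and combine_heads are called above but not shown in this example. A minimal sketch of what they typically look like, assuming the standard Transformer head reshaping, that self.num_heads evenly divides the given size, and that tensorflow is imported as tf as in the surrounding code:

    def split_heads(self, x, size):
        # [batch, length, size] -> [batch, num_heads, length, size // num_heads]
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]
        depth = size // self.num_heads
        x = tf.reshape(x, [batch_size, length, self.num_heads, depth])
        return tf.transpose(x, [0, 2, 1, 3])

    def combine_heads(self, x, size):
        # [batch, num_heads, length, depth] -> [batch, length, size]
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[2]
        x = tf.transpose(x, [0, 2, 1, 3])
        return tf.reshape(x, [batch_size, length, size])
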
Example #2
    def get_logits(self, image, is_train, **kwargs):
        widths = tf.ones(tf.shape(image)[0],
                         dtype=tf.int32) * tf.shape(image)[2]
        features, sequence_length = self._convnet_layers(
            image, widths, is_train)
        features = rnn_layers(features,
                              sequence_length,
                              self.rnn_size,
                              use_projection=True)
        logits = dense_layer(features,
                             len(self.out_charset) + 1,
                             name='logits')

        return logits, sequence_length
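
dense_layer is a project helper used throughout these examples and is not defined here. A rough, hypothetical stand-in, assuming it wraps a plain fully connected layer with an optional activation (TensorFlow 1.x):

def dense_layer(x, units, name=None, activation=None):
    # Hypothetical sketch of the dense_layer helper: a fully connected
    # layer with Xavier-initialized weights and an optional activation.
    return tf.layers.dense(x,
                           units,
                           activation=activation,
                           kernel_initializer=tf.contrib.layers.xavier_initializer(),
                           name=name)
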
Example #3
    def get_logits(self, image, is_train, **kwargs):
        """
        """
        # ResNet
        widths = tf.ones(tf.shape(image)[0],
                         dtype=tf.int32) * tf.shape(image)[2]
        features, sequence_length = self._convnet_layers(
            image, widths, is_train)

        # LSTM encoder
        with tf.variable_scope("rnn"):
            rnn_inputs = tf.nn.max_pool(features, (1, 8, 1, 1), (1, 1, 1, 1),
                                        'VALID',
                                        data_format='NHWC')
            rnn_inputs = tf.squeeze(rnn_inputs, axis=[1])
            rnn_inputs = tf.transpose(rnn_inputs,
                                      perm=[1, 0, 2],
                                      name='time_major')
            holistic_features = rnn_layer(rnn_inputs,
                                          sequence_length,
                                          self.rnn_size,
                                          scope='holistic')
            holistic_feature = dense_layer(holistic_features[-1],
                                           self.FLAGS.rnn_size,
                                           name='holistic_projection')

        # 2D LSTM decoder
        logits, weights = self.twodim_attention_decoder(
            holistic_feature, features, kwargs['label'], len(self.out_charset),
            self.FLAGS.rnn_size, is_train, self.FLAGS.label_maxlen)
        logits = tf.reshape(
            logits, [-1, self.FLAGS.label_maxlen,
                     len(self.out_charset) + 1])

        sequence_length = None
        self.attention_weights = tf.expand_dims(weights, axis=1)

        return logits, sequence_length
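
The twodim_attention_decoder called here is shown in Example #5 below. The rnn_layer helper is not included; a minimal sketch, under the assumption that it runs a single time-major LSTM and returns the full output sequence (so that holistic_features[-1] is the last time step):

def rnn_layer(inputs, sequence_length, rnn_size, scope=None):
    # Hypothetical sketch of the rnn_layer helper: one time-major LSTM
    # over the pooled feature sequence; the caller takes outputs[-1] as
    # the holistic feature.
    with tf.variable_scope(scope or 'rnn_layer'):
        cell = tf.contrib.rnn.LSTMCell(rnn_size)
        outputs, _ = tf.nn.dynamic_rnn(cell,
                                       inputs,
                                       sequence_length=sequence_length,
                                       time_major=True,
                                       dtype=tf.float32)
    return outputs
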
Example #4
    def get_logits(self, image, is_train, **kwargs):
        """
        """
        widths = tf.ones(tf.shape(image)[0],
                         dtype=tf.int32) * tf.shape(image)[2]
        features, sequence_length = self._convnet_layers(
            image, widths, is_train)
        attention_states = rnn_layers(features,
                                      sequence_length,
                                      self.rnn_size,
                                      use_projection=True)
        attention_states = dense_layer(attention_states,
                                       self.rnn_size,
                                       name='att_state_dense')
        logits, weights = attention_decoder(attention_states,
                                            kwargs['label'],
                                            len(self.out_charset),
                                            self.rnn_size,
                                            is_train,
                                            self.FLAGS.label_maxlen,
                                            cell_type='gru')

        return logits, sequence_length
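
rnn_layers (used in Examples #2 and #4) is another repository helper. A rough sketch, assuming [batch, time, depth] inputs fed to a single bidirectional LSTM whose concatenated outputs are optionally projected back to rnn_size; the actual helper may stack several layers:

def rnn_layers(inputs, sequence_length, rnn_size, use_projection=False):
    # Hypothetical sketch of the rnn_layers helper: one bidirectional LSTM;
    # forward and backward outputs are concatenated and optionally projected
    # back down to rnn_size.
    with tf.variable_scope('rnn_layers'):
        cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
        cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                     cell_bw,
                                                     inputs,
                                                     sequence_length=sequence_length,
                                                     dtype=tf.float32)
        outputs = tf.concat(outputs, axis=-1)
        if use_projection:
            outputs = tf.layers.dense(outputs, rnn_size, name='projection')
    return outputs
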
Example #5
    def twodim_attention_decoder(self,
                                 holistic_feature,
                                 attention_states,
                                 label,
                                 num_classes,
                                 rnn_size,
                                 is_train,
                                 label_maxlen=25):
        """
        """
        with tf.variable_scope('attention_layer'):
            batch_size = tf.shape(attention_states)[0]
            cell = tf.contrib.rnn.LSTMCell(rnn_size)
            dummy_label = tf.concat([
                tf.zeros([batch_size, num_classes]),
                tf.ones([batch_size, 1])
            ],
                                    axis=-1)
            decoder_inputs = [dummy_label] + [None] * (label_maxlen - 1)

            if label is not None:
                output_shape = tf.to_int64(
                    tf.stack([batch_size, label_maxlen], axis=0))
                label = tf.sparse_to_dense(sparse_indices=label.indices,
                                           sparse_values=label.values,
                                           output_shape=output_shape,
                                           default_value=num_classes)
                label_one_hot = tf.one_hot(label, num_classes + 1)
            else:
                label_one_hot = tf.zeros([batch_size, label_maxlen])

            softmax_w = tf.get_variable(
                'softmax_w', [rnn_size, num_classes + 1],
                initializer=tf.contrib.layers.xavier_initializer())
            softmax_b = tf.get_variable(
                'softmax_b', [num_classes + 1],
                initializer=tf.constant_initializer(value=0.0))

            def get_train_input(prev, i):
                if i == 0:
                    return dummy_label
                else:
                    return label_one_hot[:, i - 1, :]

            def get_eval_input(prev, i):
                if i == 0:
                    return dummy_label
                else:
                    _logit = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
                    _prediction = tf.argmax(_logit, axis=-1)
                    return tf.one_hot(_prediction, num_classes + 1)

            def get_input(prev, i):
                if is_train:
                    return get_train_input(prev, i)
                else:
                    return get_eval_input(prev, i)

            # attention_states [B, 8, 25, 512]
            height = tf.shape(attention_states)[1]
            width = tf.shape(attention_states)[2]
            attn_size = rnn_size
            q = tf.get_variable("AttnQ", [1, attn_size * 2, attn_size],
                                dtype=tf.float32)
            k = tf.get_variable("AttnK", [3, 3, attn_size, attn_size],
                                dtype=tf.float32)
            v = tf.get_variable("AttnV", [1, 1, attn_size, 1],
                                dtype=tf.float32)
            key = tf.nn.conv2d(attention_states, k, [1, 1, 1, 1], "SAME")

            def attention(query):
                with tf.variable_scope("Attention"):
                    query = tf.reshape(query, [batch_size, 1, attn_size * 2])
                    y = tf.nn.conv1d(query, q, 1, "SAME", data_format="NWC")
                    y = tf.reshape(y, [-1, 1, 1, attn_size])
                    s = tf.nn.conv2d(tf.nn.tanh(key + y), v, [1, 1, 1, 1],
                                     "SAME")
                    s = tf.reshape(s, [-1, height * width, 1])
                    a = tf.nn.softmax(s, axis=1)
                    a = tf.reshape(a, [-1, height, width, 1])
                    d = tf.reduce_sum(a * attention_states, [1, 2])

                return d, tf.reshape(a, [-1, height, width])

            attn_weights = []
            features = []
            prev = None
            state = (holistic_feature, holistic_feature)
            _state = tf.concat(state, axis=-1)
            attns, ats = attention(_state)
            attn_weights.append(ats)

            for i, inp in enumerate(decoder_inputs):

                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                if prev is not None:
                    with tf.variable_scope("loop_function", reuse=True):
                        inp = get_input(prev, i)

                input_size = inp.get_shape().with_rank(2)[1]

                inputs = tf.concat([inp, attns], axis=-1)
                x = dense_layer(inputs,
                                input_size,
                                name="input_projection",
                                activation=None)

                # Run the RNN.
                cell_output, state = cell(x, state)

                # Run the attention mechanism.
                _state = tf.concat(state, axis=-1)
                attns, ats = attention(_state)
                attn_weights.append(ats)

                with tf.variable_scope("AttnOutputProjection"):
                    inputs = tf.concat([cell_output, attns], axis=-1)
                    output = dense_layer(inputs,
                                         rnn_size,
                                         name="output_projection",
                                         activation=tf.nn.relu)

                prev = output
                features.append(output)

            features = tf.stack(features, axis=1)
            features = tf.reshape(features, (-1, rnn_size))
            rnn_logits = tf.nn.xw_plus_b(features, softmax_w, softmax_b)
            rnn_logits = tf.reshape(
                rnn_logits, (batch_size, label_maxlen, num_classes + 1))
            attn_weights = tf.stack(attn_weights, axis=1)

            return rnn_logits, attn_weights
Example #6
    def decoder_stack(self,
                      decoder_inputs,
                      encoder_outputs,
                      self_attention_bias,
                      attention_bias,
                      is_train,
                      cache=None):
        """
        """
        ws = []

        # Decoder stack
        for n in range(self.dec_layers):
            with tf.variable_scope("decoder_layer_%d" % n):
                layer_name = "layer_%d" % n
                layer_cache = cache[layer_name] if cache is not None else None

                with tf.variable_scope("self_attention"):
                    # layer norm
                    y = self.layer_norm(decoder_inputs, self.hidden_size)

                    # self att
                    y, _ = self.attention_layer(y, y, self.hidden_size,
                                                self_attention_bias,
                                                'self_att', is_train,
                                                layer_cache)

                    # dropout
                    if is_train:
                        y = tf.nn.dropout(y, self.dropout_rate)

                    # skip
                    decoder_inputs = y + decoder_inputs

                with tf.variable_scope("encdec_attention"):
                    # layer norm
                    y = self.layer_norm(decoder_inputs, self.hidden_size)

                    # encoder-decoder attention
                    y, w = self.attention_layer(y, encoder_outputs,
                                                self.hidden_size,
                                                attention_bias, 'encdec_att',
                                                is_train)
                    ws.append(w)

                    # dropout
                    if is_train:
                        y = tf.nn.dropout(y, self.dropout_rate)

                    # skip
                    decoder_inputs = y + decoder_inputs

                with tf.variable_scope("ffn"):
                    # layer norm
                    y = self.layer_norm(decoder_inputs, self.hidden_size)

                    # ffn
                    y = dense_layer(y,
                                    self.filter_size,
                                    name='filter_layer',
                                    activation=tf.nn.relu)

                    # dropout
                    if is_train:
                        y = tf.nn.dropout(y, self.dropout_rate)

                    y = dense_layer(y,
                                    self.hidden_size,
                                    name='output_layer',
                                    activation=tf.nn.relu)
                    # dropout
                    if is_train:
                        y = tf.nn.dropout(y, self.dropout_rate)

                    # skip
                    decoder_inputs = y + decoder_inputs

        # Output normalization
        decoder_outputs = self.layer_norm(decoder_inputs, self.hidden_size)
        ws = tf.stack(ws, axis=1)

        return decoder_outputs, ws
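
layer_norm is also a project helper (used here and in Example #7). A minimal sketch, assuming standard layer normalization over the last dimension with a learned scale and bias:

    def layer_norm(self, x, hidden_size, epsilon=1e-6):
        # Hypothetical sketch of the layer_norm helper: normalize over the
        # last (hidden) dimension, then apply a learned scale and bias.
        with tf.variable_scope('layer_norm', reuse=tf.AUTO_REUSE):
            scale = tf.get_variable('scale', [hidden_size],
                                    initializer=tf.ones_initializer())
            bias = tf.get_variable('bias', [hidden_size],
                                   initializer=tf.zeros_initializer())
        mean = tf.reduce_mean(x, axis=-1, keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True)
        normalized = (x - mean) * tf.rsqrt(variance + epsilon)
        return normalized * scale + bias
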
Example #7
    def transformer_encoder(self, features, num_layers, hidden_size, is_train):
        """
        """
        with tf.variable_scope('transformer_enc'):
            attention_bias = 0

            # Position encoding
            batch_size = tf.shape(features)[0]
            height = tf.shape(features)[1]
            width = tf.shape(features)[2]
            const_h = self.FLAGS.resize_hw.height // 4
            const_w = self.FLAGS.resize_hw.width // 4
            h_encoding = self.get_position_encoding(height, hidden_size,
                                                    'h_encoding')
            w_encoding = self.get_position_encoding(width, hidden_size,
                                                    'w_encoding')
            h_encoding = tf.expand_dims(h_encoding, axis=1)
            w_encoding = tf.expand_dims(w_encoding, axis=0)
            h_encoding = tf.tile(tf.expand_dims(h_encoding, axis=0),
                                 [batch_size, 1, 1, 1])
            w_encoding = tf.tile(tf.expand_dims(w_encoding, axis=0),
                                 [batch_size, 1, 1, 1])

            # Adaptive 2D positional encoding
            inter = tf.reduce_mean(features, axis=[1, 2])  # [B, hidden]
            inter = dense_layer(inter,
                                hidden_size // 2,
                                name='intermediate',
                                activation=tf.nn.relu)

            if is_train:
                inter = tf.nn.dropout(inter, self.dropout_rate)

            alpha = dense_layer(inter,
                                2 * hidden_size,
                                name='alpha',
                                activation=tf.nn.sigmoid)
            alpha = tf.reshape(alpha, [-1, 2, 1, hidden_size])
            pos_encoding = alpha[:, 0:1, :, :] * h_encoding \
                           + alpha[:, 1:2, :, :] * w_encoding

            features += pos_encoding
            self.hw = tf.reduce_sum(alpha, axis=[2, 3])

            # Save shape
            shape = (-1, height, width, hidden_size)
            features = tf.reshape(features, (-1, height * width, hidden_size))

            # Dropout
            if is_train:
                features = tf.nn.dropout(features, self.dropout_rate)

            # Encoder stack
            ws = []
            for n in range(num_layers):
                with tf.variable_scope("encoder_layer_%d" % n):
                    with tf.variable_scope("self_attention"):
                        # layer norm
                        y = self.layer_norm(features, hidden_size)

                        # self att
                        y, w = self.attention_layer(y, y, hidden_size,
                                                    attention_bias, 'self_att',
                                                    is_train)
                        ws.append(w)

                        # dropout
                        if is_train:
                            y = tf.nn.dropout(y, self.dropout_rate)

                        # skip
                        features = y + features

                    with tf.variable_scope("ffn"):
                        # layer norm
                        y = self.layer_norm(features, hidden_size)

                        # convolutional feed-forward: pointwise expand, 3x3 depthwise conv, pointwise reduce
                        y = tf.reshape(y, shape)

                        conv_params = [
                            ConvParams(self.filter_size, 1, (1, 1), 'same',
                                       False, True, 'expand'),
                            ConvParams(self.filter_size, 3, (1, 1), 'same',
                                       False, True, 'dwconv'),
                            ConvParams(self.hidden_size, 1, (1, 1), 'same',
                                       False, True, 'reduce')
                        ]
                        y = conv_layer(y, conv_params[0], is_train)
                        y = depthwise_conv_layer(y, conv_params[1], is_train)
                        y = conv_layer(y, conv_params[2], is_train)
                        y = tf.reshape(y, (-1, height * width, hidden_size))

                        # skip
                        features = y + features

            # Output normalization
            features = self.layer_norm(features, hidden_size)
            ws = tf.stack(ws, axis=1)

        return features, shape, ws
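
get_position_encoding is not shown in this example. A sketch assuming the standard sinusoidal position encoding from the Transformer (geometrically spaced timescales, even hidden_size), returning a [length, hidden_size] tensor:

    def get_position_encoding(self, length, hidden_size, name,
                              min_timescale=1.0, max_timescale=1.0e4):
        # Hypothetical sketch: sinusoidal position encoding; the first half of
        # the channels hold sines, the second half cosines. Assumes hidden_size
        # is even.
        with tf.name_scope(name):
            position = tf.to_float(tf.range(length))
            num_timescales = hidden_size // 2
            log_timescale_increment = (
                tf.log(max_timescale / min_timescale) /
                (tf.to_float(num_timescales) - 1.0))
            inv_timescales = min_timescale * tf.exp(
                tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
            scaled_time = (tf.expand_dims(position, 1) *
                           tf.expand_dims(inv_timescales, 0))
            signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        return signal
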