Exemple #1
0
def encoder(sequence,
            seq_length,
            repr_dim=100,
            module='lstm',
            num_layers=1,
            reuse=None,
            residual=False,
            activation=None,
            layer_norm=False,
            name='encoder',
            dropout=None,
            is_eval=True,
            **kwargs):
    if num_layers == 1:
        if layer_norm:
            with tf.variable_scope('layernorm', reuse=False) as vs:
                vs._reuse = False  # HACK
                num_layernorms = sum(1 for v in vs.global_variables()
                                     if 'layernorm' in v.name)
                sequence = tf.contrib.layers.layer_norm(
                    sequence, scope=str(num_layernorms))

        with tf.variable_scope(name, reuse=reuse):
            if module == 'lstm':
                out = bi_lstm(repr_dim, sequence, seq_length, **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'sru':
                with_residual = sequence.get_shape()[2].value == repr_dim
                out = bi_sru(repr_dim, sequence, seq_length, with_residual,
                             **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'rnn':
                out = bi_rnn(
                    repr_dim,
                    tf.nn.rnn_cell.BasicRNNCell(
                        repr_dim, activation_from_string(activation)),
                    sequence, seq_length, **kwargs)
            elif module == 'gru':
                out = bi_rnn(repr_dim, tf.contrib.rnn.GRUBlockCell(repr_dim),
                             sequence, seq_length, **kwargs)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'gldr':
                out = gated_linear_dilated_residual_network(
                    repr_dim, sequence, **kwargs)
            elif module == 'conv':
                out = convnet(repr_dim,
                              sequence,
                              1,
                              activation=activation_from_string(activation),
                              **kwargs)
            elif module == 'conv_glu':
                out = gated_linear_convnet(repr_dim, sequence, 1, **kwargs)
            elif module == 'conv_separable':
                out = depthwise_separable_convolution(
                    repr_dim,
                    sequence,
                    activation=activation_from_string(activation),
                    **kwargs)
            elif module == 'dense':
                out = tf.layers.dense(sequence, repr_dim)
                if activation:
                    out = activation_from_string(activation)(out)
            elif module == 'highway':
                out = highway_network(sequence, num_layers,
                                      activation_from_string(activation))
            elif module == 'self_attn':
                outs = []
                num_attn = kwargs.get('num_attn_heads', 1)
                for i in range(num_attn):
                    with tf.variable_scope(str(i)):
                        attn = self_attention(sequence,
                                              seq_length,
                                              repr_dim=repr_dim,
                                              **kwargs)
                        outs.append(attn)
                out = tf.concat(outs, 2) if num_attn > 1 else outs[0]
            elif module == 'positional_encoding':
                out = positional_encoding(sequence, seq_length)
            else:
                raise ValueError("Unknown encoder type: %s" % module)

            if residual:
                if out.get_shape()[-1].value != sequence.get_shape()[-1].value:
                    logging.error(
                        'Residual connection only possible if input to sequence encoder %s of type %s has same '
                        'dimension (%d) as output (%d).' %
                        (name, module, sequence.get_shape()[-1].value,
                         out.get_shape()[-1].value))
                    raise RuntimeError()
                out += sequence

            if dropout is not None:
                out = tf.cond(
                    tf.logical_and(tf.greater(dropout, 0.0),
                                   tf.logical_not(is_eval)),
                    lambda: tf.nn.dropout(
                        out,
                        1.0 - dropout,
                        noise_shape=[tf.shape(out)[0], 1,
                                     tf.shape(out)[-1]]), lambda: out)
    else:
        out = encoder(sequence,
                      seq_length,
                      repr_dim,
                      module,
                      num_layers - 1,
                      reuse,
                      residual,
                      activation,
                      layer_norm,
                      name,
                      dropout=dropout,
                      is_eval=is_eval,
                      **kwargs)

        out = encoder(out,
                      seq_length,
                      repr_dim,
                      module,
                      1,
                      reuse,
                      residual,
                      activation,
                      layer_norm,
                      name + str(num_layers - 1),
                      dropout=dropout,
                      is_eval=is_eval,
                      **kwargs)

    return out
Exemple #2
0
    def create_output(self, shared_resources, input_tensors):
        tensors = TensorPortTensors(input_tensors)
        with tf.variable_scope(
                "fast_qa", initializer=tf.contrib.layers.xavier_initializer()):
            # Some helpers
            batch_size = tf.shape(tensors.question_length)[0]
            max_question_length = tf.reduce_max(tensors.question_length)
            support_mask = misc.mask_for_lengths(tensors.support_length)

            input_size = shared_resources.embeddings.shape[-1]
            size = shared_resources.config["repr_dim"]
            with_char_embeddings = shared_resources.config.get(
                "with_char_embeddings", False)

            # set shapes for inputs
            tensors.emb_question.set_shape([None, None, input_size])
            tensors.emb_support.set_shape([None, None, input_size])

            emb_question = tensors.emb_question
            emb_support = tensors.emb_support
            if with_char_embeddings:
                # compute combined embeddings
                [char_emb_question, char_emb_support
                 ] = conv_char_embedding(len(shared_resources.char_vocab),
                                         size, tensors.word_chars,
                                         tensors.word_char_length, [
                                             tensors.question_batch_words,
                                             tensors.support_batch_words
                                         ])

                emb_question = tf.concat([emb_question, char_emb_question], 2)
                emb_support = tf.concat([emb_support, char_emb_support], 2)
                input_size += size

                # set shapes for inputs
                emb_question.set_shape([None, None, input_size])
                emb_support.set_shape([None, None, input_size])

            # compute encoder features
            question_features = tf.ones(
                tf.stack([batch_size, max_question_length, 2]))

            v_wiqw = tf.get_variable("v_wiq_w", [1, 1, input_size],
                                     initializer=tf.constant_initializer(1.0))

            wiq_w = tf.matmul(tf.gather(emb_question * v_wiqw,
                                        tensors.support2question),
                              emb_support,
                              adjoint_b=True)
            wiq_w = wiq_w + tf.expand_dims(support_mask, 1)

            question_binary_mask = tf.gather(
                tf.sequence_mask(tensors.question_length, dtype=tf.float32),
                tensors.support2question)
            wiq_w = tf.reduce_sum(
                tf.nn.softmax(wiq_w) * tf.expand_dims(question_binary_mask, 2),
                [1])

            # [B, L , 2]
            support_features = tf.stack([tensors.word_in_question, wiq_w], 2)

            # highway layer to allow for interaction between concatenated embeddings
            if with_char_embeddings:
                with tf.variable_scope("char_embeddings") as vs:
                    emb_question = tf.layers.dense(
                        emb_question, size, name="embeddings_projection")
                    emb_question = highway_network(emb_question, 1)
                    vs.reuse_variables()
                    emb_support = tf.layers.dense(emb_support,
                                                  size,
                                                  name="embeddings_projection")
                    emb_support = highway_network(emb_support, 1)

            keep_prob = 1.0 - shared_resources.config.get("dropout", 0.0)
            emb_question, emb_support = tf.cond(
                tensors.is_eval, lambda: (emb_question, emb_support), lambda:
                (tf.nn.dropout(emb_question,
                               keep_prob,
                               noise_shape=
                               [1, 1, emb_question.get_shape()[-1].value]),
                 tf.nn.dropout(emb_support,
                               keep_prob,
                               noise_shape=
                               [1, 1, emb_question.get_shape()[-1].value])))

            # extend embeddings with features
            emb_question_ext = tf.concat([emb_question, question_features], 2)
            emb_support_ext = tf.concat([emb_support, support_features], 2)

            # encode question and support
            encoder_type = shared_resources.config.get('encoder',
                                                       'lstm').lower()
            if encoder_type in ['lstm', 'sru', 'gru']:
                size = size + 2 if encoder_type == 'sru' else size  # to allow for use of residual in SRU
                encoded_question = encoder(emb_question_ext,
                                           tensors.question_length,
                                           size,
                                           module=encoder_type)
                encoded_support = encoder(emb_support_ext,
                                          tensors.support_length,
                                          size,
                                          module=encoder_type,
                                          reuse=True)
                projection_initializer = tf.constant_initializer(
                    np.concatenate([np.eye(size), np.eye(size)]))
                encoded_question = tf.layers.dense(
                    encoded_question,
                    size,
                    tf.tanh,
                    use_bias=False,
                    kernel_initializer=projection_initializer,
                    name='projection_q')
                encoded_support = tf.layers.dense(
                    encoded_support,
                    size,
                    tf.tanh,
                    use_bias=False,
                    kernel_initializer=projection_initializer,
                    name='projection_s')
            else:
                raise ValueError(
                    "Only rnn ('lstm', 'sru', 'gru') encoder allowed for FastQA!"
                )

            answer_layer = shared_resources.config.get('answer_layer',
                                                       'conditional').lower()

            topk = tf.get_variable('topk',
                                   initializer=shared_resources.config.get(
                                       'topk', 1),
                                   dtype=tf.int32,
                                   trainable=False)
            topk_p = tf.placeholder(tf.int32, [], 'beam_size_setter')
            topk_assign = topk.assign(topk_p)
            self._topk_assign = lambda k: self.tf_session.run(
                topk_assign, {topk_p: k})

            if answer_layer == 'conditional':
                start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                    conditional_answer_layer(size, encoded_question, tensors.question_length, encoded_support,
                                             tensors.support_length,
                                             tensors.correct_start, tensors.support2question, tensors.answer2support,
                                             tensors.is_eval,
                                             topk=topk,
                                             max_span_size=shared_resources.config.get("max_span_size", 10000))
            elif answer_layer == 'conditional_bilinear':
                start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                    conditional_answer_layer(size, encoded_question, tensors.question_length, encoded_support,
                                             tensors.support_length,
                                             tensors.correct_start, tensors.support2question, tensors.answer2support,
                                             tensors.is_eval,
                                             topk=topk,
                                             max_span_size=shared_resources.config.get("max_span_size", 10000),
                                             bilinear=True)
            elif answer_layer == 'bilinear':
                start_scores, end_scores, doc_idx, predicted_start_pointer, predicted_end_pointer = \
                    bilinear_answer_layer(size, encoded_question, tensors.question_length, encoded_support,
                                          tensors.support_length,
                                          tensors.support2question, tensors.answer2support, tensors.is_eval,
                                          topk=topk,
                                          max_span_size=shared_resources.config.get("max_span_size", 10000))
            else:
                raise ValueError

            span = tf.stack(
                [doc_idx, predicted_start_pointer, predicted_end_pointer], 1)

            return TensorPort.to_mapping(self.output_ports,
                                         (start_scores, end_scores, span))