    def create_encodings(self, embeddings):
        encodings = [Encoding()(embedding) for embedding in embeddings]
        return [
            DecayingDropout(initial_keep_rate=self.dropout_initial_keep_rate,
                            decay_interval=self.dropout_decay_interval,
                            decay_rate=self.dropout_decay_rate)(encoding)
            for encoding in encodings
        ]
    def create_feature_extraction(self, interaction):
        d = K.int_shape(interaction)[-1]
        feature_extractor_input = Conv2D(filters=int(
            d * self.first_scale_down_ratio),
                                         kernel_size=1,
                                         activation=None,
                                         name='FirstScaleDown')(interaction)
        feature_extractor = DenseNet(
            include_top=False,
            input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
            nb_dense_block=self.nb_dense_blocks,
            nb_layers_per_block=self.layers_per_dense_block,
            compression=self.transition_scale_down_ratio,
            growth_rate=self.growth_rate)(feature_extractor_input)

        features = DecayingDropout(
            initial_keep_rate=self.dropout_initial_keep_rate,
            decay_interval=self.dropout_decay_interval,
            decay_rate=self.dropout_decay_rate,
            name='Features')(feature_extractor)
        return features
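    # A hedged shape note (assumed defaults, not stated in this file): with
    # 300-dimensional word vectors plus the defaults used elsewhere here
    # (100 char-conv filters, 50 syntactical features, 1 exact-match flag),
    # the interaction tensor carries d = 300 + 100 + 50 + 1 = 451 channels,
    # so 'FirstScaleDown' reduces it to int(451 * 0.3) = 135 feature maps
    # before the DenseNet blocks, whose transition layers then compress the
    # channel count again by transition_scale_down_ratio = 0.5.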
    def call(self, P, **kwargs):
        """
        :param P: inputs
        :return: encoding of inputs P
        """
        ''' Paper notations in the code '''
        # P = P_hw
        # itr_attn = P_itrAtt
        # encoding = P_enc
        # The paper takes inputs to be P(_hw) as an example and then computes the same thing for H,
        # therefore we'll name our inputs P too.

        # Input of encoding is P with shape (batch, p, d). It would be (batch, h, d) for hypothesis
        # Construct alphaP of shape (batch, p, 3*d, p)
        # A = dot(w_itr_att, alphaP)

        # alphaP consists of 3*d rows along the 2nd axis
        # 1. up   -> first  d rows represent P[i]
        # 2. mid  -> second d rows represent P[j]
        # 3. down -> final  d rows represent alpha(P[i], P[j]), the element-wise product P[i]*P[j]

        # If we look at one slice of alphaP we'll see that it has the following elements:
        # ----------------------------------------
        # P[i][0], P[i][0], P[i][0], ... P[i][0]   ▲
        # P[i][1], P[i][1], P[i][1], ... P[i][1]   |
        # P[i][2], P[i][2], P[i][2], ... P[i][2]   |
        # ...                              ...     | up
        #      ...                         ...     |
        #             ...                  ...     |
        # P[i][d], P[i][d], P[i][d], ... P[i][d]   ▼
        # ----------------------------------------
        # P[0][0], P[1][0], P[2][0], ... P[p][0]   ▲
        # P[0][1], P[1][1], P[2][1], ... P[p][1]   |
        # P[0][2], P[1][2], P[2][2], ... P[p][2]   |
        # ...                              ...     | mid
        #      ...                         ...     |
        #             ...                  ...     |
        # P[0][d], P[1][d], P[2][d], ... P[p][d]   ▼
        # ----------------------------------------
        #                                          ▲
        #                                          |
        #                                          |
        #               up * mid                   | down
        #          element-wise product            |
        #                                          |
        #                                          ▼
        # ----------------------------------------

        # For every slice i, the up part holds P[i], so it changes with i
        # The middle part is the same for every i, so it is simply repeated p times along the first axis
        # We can therefore obtain the middle part with:
        # mid = broadcast(P) -> a tensor of shape (batch, p, d, p)
        # up is the same as mid but with its axes swapped, so to obtain up from mid we can do:
        # up = swap_axes(mid, axis1=0, axis2=2)
        ''' Alpha '''
        # P                                                     # (batch, p, d)
        mid = broadcast_last_axis(P)  # (batch, p, d, p)
        up = K.permute_dimensions(mid,
                                  pattern=(0, 3, 2, 1))  # (batch, p, d, p)
        alphaP = K.concatenate([up, mid, up * mid],
                               axis=2)  # (batch, p, 3d, p)
        A = K.dot(self.w_itr_att, alphaP)  # (batch, p, p)
        ''' Self-attention '''
        # P_itr_attn[i] = sum over j = 1...p of:
        #                     (e^A[i][j] / s) * P[j], where s = sum over k = 1...p of e^A[k][j]
        #                     --> P[j] is the j-th row of P, while the first factor is a scalar weight
        # So P_itr_attn is a weighted sum of the rows of P
        # SA is the column-wise soft-max applied on A
        # P_itr_attn[i] is the sum of all rows of P scaled by the i-th row of SA
        SA = softmax(A, axis=2)  # (batch, p, p)
        itr_attn = K.batch_dot(SA, P)  # (batch, p, d)
        ''' Fuse gate '''
        # These layers are considered linear in the official implementation, therefore we apply dropout on each input
        P_concat = K.concatenate([P, itr_attn], axis=2)  # (batch, p, 2d)
        z = K.tanh(K.dot(DecayingDropout()(P_concat), self.w1) +
                   self.b1)  # (batch, p, d)
        r = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w2) +
                      self.b2)  # (batch, p, d)
        f = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w3) +
                      self.b3)  # (batch, p, d)

        encoding = r * P + f * z  # (batch, p, d)
        return encoding  # (batch, p, d)
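To make the alphaP construction above concrete, here is a minimal NumPy sketch (not part of the model) for a single, un-batched sentence; np.broadcast_to stands in for the broadcast_last_axis helper, and the final assertion checks that A[i][j] = w_itr_att · [P[i]; P[j]; P[i]*P[j]].

import numpy as np

p, d = 4, 3
P = np.random.rand(p, d)                     # one sentence, no batch axis

# mid[i, :, j] = P[j]: the matrix P^T repeated along the first axis
mid = np.broadcast_to(P.T, (p, d, p))
# up[i, :, j] = P[i]: mid with its first and last axes swapped
up = mid.transpose(2, 1, 0)

alphaP = np.concatenate([up, mid, up * mid], axis=1)   # (p, 3d, p)

w_itr_att = np.random.rand(3 * d)
A = np.einsum('k,ikj->ij', w_itr_att, alphaP)          # (p, p)
assert np.allclose(
    A[1, 2], w_itr_att @ np.concatenate([P[1], P[2], P[1] * P[2]]))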
Example #4
    def __init__(self,
                 p=None,
                 h=None,
                 include_word_vectors=True,
                 word_embedding_weights=None,
                 train_word_embeddings=True,
                 include_chars=True,
                 chars_per_word=16,
                 char_embedding_size=8,
                 char_conv_filters=100,
                 char_conv_kernel_size=5,
                 include_syntactical_features=True,
                 syntactical_feature_size=50,
                 include_exact_match=True,
                 dropout_initial_keep_rate=1.,
                 dropout_decay_rate=0.977,
                 dropout_decay_interval=10000,
                 first_scale_down_ratio=0.3,
                 transition_scale_down_ratio=0.5,
                 growth_rate=20,
                 layers_per_dense_block=8,
                 nb_dense_blocks=3,
                 nb_labels=3,
                 inputs=None,
                 outputs=None,
                 name='DIIN'):
        """
        :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-

        :param p: sequence length of premise
        :param h: sequence length of hypothesis
        :param include_word_vectors: whether or not to include word vectors in the model
        :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
        :param train_word_embeddings: whether or not to modify word embeddings while training
        :param include_chars: whether or not to include character embeddings in the model
        :param chars_per_word: fixed number of characters per word
        :param char_embedding_size: input size of the character-embedding layer
        :param char_conv_filters: number of conv-filters applied on character embedding
        :param char_conv_kernel_size: size of the kernel applied on character embeddings
        :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
        :param syntactical_feature_size: size of the syntactical feature vector for each word
        :param include_exact_match: whether or not to include exact match features in the model
        :param dropout_initial_keep_rate: initial keep rate of the decaying dropout
        :param dropout_decay_rate: multiplicative factor applied to the keep rate at each decay interval
        :param dropout_decay_interval: number of training steps between consecutive keep-rate decays
        :param first_scale_down_ratio: first scale down ratio in densenet
        :param transition_scale_down_ratio: transition scale down ratio in densenet
        :param growth_rate: growth rate in densenet
        :param layers_per_dense_block: number of layers in one dense-block
        :param nb_dense_blocks: number of dense blocks in densenet
        :param nb_labels: number of labels (3 labels by default: entailment, contradiction, neutral)
        """

        if inputs or outputs:
            super(DIIN, self).__init__(inputs=inputs,
                                       outputs=outputs,
                                       name=name)
            return

        if include_word_vectors:
            assert word_embedding_weights is not None
        inputs = []
        premise_embeddings = []
        hypothesis_embeddings = []
        '''Embedding layer'''
        # 1. Word embedding input
        if include_word_vectors:
            premise_word_input = Input(shape=(p, ),
                                       dtype='int64',
                                       name='PremiseWordInput')
            hypothesis_word_input = Input(shape=(h, ),
                                          dtype='int64',
                                          name='HypothesisWordInput')
            inputs.append(premise_word_input)
            inputs.append(hypothesis_word_input)

            word_embedding = Embedding(
                input_dim=word_embedding_weights.shape[0],
                output_dim=word_embedding_weights.shape[1],
                weights=[word_embedding_weights],
                trainable=train_word_embeddings,
                name='WordEmbedding')
            premise_word_embedding = word_embedding(premise_word_input)
            hypothesis_word_embedding = word_embedding(hypothesis_word_input)

            premise_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='PremiseWordEmbeddingDropout')(premise_word_embedding)
            hypothesis_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='HypothesisWordEmbeddingDropout')(
                    hypothesis_word_embedding)
            premise_embeddings.append(premise_word_embedding)
            hypothesis_embeddings.append(hypothesis_word_embedding)

        # 2. Character input
        if include_chars:
            premise_char_input = Input(shape=(
                p,
                chars_per_word,
            ),
                                       name='PremiseCharInput')
            hypothesis_char_input = Input(shape=(
                h,
                chars_per_word,
            ),
                                          name='HypothesisCharInput')
            inputs.append(premise_char_input)
            inputs.append(hypothesis_char_input)

            # Share weights of character-level embedding for premise and hypothesis
            character_embedding_layer = TimeDistributed(Sequential([
                Embedding(input_dim=100,
                          output_dim=char_embedding_size,
                          input_length=chars_per_word),
                Conv1D(filters=char_conv_filters,
                       kernel_size=char_conv_kernel_size),
                GlobalMaxPooling1D()
            ]),
                                                        name='CharEmbedding')
            character_embedding_layer.build(input_shape=(None, None,
                                                         chars_per_word))
            premise_char_embedding = character_embedding_layer(
                premise_char_input)
            hypothesis_char_embedding = character_embedding_layer(
                hypothesis_char_input)
            premise_embeddings.append(premise_char_embedding)
            hypothesis_embeddings.append(hypothesis_char_embedding)

        # 3. Syntactical features
        if include_syntactical_features:
            premise_syntactical_input = Input(shape=(
                p,
                syntactical_feature_size,
            ),
                                              name='PremiseSyntacticalInput')
            hypothesis_syntactical_input = Input(
                shape=(
                    h,
                    syntactical_feature_size,
                ),
                name='HypothesisSyntacticalInput')
            inputs.append(premise_syntactical_input)
            inputs.append(hypothesis_syntactical_input)
            premise_embeddings.append(premise_syntactical_input)
            hypothesis_embeddings.append(hypothesis_syntactical_input)

        # 4. One-hot exact match feature
        if include_exact_match:
            premise_exact_match_input = Input(shape=(p, ),
                                              name='PremiseExactMatchInput')
            hypothesis_exact_match_input = Input(
                shape=(h, ), name='HypothesisExactMatchInput')
            premise_exact_match = Reshape(target_shape=(
                p,
                1,
            ))(premise_exact_match_input)
            hypothesis_exact_match = Reshape(target_shape=(
                h,
                1,
            ))(hypothesis_exact_match_input)
            inputs.append(premise_exact_match_input)
            inputs.append(hypothesis_exact_match_input)
            premise_embeddings.append(premise_exact_match)
            hypothesis_embeddings.append(hypothesis_exact_match)

        # Concatenate all features
        premise_embedding = Concatenate(
            name='PremiseEmbedding')(premise_embeddings)
        hypothesis_embedding = Concatenate(
            name='HypothesisEmbedding')(hypothesis_embeddings)
        d = K.int_shape(hypothesis_embedding)[-1]
        '''Encoding layer'''
        # Now we have the embedded premise [pxd] along with embedded hypothesis [hxd]
        premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
        hypothesis_encoding = Encoding(
            name='HypothesisEncoding')(hypothesis_embedding)
        '''Interaction layer'''
        interaction = Interaction(name='Interaction')(
            [premise_encoding, hypothesis_encoding])
        '''Feature Extraction layer'''
        feature_extractor_input = Conv2D(filters=int(d *
                                                     first_scale_down_ratio),
                                         kernel_size=1,
                                         activation=None,
                                         name='FirstScaleDown')(interaction)
        feature_extractor = DenseNet(
            include_top=False,
            input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
            nb_dense_block=nb_dense_blocks,
            nb_layers_per_block=layers_per_dense_block,
            compression=transition_scale_down_ratio,
            growth_rate=growth_rate)(feature_extractor_input)
        '''Output layer'''
        features = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                   decay_interval=dropout_decay_interval,
                                   decay_rate=dropout_decay_rate,
                                   name='Features')(feature_extractor)
        out = Dense(units=nb_labels, activation='softmax',
                    name='Output')(features)
        super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)
    def create_interaction(self, encodings):
        interaction = Interaction(name='Interaction')(encodings)
        return DecayingDropout(
            initial_keep_rate=self.dropout_initial_keep_rate,
            decay_interval=self.dropout_decay_interval,
            decay_rate=self.dropout_decay_rate)(interaction)
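A hedged usage sketch for the DIIN constructor shown in Example #4 (assuming DIIN and its Keras dependencies are importable from the surrounding module; the GloVe matrix is replaced here by a random array purely to satisfy the required argument):

import numpy as np

vocab_size, embedding_dim = 40000, 300       # hypothetical vocabulary / embedding size
glove_weights = np.random.rand(vocab_size, embedding_dim).astype('float32')

model = DIIN(p=32,                           # premise length
             h=32,                           # hypothesis length
             word_embedding_weights=glove_weights,
             nb_labels=3)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()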
Example #6
    def __init__(self,
                 p=None,
                 h=None,
                 include_word_vectors=True,
                 word_embedding_weights=None,
                 train_word_embeddings=True,
                 include_chars=True,
                 chars_per_word=16,
                 char_embedding_size=8,
                 char_conv_filters=100,
                 char_conv_kernel_size=5,
                 include_syntactical_features=True,
                 syntactical_feature_size=50,
                 include_exact_match=True,
                 dropout_initial_keep_rate=1.,
                 dropout_decay_rate=0.977,
                 dropout_decay_interval=10000,
                 first_scale_down_ratio=0.3,
                 transition_scale_down_ratio=0.5,
                 growth_rate=20,
                 layers_per_dense_block=8,
                 nb_dense_blocks=3,
                 nb_labels=3,
                 inputs=None,
                 outputs=None,
                 name='DIIN'):
        """
        :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-

        :param p: sequence length of premise
        :param h: sequence length of hypothesis
        :param include_word_vectors: whether or not to include word vectors in the model
        :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
        :param train_word_embeddings: whether or not to modify word embeddings while training
        :param include_chars: whether or not to include character embeddings in the model
        :param chars_per_word: fixed number of characters per word
        :param char_embedding_size: input size of the character-embedding layer
        :param char_conv_filters: number of conv-filters applied on character embedding
        :param char_conv_kernel_size: size of the kernel applied on character embeddings
        :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
        :param syntactical_feature_size: size of the syntactical feature vector for each word
        :param include_exact_match: whether or not to include exact match features in the model
        :param dropout_initial_keep_rate: initial keep rate of the decaying dropout
        :param dropout_decay_rate: multiplicative factor applied to the keep rate at each decay interval
        :param dropout_decay_interval: number of training steps between consecutive keep-rate decays
        :param first_scale_down_ratio: first scale down ratio in densenet
        :param transition_scale_down_ratio: transition scale down ratio in densenet
        :param growth_rate: growth rate in densenet
        :param layers_per_dense_block: number of layers in one dense-block
        :param nb_dense_blocks: number of dense blocks in densenet
        :param nb_labels: number of labels (3 labels by default: entailment, contradiction, neutral)
        """

        if inputs or outputs:
            super(DIIN, self).__init__(inputs=inputs,
                                       outputs=outputs,
                                       name=name)
            return

        if include_word_vectors:
            assert word_embedding_weights is not None
        inputs = []
        premise_embeddings = []
        hypothesis_embeddings = []
        '''Embedding Layer: converts each word or phrase into a vector representation
            and builds a matrix representation of the sentence.
            Pre-trained word vectors such as word2vec or GloVe can be used directly.
            To improve results, POS tagging, named-entity recognition and similar
            methods can supply additional lexical and syntactic information.
            '''
        # 1. Word embedding input
        if include_word_vectors:
            premise_word_input = Input(shape=(p, ),
                                       dtype='int64',
                                       name='PremiseWordInput')
            hypothesis_word_input = Input(shape=(h, ),
                                          dtype='int64',
                                          name='HypothesisWordInput')
            inputs.append(premise_word_input)
            inputs.append(hypothesis_word_input)

            word_embedding = Embedding(
                input_dim=word_embedding_weights.shape[0],
                output_dim=word_embedding_weights.shape[1],
                weights=[word_embedding_weights],
                trainable=train_word_embeddings,
                name='WordEmbedding')
            premise_word_embedding = word_embedding(premise_word_input)
            hypothesis_word_embedding = word_embedding(hypothesis_word_input)

            premise_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='PremiseWordEmbeddingDropout')(premise_word_embedding)
            hypothesis_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='HypothesisWordEmbeddingDropout')(
                    hypothesis_word_embedding)
            premise_embeddings.append(premise_word_embedding)
            hypothesis_embeddings.append(hypothesis_word_embedding)

        # 2. Character input
        if include_chars:
            premise_char_input = Input(shape=(p, ), name='PremiseCharInput')
            hypothesis_char_input = Input(shape=(h, ),
                                          name='HypothesisCharInput')
            inputs.append(premise_char_input)
            inputs.append(hypothesis_char_input)

            char_embedding = Embedding(
                input_dim=word_embedding_weights.shape[0],
                output_dim=word_embedding_weights.shape[1],
                weights=[word_embedding_weights],
                trainable=train_word_embeddings,
                name='CharEmbedding')
            premise_char_embedding = char_embedding(premise_char_input)
            hypothesis_char_embedding = char_embedding(hypothesis_char_input)

            premise_char_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='PremiseCharEmbeddingDropout')(premise_char_embedding)
            hypothesis_char_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='HypothesisCharEmbeddingDropout')(
                    hypothesis_char_embedding)

            premise_embeddings.append(premise_char_embedding)
            hypothesis_embeddings.append(hypothesis_char_embedding)

        # 3. Syntactical features
        if include_syntactical_features:
            premise_syntactical_input = Input(shape=(
                p,
                syntactical_feature_size,
            ),
                                              name='PremiseSyntacticalInput')
            hypothesis_syntactical_input = Input(
                shape=(
                    h,
                    syntactical_feature_size,
                ),
                name='HypothesisSyntacticalInput')
            inputs.append(premise_syntactical_input)
            inputs.append(hypothesis_syntactical_input)
            premise_embeddings.append(premise_syntactical_input)
            hypothesis_embeddings.append(hypothesis_syntactical_input)

        # 4. One-hot exact match feature
        if include_exact_match:
            premise_exact_match_input = Input(shape=(p, ),
                                              name='PremiseExactMatchInput')
            hypothesis_exact_match_input = Input(
                shape=(h, ), name='HypothesisExactMatchInput')
            premise_exact_match = Reshape(target_shape=(
                p,
                1,
            ))(premise_exact_match_input)
            hypothesis_exact_match = Reshape(target_shape=(
                h,
                1,
            ))(hypothesis_exact_match_input)
            inputs.append(premise_exact_match_input)
            inputs.append(hypothesis_exact_match_input)
            premise_embeddings.append(premise_exact_match)
            hypothesis_embeddings.append(hypothesis_exact_match)

        # Concatenate all features
        premise_embedding = Concatenate(
            name='PremiseEmbedding')(premise_embeddings)
        hypothesis_embedding = Concatenate(
            name='HypothesisEmbedding')(hypothesis_embeddings)
        d = K.int_shape(hypothesis_embedding)[-1]
        '''Encoding Layer
        Encodes the output of the Embedding Layer.
        Different encoders can be used here, e.g. BiLSTM or self-attention,
        and several encoders can be combined to obtain a better sentence representation.
        '''
        # Now we have the embedded premise [pxd] along with embedded hypothesis [hxd]
        premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
        hypothesis_encoding = Encoding(
            name='HypothesisEncoding')(hypothesis_embedding)
        '''Interaction Layer
        Builds the interaction tensor between the premise and the hypothesis.
        The interaction can be modeled in several ways, e.g. cosine distance or dot product.
        '''
        interaction = Interaction(name='Interaction')(
            [premise_encoding, hypothesis_encoding])
        '''Feature Extraction layer'''
        feature_extractor_input = Conv2D(filters=int(d *
                                                     first_scale_down_ratio),
                                         kernel_size=1,
                                         activation=None,
                                         name='FirstScaleDown')(interaction)
        feature_extractor_input = BatchNormalization()(feature_extractor_input)
        feature_extractor = DenseNet(
            include_top=False,
            input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
            nb_dense_block=nb_dense_blocks,
            nb_layers_per_block=layers_per_dense_block,
            compression=transition_scale_down_ratio,
            growth_rate=growth_rate)(feature_extractor_input)
        '''Output layer'''
        features = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                   decay_interval=dropout_decay_interval,
                                   decay_rate=dropout_decay_rate,
                                   name='Features')(feature_extractor)
        out = Dense(units=nb_labels, activation='sigmoid',
                    name='Output')(features)
        super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)
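The three dropout arguments shared by both examples imply a schedule along these lines (a hedged sketch of the presumed DecayingDropout behaviour, not its actual implementation): the keep rate starts at dropout_initial_keep_rate and is multiplied by dropout_decay_rate once every dropout_decay_interval training steps.

def keep_rate(step, initial_keep_rate=1.0, decay_rate=0.977, decay_interval=10000):
    # Assumed schedule: one multiplicative decay per completed interval.
    return initial_keep_rate * decay_rate ** (step // decay_interval)

print(keep_rate(0))        # 1.0
print(keep_rate(10000))    # 0.977
print(keep_rate(100000))   # ~0.79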