def create_encodings(self, embeddings):
    encodings = [Encoding()(embedding) for embedding in embeddings]
    return [DecayingDropout(initial_keep_rate=self.dropout_initial_keep_rate,
                            decay_interval=self.dropout_decay_interval,
                            decay_rate=self.dropout_decay_rate)(encoding)
            for encoding in encodings]
def create_feature_extraction(self, interaction):
    d = K.int_shape(interaction)[-1]
    feature_extractor_input = Conv2D(filters=int(d * self.first_scale_down_ratio),
                                     kernel_size=1,
                                     activation=None,
                                     name='FirstScaleDown')(interaction)
    feature_extractor = DenseNet(include_top=False,
                                 input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
                                 nb_dense_block=self.nb_dense_blocks,
                                 nb_layers_per_block=self.layers_per_dense_block,
                                 compression=self.transition_scale_down_ratio,
                                 growth_rate=self.growth_rate)(feature_extractor_input)
    features = DecayingDropout(initial_keep_rate=self.dropout_initial_keep_rate,
                               decay_interval=self.dropout_decay_interval,
                               decay_rate=self.dropout_decay_rate,
                               name='Features')(feature_extractor)
    return features
def call(self, P, **kwargs):
    """
    :param P: inputs
    :return: encoding of inputs P
    """
    ''' Paper notations in the code '''
    # P = P_hw
    # itr_attn = P_itrAtt
    # encoding = P_enc

    # The paper takes the inputs to be P(_hw) as an example and then computes the same thing for H,
    # therefore we'll name our inputs P too.
    # Input of encoding is P with shape (batch, p, d). It would be (batch, h, d) for the hypothesis.

    # Construct alphaP of shape (batch, p, 3*d, p)
    # A = dot(w_itr_att, alphaP)

    # alphaP consists of 3*d rows along the 2nd axis:
    # 1. up   -> first  d items represent P[i]
    # 2. mid  -> second d items represent P[j]
    # 3. down -> final  d items represent alpha(P[i], P[j]), the element-wise product P[i] * P[j]

    # If we look at one slice (i) of alphaP we'll see that it has the following elements:
    # ----------------------------------------
    # P[i][0], P[i][0], P[i][0], ... P[i][0]   ▲
    # P[i][1], P[i][1], P[i][1], ... P[i][1]   |
    # P[i][2], P[i][2], P[i][2], ... P[i][2]   |
    # ...                            ...       | up
    # ...                            ...       |
    # ...                            ...       |
    # P[i][d], P[i][d], P[i][d], ... P[i][d]   ▼
    # ----------------------------------------
    # P[0][0], P[1][0], P[2][0], ... P[p][0]   ▲
    # P[0][1], P[1][1], P[2][1], ... P[p][1]   |
    # P[0][2], P[1][2], P[2][2], ... P[p][2]   |
    # ...                            ...       | mid
    # ...                            ...       |
    # ...                            ...       |
    # P[0][d], P[1][d], P[2][d], ... P[p][d]   ▼
    # ----------------------------------------
    #                                          ▲
    #                                          |
    #                up * mid                  | down
    #           element-wise product           |
    #                                          |
    #                                          ▼
    # ----------------------------------------
    # For every slice (i) the up part changes its P[i] values.
    # The mid part is repeated p times in depth (for every i), so we can obtain it with:
    #   mid = broadcast(P)  -> tensor of shape (batch, p, d, p)
    # up is the same as mid but with two axes swapped, so we can obtain it with:
    #   up = swap_axes(mid, axis1=0, axis2=2)   (axes counted without the batch axis)

    ''' Alpha '''
    # P                                                     # (batch, p, d)
    mid = broadcast_last_axis(P)                            # (batch, p, d, p)
    up = K.permute_dimensions(mid, pattern=(0, 3, 2, 1))    # (batch, p, d, p)
    alphaP = K.concatenate([up, mid, up * mid], axis=2)     # (batch, p, 3d, p)
    A = K.dot(self.w_itr_att, alphaP)                       # (batch, p, p)

    ''' Self-attention '''
    # P_itr_attn[i] = sum over j = 1...p of (e^A[i][j] / s) * P[j],
    # where s = sum over k = 1...p of e^A[k][j].
    # P[j] is the j-th row of P, while the first factor is a scalar,
    # so P_itr_attn is a weighted sum of the rows of P.
    # SA is the column-wise softmax applied on A;
    # P_itr_attn[i] is the sum of all rows of P scaled by the i-th row of SA.
    SA = softmax(A, axis=2)                                 # (batch, p, p)
    itr_attn = K.batch_dot(SA, P)                           # (batch, p, d)

    ''' Fuse gate '''
    # These layers are considered linear in the official implementation,
    # therefore we apply dropout on each input.
    P_concat = K.concatenate([P, itr_attn], axis=2)                        # (batch, p, 2d)
    z = K.tanh(K.dot(DecayingDropout()(P_concat), self.w1) + self.b1)      # (batch, p, d)
    r = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w2) + self.b2)   # (batch, p, d)
    f = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w3) + self.b3)   # (batch, p, d)

    encoding = r * P + f * z                                # (batch, p, d)
    return encoding                                         # (batch, p, d)
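# A toy NumPy sketch (not part of the model) illustrating the alphaP / attention-logit
# construction above for a single example without a batch axis. The values of `p`, `d`,
# `P` and `w_itr_att` are made up; the point is that the broadcast formulation used by
# the layer produces the same A[i][j] = w . [P[i]; P[j]; P[i]*P[j]] as the direct formula.
import numpy as np

p, d = 4, 3                       # toy sequence length and embedding size
P = np.random.rand(p, d)          # one example, no batch axis for clarity
w_itr_att = np.random.rand(3 * d)

# Direct per-pair definition of the attention logits
A_direct = np.empty((p, p))
for i in range(p):
    for j in range(p):
        A_direct[i, j] = w_itr_att @ np.concatenate([P[i], P[j], P[i] * P[j]])

# Broadcast formulation: mid[i, :, j] holds P[j], up[i, :, j] holds P[i]
mid = np.broadcast_to(P.T[None, :, :], (p, d, p))     # (p, d, p)
up = mid.transpose(2, 1, 0)                           # (p, d, p)
alphaP = np.concatenate([up, mid, up * mid], axis=1)  # (p, 3d, p)
A_broadcast = np.einsum('k,ikj->ij', w_itr_att, alphaP)

assert np.allclose(A_direct, A_broadcast)
# The layer then softmaxes A and uses it to form a weighted sum of the rows of P.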
def __init__(self, p=None, h=None,
             include_word_vectors=True, word_embedding_weights=None, train_word_embeddings=True,
             include_chars=True, chars_per_word=16, char_embedding_size=8,
             char_conv_filters=100, char_conv_kernel_size=5,
             include_syntactical_features=True, syntactical_feature_size=50,
             include_exact_match=True,
             dropout_initial_keep_rate=1., dropout_decay_rate=0.977, dropout_decay_interval=10000,
             first_scale_down_ratio=0.3, transition_scale_down_ratio=0.5,
             growth_rate=20, layers_per_dense_block=8, nb_dense_blocks=3,
             nb_labels=3,
             inputs=None, outputs=None, name='DIIN'):
    """
    :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-

    :param p: sequence length of premise
    :param h: sequence length of hypothesis
    :param include_word_vectors: whether or not to include word vectors in the model
    :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
    :param train_word_embeddings: whether or not to modify word embeddings while training
    :param include_chars: whether or not to include character embeddings in the model
    :param chars_per_word: how many chars there are per word (a fixed number)
    :param char_embedding_size: input size of the character-embedding layer
    :param char_conv_filters: number of conv filters applied on the character embedding
    :param char_conv_kernel_size: size of the kernel applied on the character embeddings
    :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
    :param syntactical_feature_size: size of the syntactical feature vector for each word
    :param include_exact_match: whether or not to include exact-match features in the model
    :param dropout_initial_keep_rate: initial keep rate of the decaying dropout
    :param dropout_decay_rate: how much the keep rate changes at each interval
    :param dropout_decay_interval: how many training steps to wait between dropout updates
    :param first_scale_down_ratio: first scale-down ratio in the DenseNet
    :param transition_scale_down_ratio: transition scale-down ratio in the DenseNet
    :param growth_rate: growth rate in the DenseNet
    :param layers_per_dense_block: number of layers in one dense block
    :param nb_dense_blocks: number of dense blocks in the DenseNet
    :param nb_labels: number of labels (3 by default: entailment, contradiction, neutral)
    """
    if inputs or outputs:
        super(DIIN, self).__init__(inputs=inputs, outputs=outputs, name=name)
        return

    if include_word_vectors:
        assert word_embedding_weights is not None

    inputs = []
    premise_embeddings = []
    hypothesis_embeddings = []

    '''Embedding layer'''
    # 1. Word embedding input
    if include_word_vectors:
        premise_word_input = Input(shape=(p,), dtype='int64', name='PremiseWordInput')
        hypothesis_word_input = Input(shape=(h,), dtype='int64', name='HypothesisWordInput')
        inputs.append(premise_word_input)
        inputs.append(hypothesis_word_input)

        word_embedding = Embedding(input_dim=word_embedding_weights.shape[0],
                                   output_dim=word_embedding_weights.shape[1],
                                   weights=[word_embedding_weights],
                                   trainable=train_word_embeddings,
                                   name='WordEmbedding')
        premise_word_embedding = word_embedding(premise_word_input)
        hypothesis_word_embedding = word_embedding(hypothesis_word_input)

        premise_word_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                 decay_interval=dropout_decay_interval,
                                                 decay_rate=dropout_decay_rate,
                                                 name='PremiseWordEmbeddingDropout')(premise_word_embedding)
        hypothesis_word_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                    decay_interval=dropout_decay_interval,
                                                    decay_rate=dropout_decay_rate,
                                                    name='HypothesisWordEmbeddingDropout')(hypothesis_word_embedding)
        premise_embeddings.append(premise_word_embedding)
        hypothesis_embeddings.append(hypothesis_word_embedding)

    # 2. Character input
    if include_chars:
        premise_char_input = Input(shape=(p, chars_per_word,), name='PremiseCharInput')
        hypothesis_char_input = Input(shape=(h, chars_per_word,), name='HypothesisCharInput')
        inputs.append(premise_char_input)
        inputs.append(hypothesis_char_input)

        # Share the weights of the character-level embedding between premise and hypothesis
        character_embedding_layer = TimeDistributed(Sequential([
            Embedding(input_dim=100, output_dim=char_embedding_size, input_length=chars_per_word),
            Conv1D(filters=char_conv_filters, kernel_size=char_conv_kernel_size),
            GlobalMaxPooling1D()
        ]), name='CharEmbedding')
        character_embedding_layer.build(input_shape=(None, None, chars_per_word))
        premise_char_embedding = character_embedding_layer(premise_char_input)
        hypothesis_char_embedding = character_embedding_layer(hypothesis_char_input)
        premise_embeddings.append(premise_char_embedding)
        hypothesis_embeddings.append(hypothesis_char_embedding)

    # 3. Syntactical features
    if include_syntactical_features:
        premise_syntactical_input = Input(shape=(p, syntactical_feature_size,),
                                          name='PremiseSyntacticalInput')
        hypothesis_syntactical_input = Input(shape=(h, syntactical_feature_size,),
                                             name='HypothesisSyntacticalInput')
        inputs.append(premise_syntactical_input)
        inputs.append(hypothesis_syntactical_input)
        premise_embeddings.append(premise_syntactical_input)
        hypothesis_embeddings.append(hypothesis_syntactical_input)
    # 4. One-hot exact-match feature
    if include_exact_match:
        premise_exact_match_input = Input(shape=(p,), name='PremiseExactMatchInput')
        hypothesis_exact_match_input = Input(shape=(h,), name='HypothesisExactMatchInput')
        premise_exact_match = Reshape(target_shape=(p, 1,))(premise_exact_match_input)
        hypothesis_exact_match = Reshape(target_shape=(h, 1,))(hypothesis_exact_match_input)
        inputs.append(premise_exact_match_input)
        inputs.append(hypothesis_exact_match_input)
        premise_embeddings.append(premise_exact_match)
        hypothesis_embeddings.append(hypothesis_exact_match)

    # Concatenate all features
    premise_embedding = Concatenate(name='PremiseEmbedding')(premise_embeddings)
    hypothesis_embedding = Concatenate(name='HypothesisEmbedding')(hypothesis_embeddings)
    d = K.int_shape(hypothesis_embedding)[-1]

    '''Encoding layer'''
    # Now we have the embedded premise [p x d] along with the embedded hypothesis [h x d]
    premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
    hypothesis_encoding = Encoding(name='HypothesisEncoding')(hypothesis_embedding)

    '''Interaction layer'''
    interaction = Interaction(name='Interaction')([premise_encoding, hypothesis_encoding])

    '''Feature Extraction layer'''
    feature_extractor_input = Conv2D(filters=int(d * first_scale_down_ratio),
                                     kernel_size=1,
                                     activation=None,
                                     name='FirstScaleDown')(interaction)
    feature_extractor = DenseNet(include_top=False,
                                 input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
                                 nb_dense_block=nb_dense_blocks,
                                 nb_layers_per_block=layers_per_dense_block,
                                 compression=transition_scale_down_ratio,
                                 growth_rate=growth_rate)(feature_extractor_input)

    '''Output layer'''
    features = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                               decay_interval=dropout_decay_interval,
                               decay_rate=dropout_decay_rate,
                               name='Features')(feature_extractor)
    out = Dense(units=nb_labels, activation='softmax', name='Output')(features)

    super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)
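# A minimal usage sketch, assuming a pre-trained GloVe matrix has already been saved to
# 'glove_weights.npy' (a hypothetical path); the sequence lengths and compile settings
# below are illustrative choices, not values prescribed by the model.
import numpy as np
from keras.optimizers import Adam

glove_weights = np.load('glove_weights.npy')    # shape: (vocabulary_size, embedding_dim)

model = DIIN(p=32,                              # premise length (illustrative)
             h=32,                              # hypothesis length (illustrative)
             word_embedding_weights=glove_weights,
             chars_per_word=16,
             syntactical_feature_size=50,
             nb_labels=3)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()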
def create_interaction(self, encodings):
    interaction = Interaction(name='Interaction')(encodings)
    return DecayingDropout(initial_keep_rate=self.dropout_initial_keep_rate,
                           decay_interval=self.dropout_decay_interval,
                           decay_rate=self.dropout_decay_rate)(interaction)
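# A sketch of how the helper methods above (create_encodings, create_interaction,
# create_feature_extraction) could be chained inside a model-building method.
# The method name, the embedding arguments, the final Dense layer, and self.nb_labels
# are assumptions about the surrounding class, not code from this repository.
def create_model_graph(self, premise_embedding, hypothesis_embedding):
    encodings = self.create_encodings([premise_embedding, hypothesis_embedding])
    interaction = self.create_interaction(encodings)
    features = self.create_feature_extraction(interaction)
    return Dense(units=self.nb_labels, activation='softmax', name='Output')(features)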
def __init__(self, p=None, h=None,
             include_word_vectors=True, word_embedding_weights=None, train_word_embeddings=True,
             include_chars=True, chars_per_word=16, char_embedding_size=8,
             char_conv_filters=100, char_conv_kernel_size=5,
             include_syntactical_features=True, syntactical_feature_size=50,
             include_exact_match=True,
             dropout_initial_keep_rate=1., dropout_decay_rate=0.977, dropout_decay_interval=10000,
             first_scale_down_ratio=0.3, transition_scale_down_ratio=0.5,
             growth_rate=20, layers_per_dense_block=8, nb_dense_blocks=3,
             nb_labels=3,
             inputs=None, outputs=None, name='DIIN'):
    """
    :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-

    :param p: sequence length of premise
    :param h: sequence length of hypothesis
    :param include_word_vectors: whether or not to include word vectors in the model
    :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
    :param train_word_embeddings: whether or not to modify word embeddings while training
    :param include_chars: whether or not to include character embeddings in the model
    :param chars_per_word: how many chars there are per word (a fixed number)
    :param char_embedding_size: input size of the character-embedding layer
    :param char_conv_filters: number of conv filters applied on the character embedding
    :param char_conv_kernel_size: size of the kernel applied on the character embeddings
    :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
    :param syntactical_feature_size: size of the syntactical feature vector for each word
    :param include_exact_match: whether or not to include exact-match features in the model
    :param dropout_initial_keep_rate: initial keep rate of the decaying dropout
    :param dropout_decay_rate: how much the keep rate changes at each interval
    :param dropout_decay_interval: how many training steps to wait between dropout updates
    :param first_scale_down_ratio: first scale-down ratio in the DenseNet
    :param transition_scale_down_ratio: transition scale-down ratio in the DenseNet
    :param growth_rate: growth rate in the DenseNet
    :param layers_per_dense_block: number of layers in one dense block
    :param nb_dense_blocks: number of dense blocks in the DenseNet
    :param nb_labels: number of labels (3 by default: entailment, contradiction, neutral)
    """
    if inputs or outputs:
        super(DIIN, self).__init__(inputs=inputs, outputs=outputs, name=name)
        return

    if include_word_vectors:
        assert word_embedding_weights is not None

    inputs = []
    premise_embeddings = []
    hypothesis_embeddings = []

    '''Embedding Layer
    Converts each word or phrase into a vector representation and builds a matrix
    representation of the sentence. Pre-trained word vectors such as word2vec or GloVe
    can be used directly. To further improve results, additional lexical and syntactic
    information can be added via POS tagging, named-entity recognition, etc.
    '''
    # 1. Word embedding input
    if include_word_vectors:
        premise_word_input = Input(shape=(p,), dtype='int64', name='PremiseWordInput')
        hypothesis_word_input = Input(shape=(h,), dtype='int64', name='HypothesisWordInput')
        inputs.append(premise_word_input)
        inputs.append(hypothesis_word_input)

        word_embedding = Embedding(input_dim=word_embedding_weights.shape[0],
                                   output_dim=word_embedding_weights.shape[1],
                                   weights=[word_embedding_weights],
                                   trainable=train_word_embeddings,
                                   name='WordEmbedding')
        premise_word_embedding = word_embedding(premise_word_input)
        hypothesis_word_embedding = word_embedding(hypothesis_word_input)

        premise_word_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                 decay_interval=dropout_decay_interval,
                                                 decay_rate=dropout_decay_rate,
                                                 name='PremiseWordEmbeddingDropout')(premise_word_embedding)
        hypothesis_word_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                    decay_interval=dropout_decay_interval,
                                                    decay_rate=dropout_decay_rate,
                                                    name='HypothesisWordEmbeddingDropout')(hypothesis_word_embedding)
        premise_embeddings.append(premise_word_embedding)
        hypothesis_embeddings.append(hypothesis_word_embedding)

    # 2. Character input
    if include_chars:
        premise_char_input = Input(shape=(p,), name='PremiseCharInput')
        hypothesis_char_input = Input(shape=(h,), name='HypothesisCharInput')
        inputs.append(premise_char_input)
        inputs.append(hypothesis_char_input)

        char_embedding = Embedding(input_dim=word_embedding_weights.shape[0],
                                   output_dim=word_embedding_weights.shape[1],
                                   weights=[word_embedding_weights],
                                   trainable=train_word_embeddings,
                                   name='CharEmbedding')
        premise_char_embedding = char_embedding(premise_char_input)
        hypothesis_char_embedding = char_embedding(hypothesis_char_input)

        premise_char_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                 decay_interval=dropout_decay_interval,
                                                 decay_rate=dropout_decay_rate,
                                                 name='PremiseCharEmbeddingDropout')(premise_char_embedding)
        hypothesis_char_embedding = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                                    decay_interval=dropout_decay_interval,
                                                    decay_rate=dropout_decay_rate,
                                                    name='HypothesisCharEmbeddingDropout')(hypothesis_char_embedding)
        premise_embeddings.append(premise_char_embedding)
        hypothesis_embeddings.append(hypothesis_char_embedding)

    # 3. Syntactical features
    if include_syntactical_features:
        premise_syntactical_input = Input(shape=(p, syntactical_feature_size,),
                                          name='PremiseSyntacticalInput')
        hypothesis_syntactical_input = Input(shape=(h, syntactical_feature_size,),
                                             name='HypothesisSyntacticalInput')
        inputs.append(premise_syntactical_input)
        inputs.append(hypothesis_syntactical_input)
        premise_embeddings.append(premise_syntactical_input)
        hypothesis_embeddings.append(hypothesis_syntactical_input)
    # 4. One-hot exact-match feature
    if include_exact_match:
        premise_exact_match_input = Input(shape=(p,), name='PremiseExactMatchInput')
        hypothesis_exact_match_input = Input(shape=(h,), name='HypothesisExactMatchInput')
        premise_exact_match = Reshape(target_shape=(p, 1,))(premise_exact_match_input)
        hypothesis_exact_match = Reshape(target_shape=(h, 1,))(hypothesis_exact_match_input)
        inputs.append(premise_exact_match_input)
        inputs.append(hypothesis_exact_match_input)
        premise_embeddings.append(premise_exact_match)
        hypothesis_embeddings.append(hypothesis_exact_match)

    # Concatenate all features
    premise_embedding = Concatenate(name='PremiseEmbedding')(premise_embeddings)
    hypothesis_embedding = Concatenate(name='HypothesisEmbedding')(hypothesis_embeddings)
    d = K.int_shape(hypothesis_embedding)[-1]

    '''Encoding Layer
    Encodes the output of the Embedding Layer. Different encoders can be used here,
    e.g. BiLSTM, self-attention, etc., and they can be combined to obtain a better
    sentence representation.
    '''
    # Now we have the embedded premise [p x d] along with the embedded hypothesis [h x d]
    premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
    hypothesis_encoding = Encoding(name='HypothesisEncoding')(hypothesis_embedding)

    '''Interaction Layer
    Builds the interaction tensor between the premise and the hypothesis. The interaction
    can be modelled in several ways, e.g. cosine distance, dot product, etc.
    '''
    interaction = Interaction(name='Interaction')([premise_encoding, hypothesis_encoding])

    '''Feature Extraction layer'''
    feature_extractor_input = Conv2D(filters=int(d * first_scale_down_ratio),
                                     kernel_size=1,
                                     activation=None,
                                     name='FirstScaleDown')(interaction)
    feature_extractor_input = BatchNormalization()(feature_extractor_input)
    feature_extractor = DenseNet(include_top=False,
                                 input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
                                 nb_dense_block=nb_dense_blocks,
                                 nb_layers_per_block=layers_per_dense_block,
                                 compression=transition_scale_down_ratio,
                                 growth_rate=growth_rate)(feature_extractor_input)

    '''Output layer'''
    features = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                               decay_interval=dropout_decay_interval,
                               decay_rate=dropout_decay_rate,
                               name='Features')(feature_extractor)
    out = Dense(units=nb_labels, activation='sigmoid', name='Output')(features)

    super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)