Ejemplo n.º 1
0
 def _encode(self):
     """Encode the context and question embeddings with one shared block.

     Runs ``residual_block`` over ``self.c_emb`` and ``self.q_emb`` inside
     the ``Embedding_Encoder_Layer`` variable scope. Both calls use the
     ``Encoder_Residual_Block`` scope name and the second one passes
     ``reuse=True``, so the question pass shares the context pass's weights.

     The results are stored on ``self.c_embed_encoding`` and
     ``self.q_embed_encoding``; nothing is returned.
     """
     # Only the 4th and 6th values of _params() are used here; they are
     # forwarded as num_filters and num_heads.
     _pl, _ql, _cl, hidden, _dc, heads = self._params()
     with tf.variable_scope("Embedding_Encoder_Layer"):
         # Settings that are identical for the context and question passes.
         common = dict(
             num_blocks=1,
             num_conv_layers=4,
             kernel_size=5,
             num_filters=hidden,
             num_heads=heads,
             scope="Encoder_Residual_Block",
             bias=False,
             dropout=self.dropout,
         )
         self.c_embed_encoding = residual_block(
             self.c_emb,
             mask=self.c_mask,
             seq_len=self.c_len,
             **common)
         self.q_embed_encoding = residual_block(
             self.q_emb,
             mask=self.q_mask,
             seq_len=self.q_len,
             reuse=True,  # Share the weights between passage and question
             **common)
Ejemplo n.º 2
0
 def _encode(self):
     """Run the shared embedding encoder over context and question.

     ``self.c_emb`` and ``self.q_emb`` are each passed through
     ``residual_block`` under the ``Embedding_Encoder_Layer`` variable
     scope. The question call passes ``reuse=True`` with the same inner
     scope name, so both calls use one set of weights. Outputs are stored
     on ``self.c_embed_encoding`` and ``self.q_embed_encoding``.
     """
     _n, _pl, _ql, _cl, width, char_dim, n_heads = self._params()
     if self.config.fix_pretrained_vector:
         # Kept from the original; char_dim is not read again in this method.
         char_dim = self.char_mat.get_shape()[-1]

     def run_block(embedding, mask, seq_len, **extra):
         # One encoder pass; `extra` carries the per-call differences
         # (only `reuse` for the question pass).
         return residual_block(
             embedding,
             num_blocks=1,
             num_conv_layers=2,
             kernel_size=7,
             mask=mask,
             num_filters=width,
             num_heads=n_heads,
             seq_len=seq_len,
             scope="Encoder_Residual_Block",
             bias=False,
             dropout=self.dropout,
             **extra)

     with tf.variable_scope("Embedding_Encoder_Layer"):
         self.c_embed_encoding = run_block(
             self.c_emb, self.c_mask, self.c_len)
         # Share the weights between passage and question
         self.q_embed_encoding = run_block(
             self.q_emb, self.q_mask, self.q_len, reuse=True)
Ejemplo n.º 3
0
def style_transformer_network(inputs, style_params):
    """Build the style-transformer graph and return its final output.

    The network is three ``layers.conv2d`` calls (scope ``encode``), three
    ``layers.residual_block`` calls (scope ``residual``) and three
    ``layers.upsampling`` calls (scope ``upsample``), chained in that order
    inside the outer ``style_transformer`` variable scope. The last
    upsampling call additionally receives ``tf.nn.sigmoid``.

    Args:
        inputs: Input passed to the first ``layers.conv2d`` call.
        style_params: Forwarded unchanged to every layer helper.

    Returns:
        The value returned by the final ``layers.upsampling`` call.
    """
    # Positional arguments for each layer call, in the order the helpers
    # in `layers` expect them (numeric settings first, then the layer name).
    encode_specs = (
        (32, 9, 1, "conv1"),
        (64, 3, 2, "conv2"),
        (128, 3, 2, "conv3"),
    )
    residual_names = ("residual1", "residual2", "residual3")
    upsample_specs = (
        (3, 2, 64, 'upsample1'),
        (3, 2, 32, 'upsample2'),
    )

    with tf.variable_scope('style_transformer'):
        net = inputs

        with tf.variable_scope('encode') as encode_scope:
            for spec in encode_specs:
                net = layers.conv2d(net, *spec, encode_scope.name, style_params)

        with tf.variable_scope('residual') as residual_scope:
            for block_name in residual_names:
                net = layers.residual_block(net, 3, block_name, residual_scope.name, style_params)

        with tf.variable_scope('upsample') as upsample_scope:
            for spec in upsample_specs:
                net = layers.upsampling(net, *spec, upsample_scope.name, style_params)
            # The last stage is the only one given an extra trailing argument.
            return layers.upsampling(net, 9, 2, 3, 'upsample3', upsample_scope.name, style_params, tf.nn.sigmoid)
Ejemplo n.º 4
0
 def discriminator(sequence,training=tf.constant(True)):
     """Build the discriminator graph and return its single-unit output.

     Pipeline: ``lyr.conv`` -> leaky ReLU -> three
     (``lyr.residual_block`` + ``lyr.layernorm``) stages named ``res1``,
     ``res4`` and ``res5`` -> flatten to ``(batch, max_size*64)`` ->
     ``lyr.dense`` with one output unit.

     Args:
         sequence: Input tensor; its leading dimension is read at graph
             run time via ``tf.shape`` and used as the batch size.
         training: Accepted for interface compatibility; not used in the body.

     Returns:
         The result of the final ``lyr.dense`` call.
     """
     batch = tf.shape(sequence)[0]

     net = lyr.conv('discriminator.conv1.filter', 'discriminator.conv1.bias',
                    'discriminator', (5, encode_length, 64), sequence, max_size)
     net = tf.nn.leaky_relu(net)

     # NOTE(review): each residual block receives '<...>.bias1' for BOTH of
     # its bias-name arguments (the sibling generator uses bias1/bias2).
     # Kept as-is; confirm against lyr.residual_block whether this is intended.
     for tag in ('res1', 'res4', 'res5'):
         stem = 'discriminator.' + tag
         net = lyr.residual_block(stem + '.filter1', stem + '.bias1',
                                  stem + '.filter2', stem + '.bias1',
                                  'discriminator', 64, 64, net, max_size)
         net = lyr.layernorm(net, batch)

     flat = tf.reshape(net, (batch, max_size*64))

     return lyr.dense('discriminator.dense1.matrix', 'discriminator.dense1.bias',
                      'discriminator', max_size*64, 1, flat)
Ejemplo n.º 5
0
    def discriminator(sequence):
        """Build the discriminator graph and return its single-unit output.

        Pipeline: ``lyr.conv`` -> leaky ReLU -> five
        (``lyr.residual_block`` + ``lyr.layernorm``) stages named ``res1``
        to ``res5`` -> reshape to ``(batch_size, max_size * 64)`` ->
        ``lyr.dense`` with one output unit. ``batch_size``, ``max_size`` and
        ``encode_length`` come from the enclosing scope.

        Args:
            sequence: Input tensor handed to the first convolution.

        Returns:
            The result of the final ``lyr.dense`` call.
        """
        net = tf.nn.leaky_relu(
            lyr.conv('discriminator.conv1.filter', 'discriminator.conv1.bias',
                     'discriminator', (5, encode_length, 64), sequence,
                     max_size))

        # NOTE(review): both bias-name arguments of every residual block are
        # '<...>.bias1'; kept unchanged -- confirm against lyr.residual_block.
        for index in range(1, 6):
            stem = 'discriminator.res%d' % index
            net = lyr.residual_block(stem + '.filter1', stem + '.bias1',
                                     stem + '.filter2', stem + '.bias1',
                                     'discriminator', 64, 64, net, max_size)
            net = lyr.layernorm(net, batch_size)

        flattened = tf.reshape(net, (batch_size, max_size * 64))

        return lyr.dense('discriminator.dense1.matrix',
                         'discriminator.dense1.bias', 'discriminator',
                         max_size * 64, 1, flattened)
Ejemplo n.º 6
0
    def _fuse(self):
        """Fuse context and question encodings, then run the model encoder.

        Step 1 (``Context_to_Query_Attention_Layer``): builds a similarity
        tensor with ``trilinear`` from the tiled encodings, takes a masked
        softmax over its last axis and over axis 1, and derives ``self.c2q``
        and ``self.q2c``. ``self.attention_outputs`` collects the four
        tensors that feed the model encoder.

        Step 2 (``Model_Encoder_Layer``): concatenates the attention outputs,
        projects them with ``conv`` and applies ``residual_block`` three
        times with shared weights (``reuse`` is set from the second pass on).
        ``self.enc`` ends up holding the projection plus the three block
        outputs, each reshaped to ``[N, -1, last_dim]``.
        """
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            ctx_enc = self.c_embed_encoding
            qry_enc = self.q_embed_encoding

            # Tile both encodings so they can be combined element-wise.
            tiled_ctx = tf.tile(tf.expand_dims(ctx_enc, 2),
                                [1, 1, self.max_q_len, 1])
            tiled_qry = tf.tile(tf.expand_dims(qry_enc, 1),
                                [1, self.max_p_len, 1, 1])
            similarity = trilinear([tiled_ctx, tiled_qry, tiled_ctx * tiled_qry],
                                   input_keep_prob=1.0 - self.dropout)

            # Masked softmax over the last axis (question mask) ...
            question_mask = tf.expand_dims(self.q_mask, 1)
            attn_over_q = tf.nn.softmax(
                mask_logits(similarity, mask=question_mask))
            # ... and over axis 1 (context mask), transposed afterwards.
            context_mask = tf.expand_dims(self.c_mask, 2)
            attn_over_c = tf.nn.softmax(
                mask_logits(similarity, mask=context_mask), dim=1)
            attn_over_c_t = tf.transpose(attn_over_c, (0, 2, 1))

            self.c2q = tf.matmul(attn_over_q, qry_enc)
            self.q2c = tf.matmul(tf.matmul(attn_over_q, attn_over_c_t),
                                 ctx_enc)
            self.attention_outputs = [
                ctx_enc,
                self.c2q,
                ctx_enc * self.c2q,
                ctx_enc * self.q2c,
            ]

        # _params() yields, in order: the batch size (config.batch_size, or 1
        # in demo mode), max_p_len, max_q_len, config.max_ch_len,
        # config.hidden_size, config.char_embed_size and config.head_size.
        batch, _pl, _ql, _cl, width, char_dim, n_heads = self._params()
        if self.config.fix_pretrained_vector:
            # Kept from the original; char_dim is not read again below.
            char_dim = self.char_mat.get_shape()[-1]

        with tf.variable_scope("Model_Encoder_Layer"):
            fused = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(fused, width, name="input_projection")]

            for step in range(3):
                if step % 2 == 0:
                    # Dropout on the input of every second pass.
                    self.enc[step] = tf.nn.dropout(self.enc[step],
                                                   1.0 - self.dropout)
                # First pass creates the variables, later passes reuse them.
                if step > 0:
                    reuse_flag = True
                else:
                    reuse_flag = None
                block_output = residual_block(self.enc[step],
                                              num_blocks=1,
                                              num_conv_layers=2,
                                              kernel_size=5,
                                              mask=self.c_mask,
                                              num_filters=width,
                                              num_heads=n_heads,
                                              seq_len=self.c_len,
                                              scope="Model_Encoder",
                                              bias=True,
                                              reuse=reuse_flag,
                                              dropout=self.dropout)
                self.enc.append(block_output)

            # Pin the leading dimension of every stored tensor to `batch`.
            for position, tensor in enumerate(self.enc):
                self.enc[position] = tf.reshape(
                    tensor, [batch, -1, tensor.get_shape()[-1]])
Ejemplo n.º 7
0
 def predictor(sequence):
     """Build the predictor graph and return its ``num_classes`` outputs.

     Pipeline: ``lyr.conv`` -> leaky ReLU -> one ``lyr.residual_block`` ->
     flatten to ``(batch, max_size*16)`` -> ``lyr.dense``.

     Args:
         sequence: Input tensor; its leading dimension is read via
             ``tf.shape`` and used as the batch size for the flatten step.

     Returns:
         The result of the final ``lyr.dense`` call.
     """
     batch = tf.shape(sequence)[0]

     features = lyr.conv('predictor.conv1.filter', 'predictor.conv1.bias',
                         'predictor', (5, encode_length, 16), sequence, max_size)
     features = tf.nn.leaky_relu(features)

     # NOTE(review): 'predictor.res1.bias1' is passed for both bias-name
     # arguments; kept unchanged -- confirm against lyr.residual_block.
     features = lyr.residual_block('predictor.res1.filter1',
                                   'predictor.res1.bias1',
                                   'predictor.res1.filter2',
                                   'predictor.res1.bias1',
                                   'predictor', 16, 16, features, max_size,
                                   channels=16)

     flat = tf.reshape(features, (batch, max_size*16))

     return lyr.dense('predictor.dense1.matrix', 'predictor.dense1.bias',
                      'predictor', max_size*16, num_classes, flat)
Ejemplo n.º 8
0
    def generator(seed, training=True):
        """Build the generator graph and return softmax-normalised output.

        Pipeline: reshape the seed to ``(batch_size, 100)`` -> ``lyr.dense``
        -> leaky ReLU -> reshape to ``[batch_size, max_size, 64]`` -> five
        ``lyr.residual_block`` stages (``res1`` to ``res5``) -> ``lyr.conv``
        -> ``tf.nn.softmax``.

        Args:
            seed: Tensor reshaped to ``(batch_size, 100)`` before use.
            training: Accepted for interface compatibility; not used in the
                body.

        Returns:
            The softmax of the final convolution's output.
        """
        flat_seed = tf.reshape(seed, (batch_size, 100))

        hidden = lyr.dense('generator.dense1.matrix', 'generator.dense1.bias',
                           'generator', 100, max_size * 64, flat_seed)
        hidden = tf.nn.leaky_relu(hidden)
        net = tf.reshape(hidden, [batch_size, max_size, 64])

        for index in range(1, 6):
            stem = 'generator.res%d' % index
            net = lyr.residual_block(stem + '.filter1', stem + '.bias1',
                                     stem + '.filter2', stem + '.bias2',
                                     'generator', 64, 64, net, max_size)

        logits = lyr.conv('generator.conv1.filter', 'generator.conv1.bias',
                          'generator', (5, 64, encode_length), net, max_size)
        return tf.nn.softmax(logits)
Ejemplo n.º 9
0
    def predictor_stem(sequence):
        """Build the stem predictor graph and return ``num_classes`` outputs.

        Pipeline: ``lyr.conv`` -> leaky ReLU -> five ``lyr.residual_block``
        stages (``res1`` to ``res5``) -> reshape to
        ``(batch_size, stem_size * 64)`` -> ``lyr.dense``.

        Args:
            sequence: Input tensor handed to the first convolution.

        Returns:
            The result of the final ``lyr.dense`` call.
        """
        net = tf.nn.leaky_relu(
            lyr.conv('predictor_stem.conv1.filter',
                     'predictor_stem.conv1.bias', 'predictor_stem',
                     (5, encode_length, 64), sequence, stem_size))

        # NOTE(review): both bias-name arguments of every residual block are
        # '<...>.bias1'; kept unchanged -- confirm against lyr.residual_block.
        for index in range(1, 6):
            stem = 'predictor_stem.res%d' % index
            net = lyr.residual_block(stem + '.filter1', stem + '.bias1',
                                     stem + '.filter2', stem + '.bias1',
                                     'predictor_stem', 64, 64, net, stem_size)

        flattened = tf.reshape(net, (batch_size, stem_size * 64))

        return lyr.dense('predictor_stem.dense1.matrix',
                         'predictor_stem.dense1.bias', 'predictor_stem',
                         stem_size * 64, num_classes, flattened)
Ejemplo n.º 10
0
    def generator(seed,training=tf.constant(True)):
        """Build the batch-normalised generator graph.

        Pipeline: reshape the seed to ``(batch, 100)`` -> ``lyr.dense`` ->
        leaky ReLU -> ``lyr.batchnorm`` (``batchnorm1``) -> reshape to
        ``[batch, max_size, 64]`` -> three (``lyr.residual_block`` +
        ``lyr.batchnorm``) stages -> ``lyr.conv`` -> ``tf.nn.softmax``.

        Args:
            seed: Tensor whose leading dimension is read via ``tf.shape``
                and used as the batch size.
            training: Forwarded to every ``lyr.batchnorm`` call.

        Returns:
            The softmax of the final convolution's output.
        """
        batch = tf.shape(seed)[0]

        def batch_norm(tensor, number, shape):
            # All batchnorm layers share 'generator.num_means' and differ
            # only in their numbered variable-name prefix and shape.
            stem = 'generator.batchnorm%d' % number
            return lyr.batchnorm(tensor,
                                 stem + '.offset',
                                 stem + '.scale',
                                 stem + '.average_means',
                                 stem + '.average_variances',
                                 'generator.num_means',
                                 'generator',
                                 shape,
                                 training=training)

        flat_seed = tf.reshape(seed, (batch, 100))

        hidden = lyr.dense('generator.dense1.matrix', 'generator.dense1.bias',
                           'generator', 100, max_size*64, flat_seed)
        hidden = tf.nn.leaky_relu(hidden)
        hidden = batch_norm(hidden, 1, (max_size*64,))

        net = tf.reshape(hidden, [batch, max_size, 64])

        # Residual block k is followed by batchnorm k + 1.
        for index in range(1, 4):
            stem = 'generator.res%d' % index
            net = lyr.residual_block(stem + '.filter1', stem + '.bias1',
                                     stem + '.filter2', stem + '.bias2',
                                     'generator', 64, 64, net, max_size)
            net = batch_norm(net, index + 1, (max_size, 64))

        logits = lyr.conv('generator.conv1.filter', 'generator.conv1.bias',
                          'generator', (5, 64, encode_length), net, max_size)
        return tf.nn.softmax(logits)
Ejemplo n.º 11
0
 def encoder(sequence,training=True):
     """Build the encoder graph and return ``2*latent_dim`` features per row.

     Pipeline: ``lyr.conv`` -> leaky ReLU -> ``lyr.batchnorm`` ->
     ``lyr.residual_block`` -> ``lyr.batchnorm`` -> flatten to
     ``(batch, max_size*args.channels)`` -> ``lyr.dense`` -> leaky ReLU ->
     ``lyr.batchnorm``.

     Args:
         sequence: Input tensor; its leading dimension is read via
             ``tf.shape`` and used as the batch size for the flatten step.
         training: Forwarded to every ``lyr.batchnorm`` call.

     Returns:
         The output of the final ``lyr.batchnorm`` call.
     """
     num = tf.shape(sequence)[0]

     x = lyr.conv('encoder.conv1.filter','encoder.conv1.bias','encoder',(5,encode_length,args.channels),sequence,max_size)
     x = tf.nn.leaky_relu(x)
     x = lyr.batchnorm(x,'encoder.batchnorm1.offset','encoder.batchnorm1.scale','encoder.batchnorm1.average_means','encoder.batchnorm1.average_variances','encoder.num_means','encoder',(max_size,args.channels),training=training)

     # NOTE(review): 'encoder.res1.bias1' is passed for both bias-name
     # arguments; kept unchanged -- confirm against lyr.residual_block.
     x = lyr.residual_block('encoder.res1.filter1','encoder.res1.bias1','encoder.res1.filter2','encoder.res1.bias1','encoder',args.channels,args.channels,x,max_size,channels=args.channels)
     x = lyr.batchnorm(x,'encoder.batchnorm2.offset','encoder.batchnorm2.scale','encoder.batchnorm2.average_means','encoder.batchnorm2.average_variances','encoder.num_means','encoder',(max_size,args.channels),training=training)

     x = tf.reshape(x,(num,max_size*args.channels))

     x = lyr.dense('encoder.dense1.matrix','encoder.dense1.bias','encoder',max_size*args.channels,2*latent_dim,x)
     x = tf.nn.leaky_relu(x)
     # The shape argument must be a one-element tuple: `(2*latent_dim)` is
     # just a parenthesised int, unlike the tuples passed to the other
     # batchnorm calls.
     output = lyr.batchnorm(x,'encoder.batchnorm3.offset','encoder.batchnorm3.scale','encoder.batchnorm3.average_means','encoder.batchnorm3.average_variances','encoder.num_means','encoder',(2*latent_dim,),training=training)

     return output
Ejemplo n.º 12
0
    def _fuse(self):
        """Fuse context and question encodings, then run the model encoder.

        Step 1 (``Context_to_Query_Attention_Layer``): builds a similarity
        tensor with ``trilinear`` from the tiled encodings, takes a masked
        softmax over its last axis and over axis 1, and derives ``self.c2q``
        and ``self.q2c``. ``self.attention_outputs`` collects the four
        tensors that feed the model encoder.

        Step 2 (``Model_Encoder_Layer``): concatenates the attention outputs,
        projects them with ``conv`` and applies ``residual_block`` three
        times with shared weights (``reuse`` is set from the second pass on).
        ``self.enc`` ends up holding the projection plus the three block
        outputs.
        """
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            ctx_enc = self.c_embed_encoding
            qry_enc = self.q_embed_encoding

            # Tile both encodings so they can be combined element-wise.
            tiled_ctx = tf.tile(tf.expand_dims(ctx_enc, 2),
                                [1, 1, self.max_q_len, 1])
            tiled_qry = tf.tile(tf.expand_dims(qry_enc, 1),
                                [1, self.max_p_len, 1, 1])
            similarity = trilinear([tiled_ctx, tiled_qry, tiled_ctx * tiled_qry],
                                   input_keep_prob=1.0 - self.dropout)

            # Masked softmax over the last axis (question mask) ...
            question_mask = tf.expand_dims(self.q_mask, 1)
            attn_over_q = tf.nn.softmax(
                mask_logits(similarity, mask=question_mask))
            # ... and over axis 1 (context mask), transposed afterwards.
            context_mask = tf.expand_dims(self.c_mask, 2)
            attn_over_c = tf.nn.softmax(
                mask_logits(similarity, mask=context_mask), dim=1)
            attn_over_c_t = tf.transpose(attn_over_c, (0, 2, 1))

            self.c2q = tf.matmul(attn_over_q, qry_enc)
            self.q2c = tf.matmul(tf.matmul(attn_over_q, attn_over_c_t),
                                 ctx_enc)
            self.attention_outputs = [
                ctx_enc,
                self.c2q,
                ctx_enc * self.c2q,
                ctx_enc * self.q2c,
            ]

        # Only the 4th and 6th values of _params() are used below; they are
        # forwarded as num_filters and num_heads.
        _pl, _ql, _cl, width, _dc, n_heads = self._params()
        with tf.variable_scope("Model_Encoder_Layer"):
            fused = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(fused, width, name="input_projection")]

            for step in range(3):
                if step % 2 == 0:
                    # Dropout on the input of every second pass.
                    self.enc[step] = tf.nn.dropout(self.enc[step],
                                                   1.0 - self.dropout)
                # First pass creates the variables, later passes reuse them.
                if step > 0:
                    reuse_flag = True
                else:
                    reuse_flag = None
                block_output = residual_block(self.enc[step],
                                              num_blocks=7,
                                              num_conv_layers=2,
                                              kernel_size=3,
                                              mask=self.c_mask,
                                              num_filters=width,
                                              num_heads=n_heads,
                                              seq_len=self.c_len,
                                              scope="Model_Encoder",
                                              bias=False,
                                              reuse=reuse_flag,
                                              dropout=self.dropout)
                self.enc.append(block_output)
Ejemplo n.º 13
0
    def forward(self):
        """Build the full model graph: encoder, attention, decoder and losses.

        Stages, each in its own variable scope:

        1. ``Input_Embedding_Layer``: character and word embeddings are
           looked up, combined and passed through ``highway``.
        2. ``Embedding_Encoder_Layer``: one shared ``residual_block`` over
           context and question.
        3. ``Context_to_Query_Attention_Layer``: sets ``self.c2q`` and
           ``self.q2c``.
        4. ``Model_Encoder_Layer``: fills ``self.enc`` with the input
           projection plus three shared-weight encoder passes.
        5. ``Decoder_Layer``: an attention decoder unrolled over the answer
           tokens; when ``self.loop_function`` is set it is called every
           step (beam-search style). Sets ``self.gen_loss`` and
           ``self.symbols``.
        6. ``Output_Layer``: start/end pointer logits; sets ``self.logits``,
           ``self.yp1`` and ``self.yp2``.

        The final ``self.loss`` is ``self.gen_loss`` plus, when configured,
        the L2 regularisation term. When ``config.decay`` is set, an
        exponential moving average is attached (``self.var_ema``,
        ``self.assign_vars``). Nothing is returned.
        """
        config = self.config
        # N: batch size (test batch size when a loop function is installed);
        # PL / QL: context / question max lengths; CL: char limit; d: hidden
        # size; dc: char embedding dim; nh: attention heads; dw: GloVe dim.
        N, PL, QL, CL, d, dc, nh, dw = config.test_batch_size if self.loop_function else config.batch_size, self.c_maxlen, self.q_maxlen, \
                                config.char_limit, config.hidden, config.char_dim, config.num_heads, config.glove_dim

        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embeddings, flattened so every word is one row.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # Character embeddings get half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            # The second call reuses the "char_conv" variables of the first.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over axis 1 (presumably the character axis -- this assumes
            # conv keeps the axis layout; TODO confirm).
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            # NOTE(review): the last dimension is taken from ch_emb, not
            # qh_emb; both come from the same conv with `d` filters, so the
            # values presumably match -- confirm.
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            # Word embeddings with full dropout.
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word and character features along the last axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Shared highway network (second call reuses the variables).
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=2,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=2,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # The three commented lines below are the tiled trilinear
            # formulation that optimized_trilinear_for_attention replaces.
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Masked softmax over the last axis (question mask) ...
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # ... and over axis 1 (context mask), transposed afterwards.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            # self.enc[0] is the projection; enc[1..3] are the encoder passes.
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # The first pass creates the variables; later passes reuse them.
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=2,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Decoder_Layer"):
            # Attention memory: the three encoder passes joined on the last axis.
            memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]],
                               axis=-1)
            # One decoder input per answer position.
            oups = tf.split(self.a, [1] * self.a_maxlen, 1)
            # Initial hidden / cell state from the mean of the memory over axis 1.
            h = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="h_initial"))
            # NOTE: from here on `c` is the decoder cell state; it rebinds the
            # name that held the context encoding above.
            c = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="c_initial"))
            state = (c, h)
            outputs = []
            prev = None
            prev_probs = [0.0]
            symbols = []
            for i, inp in enumerate(oups):
                # Default decoder input: embedding of the i-th answer token.
                einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp),
                                  [N, dw])
                if i > 0:
                    # Every step after the first shares the decoder variables.
                    tf.get_variable_scope().reuse_variables()

                if self.loop_function is not None and prev is not None:
                    # The loop function replaces the input with one derived
                    # from the previous output, and returns `index`, which is
                    # used to re-gather state and history below.
                    with tf.variable_scope("loop_function", reuse=True):
                        einp, prev_probs, index, prev_symbol = self.loop_function(
                            prev, prev_probs, self.beam_size, i)
                        h = tf.gather(h, index)  # update prev state
                        state = tuple(tf.gather(s, index)
                                      for s in state)  # update prev state
                        for j, symbol in enumerate(symbols):
                            symbols[j] = tf.gather(
                                symbol, index)  # update prev symbols
                        for j, output in enumerate(outputs):
                            outputs[j] = tf.gather(
                                output, index)  # update prev outputs
                        symbols.append(prev_symbol)

                # Attend over the memory, using the hidden state as the query.
                attn = tf.reshape(
                    multihead_attention(tf.expand_dims(h, 1),
                                        units=d,
                                        num_heads=nh,
                                        memory=memory,
                                        mask=self.c_mask,
                                        bias=False), [-1, nh * d])

                # Cell input is the token embedding joined with the attention.
                cinp = tf.concat([einp, attn], 1)
                h, state = self.cell(cinp, state)

                with tf.variable_scope("AttnOutputProjection"):
                    output = _linear([h] + [cinp],
                                     output_size=dw * 2,
                                     bias=False,
                                     scope="output")
                    output = tf.reshape(output, [-1, dw, 2])
                    output = tf.reduce_max(output, 2)  # maxout
                    outputs.append(output)

                if self.loop_function is not None:
                    prev = output

            if self.loop_function is not None:
                # process the last symbol
                # `i` still holds the final loop index, hence `i + 1`.
                einp, prev_probs, index, prev_symbol = self.loop_function(
                    prev, prev_probs, self.beam_size, i + 1)
                for j, symbol in enumerate(symbols):
                    symbols[j] = tf.gather(symbol,
                                           index)  # update prev symbols
                for j, output in enumerate(outputs):
                    outputs[j] = tf.gather(output,
                                           index)  # update prev outputs
                symbols.append(prev_symbol)

                # output the final best result of beam search
                # Row 0 is kept; outputs regain a leading axis of size 1.
                for k, symbol in enumerate(symbols):
                    symbols[k] = tf.gather(symbol, 0)
                for k, output in enumerate(outputs):
                    outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

            self.gen_loss = self._compute_loss(outputs, oups, N)
            self.symbols = symbols

        with tf.variable_scope("Output_Layer"):
            # Start pointer uses encoder passes 1 and 2; end pointer uses 1 and 3.
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            # Outer product of start and end probabilities, restricted to
            # spans with start <= end <= start + config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # NOTE(review): this overwrites the span loss computed just above, so
        # only the generation loss is optimised -- confirm this is intended.
        self.loss = self.gen_loss

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            # `regularizer` is not defined in this method; it must come from
            # the enclosing module.
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating self.loss now also runs the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # Ops that overwrite each variable with its moving average.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # NOTE(review): relies on the truthiness of the returned
                    # object; `v is not None` would state the intent
                    # explicitly -- confirm before changing.
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 14
0
Archivo: model.py Proyecto: txye/QANet
    def forward(self):
        """Build the question-answering graph and its training loss.

        Stacks, in order: character + word input embedding followed by a
        highway network, an embedding encoder shared by passage and question,
        context-query attention, three passes through a shared model encoder,
        and start/end pointer layers.

        Reads the input tensors and settings already stored on ``self``
        (``c``, ``q``, ``ch``, ``qh``, ``c_mask``, ``q_mask``, ``c_len``,
        ``q_len``, ``y1``, ``y2``, ``dropout``, ``char_mat``, ``word_mat``,
        ``config``, ``demo``) and sets ``self.c2q``, ``self.q2c``,
        ``self.enc``, ``self.logits``, ``self.yp1``, ``self.yp2`` and
        ``self.loss``.  When ``config.decay`` is set it also sets
        ``self.var_ema`` and ``self.assign_vars``.
        """
        config = self.config
        # Demo mode uses a batch dimension of 1 instead of config.batch_size.
        N = config.batch_size if not self.demo else 1
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                         config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.qh),
                [N * QL, CL, dc])
            # Character embeddings get half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=True)

            # Max over the character axis leaves one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.q_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,  # Share the weights between passage and question
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention(
                [c, q], self.c_maxlen, self.q_maxlen,
                input_keep_prob=1.0 - self.dropout)
            # Softmax over the last axis with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the context mask applied, transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # The first pass creates the "Model_Encoder" variables; the
                # later passes reuse them.
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1, bias=False, name="end_pointer"), -1)
            self.logits = [mask_logits(start_logits, mask=self.c_mask),
                           mask_logits(end_logits, mask=self.c_mask)]

            logits1, logits2 = self.logits

            # outer[b, i, j] = P(start=i) * P(end=j); the band keeps only
            # pairs with i <= j <= i + config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # `regularizer` is a module-level name defined outside this method.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating self.loss now also runs the moving-average update.
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for variables the EMA does not
                    # track; test for None explicitly instead of relying on
                    # the truthiness of a variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 15
0
    def pred(self):
        """Build the prediction graph from the next batch of ``self.iterator``.

        Pulls ``(questions, lengths), (contexts, lengths), answers`` from the
        iterator, embeds both sequences with ``self.embedding``, encodes them
        with a shared residual block, applies context-query attention, runs
        three passes through a shared model encoder and produces start/end
        pointer outputs.

        Sets ``self.questions``, ``self.contexts``, ``self.answers``,
        ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.pred_start``,
        ``self.pred_end`` and ``self.preds``.  Static shapes of the
        intermediate tensors are printed while the graph is built.
        """
        with tf.variable_scope("embedding_layer"):
            question_part, context_part, answers = self.iterator.get_next()
            self.questions, q_lens = question_part
            self.contexts, c_lens = context_part
            self.answers = answers

            # Pad to the longest sequence of the current batch.
            longest_context = tf.reduce_max(c_lens)
            longest_question = tf.reduce_max(q_lens)

            c_mask = tf.sequence_mask(c_lens, maxlen=longest_context)
            q_mask = tf.sequence_mask(q_lens, maxlen=longest_question)

            q_vectors = tf.nn.embedding_lookup(self.embedding, self.questions)
            c_vectors = tf.nn.embedding_lookup(self.embedding, self.contexts)
            print('question_embeddings', q_vectors.get_shape().as_list())
            print('context_embeddings', c_vectors.get_shape().as_list())

        with tf.variable_scope("embedding_layer"):
            # Settings common to the context and the question encoder calls.
            encoder_args = dict(num_blocks=1,
                                num_conv_layers=1,
                                kernel_size=7,
                                num_filters=self.lstm_hidden_size,
                                num_heads=1,
                                scope="Encoder_Residual_Block",
                                bias=False)
            context_enc = residual_block(c_vectors,
                                         mask=c_mask,
                                         seq_len=longest_context,
                                         dropout=1.0 - self.keep_prob,
                                         **encoder_args)
            print('c', context_enc.get_shape().as_list())
            # reuse=True: the question goes through the same weights as the
            # passage.
            question_enc = residual_block(question_embeddings := q_vectors,
                                          mask=q_mask,
                                          seq_len=longest_question,
                                          reuse=True,
                                          dropout=1.0 - self.keep_prob,
                                          **encoder_args) if False else \
                residual_block(q_vectors,
                               mask=q_mask,
                               seq_len=longest_question,
                               reuse=True,
                               dropout=1.0 - self.keep_prob,
                               **encoder_args)

            print('q', question_enc.get_shape().as_list())

        with tf.variable_scope("attention_layer"):
            similarity = optimized_trilinear_for_attention(
                [context_enc, question_enc],
                longest_context,
                longest_question,
                input_keep_prob=self.keep_prob)
            q_mask_row = tf.expand_dims(q_mask, 1)
            attn_over_q = tf.nn.softmax(
                mask_logits(similarity, mask=q_mask_row))
            c_mask_col = tf.expand_dims(c_mask, 2)
            attn_over_c = tf.nn.softmax(
                mask_logits(similarity, mask=c_mask_col), dim=1)
            attn_over_c_t = tf.transpose(attn_over_c, (0, 2, 1))
            self.c2q = tf.matmul(attn_over_q, question_enc)
            self.q2c = tf.matmul(tf.matmul(attn_over_q, attn_over_c_t),
                                 context_enc)
            fused = [
                context_enc,
                self.c2q,
                context_enc * self.c2q,
                context_enc * self.q2c,
            ]

        with tf.variable_scope("modeling_layer"):
            projected = conv(tf.concat(fused, axis=-1),
                             self.lstm_hidden_size,
                             name="input_projection")
            self.enc = [projected]
            # Settings common to the three model-encoder passes.
            model_args = dict(num_blocks=1,
                              num_conv_layers=1,
                              kernel_size=5,
                              mask=c_mask,
                              num_filters=self.lstm_hidden_size,
                              num_heads=1,
                              seq_len=longest_context,
                              scope="Model_Encoder",
                              bias=False)
            for i in range(3):
                if not i % 2:  # dropout on every second block input
                    self.enc[i] = tf.nn.dropout(self.enc[i], self.keep_prob)
                # The first pass creates the variables, later passes reuse
                # them.
                share = True if i > 0 else None
                self.enc.append(
                    residual_block(self.enc[i],
                                   reuse=share,
                                   dropout=1.0 - self.keep_prob,
                                   **model_args))
                print('self.enc[i]', self.enc[i].get_shape().as_list())

        with tf.variable_scope("output_layer_start"):
            start_features = tf.concat([self.enc[1], self.enc[2]], axis=-1)
            start_scores = tf.squeeze(
                conv(start_features, 1, bias=False, name="start_pointer"), -1)
            print('pred_start', start_scores.get_shape().as_list())
            self.pred_start = preprocess_softmax(start_scores, c_mask)
            print('self.pred_start', self.pred_start.get_shape().as_list())

        with tf.variable_scope("output_layer_end"):
            end_features = tf.concat([self.enc[1], self.enc[3]], axis=-1)
            end_scores = tf.squeeze(
                conv(end_features, 1, bias=False, name="end_pointer"), -1)
            print('pred_end', end_scores.get_shape().as_list())
            self.pred_end = preprocess_softmax(end_scores, c_mask)
            print('self.pred_end', self.pred_end.get_shape().as_list())

            # One (start, end) index pair per example.
            best_start = tf.argmax(self.pred_start, axis=1)
            best_end = tf.argmax(self.pred_end, axis=1)
            self.preds = tf.transpose([best_start, best_end])
Ejemplo n.º 16
0
    def forward(self):
        """Build the QA graph with an RNN pass after each encoder block.

        Same layer stack as a plain conv/attention encoder model (input
        embedding + highway, shared embedding encoder, context-query
        attention, three shared model-encoder passes, start/end pointers),
        except that every ``residual_block`` output is additionally passed
        through ``drnn`` with one shared ``BasicLSTMCell`` instance.

        NOTE(review): ``drnn`` is a project helper not visible here; this
        method only assumes its result exposes ``.shape`` and can be fed to
        the downstream matmul/concat ops -- confirm against its definition.

        Reads the input tensors and settings stored on ``self`` and sets
        ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
        ``self.yp1``, ``self.yp2`` and ``self.loss``; when ``config.decay`` is
        set it also sets ``self.var_ema`` and ``self.assign_vars``.  Debug
        information is printed while the graph is built; the print calls are
        written so they emit the same text under Python 2 and Python 3.
        """
        config = self.config
        # Demo mode uses a batch dimension of 1 instead of config.batch_size.
        N = config.batch_size if not self.demo else 1
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                         config.num_heads)
        # Single cell object handed to every drnn() call below.
        d_cell = tf.contrib.rnn.BasicLSTMCell(d,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # Character embeddings get half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over the character axis leaves one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            print("ch_emb before %s" % (ch_emb.shape[-1],))
            print("qh_emb before %s" % (qh_emb.shape[-1],))

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
            print("N %s PL %s QL %s" % (N, PL, QL))
            print("ch_emb %s" % (ch_emb.shape,))
            print("qh_emb %s" % (qh_emb.shape,))
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            print("c_emb high %s" % (c_emb.shape,))
            print("q_emb high %s" % (q_emb.shape,))

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c_tmp = residual_block(c_emb,
                                   num_blocks=1,
                                   num_conv_layers=4,
                                   kernel_size=7,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Encoder_Residual_Block",
                                   bias=False,
                                   dropout=self.dropout)
            c = drnn(d_cell, c_tmp, d)

            q_tmp = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            q = drnn(d_cell, q_tmp, d)
            print("embd enc output c %s" % (c.shape,))
            print("embd enc output q %s" % (q.shape,))

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the context mask applied, transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            print("enc len %s" % (len(self.enc),))
            print("qh shape %s" % (self.qh.shape,))
            print("qh type %s" % (self.qh.dtype,))
            print("ip shape %s" % (inputs.shape,))
            print("ip type %s" % (inputs.dtype,))
            # Debug only: per-position count of non-zero features; the result
            # is printed and not used by the model.
            ip_len = tf.reshape(
                tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32),
                              axis=2), [-1])
            print("ip_len %s" % (ip_len.shape,))

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # The first pass creates the "Model_Encoder" variables; the
                # later passes reuse them.
                self.enc.append(
                    drnn(
                        d_cell,
                        residual_block(self.enc[i],
                                       num_blocks=7,
                                       num_conv_layers=2,
                                       kernel_size=5,
                                       mask=self.c_mask,
                                       num_filters=d,
                                       num_heads=nh,
                                       seq_len=self.c_len,
                                       scope="Model_Encoder",
                                       bias=False,
                                       reuse=True if i > 0 else None,
                                       dropout=self.dropout), d))
            print("chalala")

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # outer[b, i, j] = P(start=i) * P(end=j); the band keeps only
            # pairs with i <= j <= i + config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # `regularizer` is a module-level name defined outside this method.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating self.loss now also runs the moving-average update.
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for variables the EMA does not
                    # track; test for None explicitly instead of relying on
                    # the truthiness of a variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 17
0
    def build_model(self):
        """Assemble the QA graph, including an extra leading bias logit.

        Layers, in order: character + word embedding (optionally concatenated
        with ``self.cont_elmo`` / ``self.ques_elmo`` when ``self.use_elmo`` is
        set) followed by a highway network, a shared embedding encoder,
        context-query attention, three passes through a shared model encoder,
        and start/end pointer layers.

        A learned scalar ``unanswer_bias`` is prepended as column 0 of both
        logit vectors; the predicted indices are shifted by -1 so that column
        0 maps to -1 (presumably the "no answer" choice -- confirm with the
        caller).

        Sets ``self.logits1``, ``self.logits2``, ``self.loss``,
        ``self.output1``, ``self.output2`` and, when ``self.decay`` is set,
        ``self.var_ema`` and ``self.assign_vars``.
        """
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d = self.char_limit, self.filters
        dc, nh = self.char_dim, self.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ctx_chars = tf.nn.embedding_lookup(self.char_mat, self.contc_input)
            ctx_chars = tf.reshape(ctx_chars, [-1, CL, dc])
            ques_chars = tf.nn.embedding_lookup(self.char_mat,
                                                self.quesc_input)
            ques_chars = tf.reshape(ques_chars, [-1, CL, dc])
            # Character embeddings get half the dropout rate of word embeddings.
            ctx_chars = tf.nn.dropout(ctx_chars, 1.0 - 0.5 * self.dropout)
            ques_chars = tf.nn.dropout(ques_chars, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder; both calls use "char_conv".
            char_conv_args = dict(bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv")
            ctx_chars = conv(ctx_chars, d, reuse=None, **char_conv_args)
            ques_chars = conv(ques_chars, d, reuse=True, **char_conv_args)

            # Max over the character axis leaves one vector per word.
            ctx_chars = tf.reduce_max(ctx_chars, axis=1)
            ques_chars = tf.reduce_max(ques_chars, axis=1)

            ctx_chars = tf.reshape(ctx_chars, [-1, PL, ctx_chars.shape[-1]])
            # The question reshape takes its width from the context tensor.
            ques_chars = tf.reshape(ques_chars,
                                    [-1, QL, ctx_chars.shape[-1]])

            ctx_words = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
            ctx_words = tf.nn.dropout(ctx_words, 1.0 - self.dropout)
            ques_words = tf.nn.embedding_lookup(self.word_mat,
                                                self.quesw_input)
            ques_words = tf.nn.dropout(ques_words, 1.0 - self.dropout)

            ctx_emb = tf.concat([ctx_words, ctx_chars], axis=2)
            ques_emb = tf.concat([ques_words, ques_chars], axis=2)

            if self.use_elmo:
                ctx_emb = tf.concat([ctx_emb, self.cont_elmo], axis=-1)
                ques_emb = tf.concat([ques_emb, self.ques_elmo], axis=-1)

            highway_args = dict(size=d, scope="highway", dropout=self.dropout)
            ctx_emb = highway(ctx_emb, reuse=None, **highway_args)
            ques_emb = highway(ques_emb, reuse=True, **highway_args)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            # Settings common to the context and the question encoder calls.
            encoder_args = dict(num_blocks=1,
                                num_conv_layers=4,
                                kernel_size=7,
                                num_filters=d,
                                num_heads=nh,
                                scope="Encoder_Residual_Block",
                                bias=False,
                                dropout=self.dropout)
            ctx_enc = residual_block(ctx_emb,
                                     mask=self.c_mask,
                                     seq_len=self.cont_len,
                                     **encoder_args)
            # reuse=True: the question goes through the same weights.
            ques_enc = residual_block(ques_emb,
                                      mask=self.q_mask,
                                      seq_len=self.ques_len,
                                      reuse=True,
                                      **encoder_args)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            similarity = optimized_trilinear_for_attention(
                [ctx_enc, ques_enc],
                self.c_maxlen,
                self.q_maxlen,
                input_keep_prob=1.0 - self.dropout)
            ques_mask_row = tf.expand_dims(self.q_mask, 1)
            attn_over_q = tf.nn.softmax(
                mask_logits(similarity, mask=ques_mask_row))
            ctx_mask_col = tf.expand_dims(self.c_mask, 2)
            attn_over_c = tf.nn.softmax(
                mask_logits(similarity, mask=ctx_mask_col), dim=1)
            attn_over_c_t = tf.transpose(attn_over_c, (0, 2, 1))
            c2q = tf.matmul(attn_over_q, ques_enc)
            q2c = tf.matmul(tf.matmul(attn_over_q, attn_over_c_t), ctx_enc)
            fused = [ctx_enc, c2q, ctx_enc * c2q, ctx_enc * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            projected = conv(tf.concat(fused, axis=-1),
                             d,
                             name="input_projection")
            enc = [projected]
            # Settings common to the three model-encoder passes.
            model_args = dict(num_blocks=7,
                              num_conv_layers=2,
                              kernel_size=5,
                              mask=self.c_mask,
                              num_filters=d,
                              num_heads=nh,
                              seq_len=self.cont_len,
                              scope="Model_Encoder",
                              bias=False,
                              dropout=self.dropout)
            for i in range(3):
                if not i % 2:  # dropout on every second block input
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
                # The first pass creates the variables, later passes reuse
                # them.
                share = True if i > 0 else None
                enc.append(residual_block(enc[i], reuse=share, **model_args))

        with tf.variable_scope("Output_Layer"):
            start_features = tf.concat([enc[1], enc[2]], axis=-1)
            end_features = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_features = tf.concat((start_features, self.cont_elmo),
                                           axis=-1)
                end_features = tf.concat((end_features, self.cont_elmo),
                                         axis=-1)

            start_scores = tf.squeeze(
                conv(start_features, 1, bias=False, name="start_pointer"), -1)
            end_scores = tf.squeeze(
                conv(end_features, 1, bias=False, name="end_pointer"), -1)

            # One learned scalar, repeated per example as a leading column.
            bias_var = tf.get_variable(
                "unanswer_bias", [1],
                regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
                initializer=tf.zeros_initializer())
            bias_column = tf.reshape(tf.tile(bias_var, [self.batch_size]),
                                     [-1, 1])
            masked_start = mask_logits(start_scores, mask=self.c_mask)
            self.logits1 = tf.concat((bias_column, masked_start), axis=-1)
            masked_end = mask_logits(end_scores, mask=self.c_mask)
            self.logits2 = tf.concat((bias_column, masked_end), axis=-1)

            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)
            if self.l2_norm is not None:
                # `regularizer` is a module-level name defined outside this
                # method.
                reg_terms = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                self.loss += tf.contrib.layers.apply_regularization(
                    regularizer, reg_terms)

            # output: joint[b, i, j] = P(start=i) * P(end=j); the band keeps
            # only pairs with i <= j <= i + self.ans_limit.
            start_probs = tf.expand_dims(tf.nn.softmax(self.logits1), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(self.logits2), axis=1)
            joint = tf.matmul(start_probs, end_probs)
            joint = tf.matrix_band_part(joint, 0, self.ans_limit)
            # Shift by one to undo the prepended bias column.
            self.output1 = tf.argmax(tf.reduce_max(joint, axis=2), axis=1) - 1
            self.output2 = tf.argmax(tf.reduce_max(joint, axis=1), axis=1) - 1

            if self.decay is not None:
                self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
                ema_op = self.var_ema.apply(tf.trainable_variables())
                with tf.control_dependencies([ema_op]):
                    # Evaluating self.loss now also runs the moving-average
                    # update.
                    self.loss = tf.identity(self.loss)
                    self.assign_vars = []
                    for var in tf.global_variables():
                        shadow = self.var_ema.average(var)
                        if shadow is None:
                            continue  # variable is not tracked by the EMA
                        self.assign_vars.append(tf.assign(var, shadow))
Ejemplo n.º 18
0
    def forward(self):
        """Build the candidate-scoring graph and set ``self.loss``.

        Graph stages, in construction order:

        1. ``Input_Embedding_Layer``: ELMo (biLM) representations, a char-CNN
           and word embeddings are concatenated for context (``c``), question
           (``q``) and candidates (``x``), then passed through a shared highway
           network.
        2. ``Embedding_Encoder_Layer``: one shared residual encoder block for
           context and question.
        3. ``Context_to_Query_Attention_Layer``: trilinear attention producing
           ``self.c2q`` and ``self.q2c``.
        4. ``Output_Layer``: candidate-to-context attention, optional candidate
           condensing, and either a softmax cross-entropy loss or a max-margin
           loss (``config.max_margin``).

        Afterwards an optional L2 term (``config.l2_norm``) is added and, when
        ``config.decay`` is set, an exponential moving average of the trainable
        variables is maintained and ``self.assign_vars`` is populated.

        Side effects: sets ``self.c2q``, ``self.q2c``, ``self.x2c``,
        ``self.cand_condense``, ``self.cand_logits``, ``self.loss`` and
        (conditionally) ``self.var_ema`` / ``self.assign_vars``; appends to
        ``self.debug_ops``; prints debugging information to stdout.

        Raises:
            NotImplementedError: if both ``config.cand_condense_vector`` and
                ``config.cand_fuse_vector`` are set.
        """
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        XL = self.x_maxlen

        # DEBUG
        self.debug_ops.extend([PL, QL, XL])

        CL = config.char_limit  # 16
        d = config.hidden       # 96
        dc = config.char_dim    # 64
        nh = config.num_heads   # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            # self.ch : (N, c_maxlen, 16)
            # self.qh : (N, q_maxlen, 16)
            # self.xh : (N, x_maxlen, 16)

            ######################################
            # get elmo embeddings
            ######################################
            # NOTE(review): the ELMo model directory is hard-coded; consider
            # moving it into the config.
            datadir = "/data/elmo_experiment_20180906/20180906_model"
            vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt')
            options_file = os.path.join(datadir, 'options.json')
            weight_file = os.path.join(datadir, 'weights.hdf5')
            print(vocab_file)
            print(options_file)
            print(weight_file)

            # Create a Batcher to map text to character ids.
            # NOTE(review): `batcher` is not used anywhere else in this method.
            batcher = Batcher(vocab_file, 50)

            # Build the biLM graph.
            bilm = BidirectionalLanguageModel(options_file, weight_file)

            # Get ops to compute the LM embeddings.
            print(self.c)
            print(self.c.shape)
            print(self.c_elmo)
            print(self.c_elmo.shape)
            print(self.q_elmo)
            print(self.q_elmo.shape)
            print(self.x_elmo)
            print(self.x_elmo.shape)

            context_embeddings_op = bilm(self.c_elmo)
            question_embeddings_op = bilm(self.q_elmo)
            candidate_embeddings_op = bilm(self.x_elmo)

            # Get ops to compute ELMo (weighted average of the internal biLM
            # layers). Two named mixes are built, 'input' and 'output'; within
            # each mix the context call creates the weights and the question
            # and candidate calls reuse them via the reuse=True scope.
            # NOTE(review): the 'input' mixes are not consumed below (only the
            # 'output' mixes are concatenated); they are kept so the set of
            # ops created by this method is unchanged.
            elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
            with tf.variable_scope('', reuse=True):
                elmo_question_input = weight_layers(
                    'input', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_input = weight_layers(
                    'input', candidate_embeddings_op, l2_coef=0.0
                )

            elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.0
            )
            with tf.variable_scope('', reuse=True):
                elmo_question_output = weight_layers(
                    'output', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_output = weight_layers(
                    'output', candidate_embeddings_op, l2_coef=0.0
                )

            # Character embeddings, flattened so every word is one "sequence".
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])  # (N*PL,16,64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])  # (N*QL,16,64)
            xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh), [N * XL, CL, dc])  # (N*XL,16,64)

            # Character embeddings use half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages.
            # The "char_conv" weights are created once and reused for q and x.
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=None)  # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=True)  # (N*q_maxlen, 16-5+1, 96)
            xh_emb = conv(xh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=True)  # (N*x_maxlen, 16-5+1, 96)

            # Max pooling over the character axis.
            ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)
            xh_emb = tf.reduce_max(xh_emb, axis=1)  # (N*x_maxlen, 96)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])  # (N, q_maxlen, 96)
            xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]])  # (N, x_maxlen, 96)

            # self.c : (N, c_maxlen)
            # self.q : (N, q_maxlen)
            # self.x : (N, x_maxlen)
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)  # (N,c_maxlen,300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)  # (N,q_maxlen,300)
            x_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.x), 1.0 - self.dropout)  # (N,x_maxlen,300)

            # Word embedding + char-CNN features.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)
            x_emb = tf.concat([x_emb, xh_emb], axis=2)  # (N, x_maxlen, 396)

            print(c_emb)
            print(c_emb.shape)

            # Prepend the ELMo 'output' mix to the word/char features.
            c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb], axis=2)  # (N, c_maxlen, 1024 + 396)
            q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb], axis=2)  # (N, q_maxlen, 1024 + 396)
            x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb], axis=2)  # (N, x_maxlen, 1024 + 396)

            print(c_emb)
            print(c_emb.shape)

            # Shared highway network; weights created for c and reused for q, x.
            c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)  # (N,c_maxlen,96)
            q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)  # (N,q_maxlen,96)
            x_emb = highway(x_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)  # (N,x_maxlen,96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            # -> positional encoding
            # -> layer_normalization
            # -> depth-wise separable convolution
            # -> self attention
            # -> feed forward network
            # In the paper: The total number of encoder blocks is 1

            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.c_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.c_len,
                scope="Encoder_Residual_Block",
                bias=False,
                dropout=self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Shapes:
            #   c:        (N, c_maxlen, d)
            #   q:        (N, q_maxlen, d)
            #   S:        (N, c_maxlen, q_maxlen)
            #   mask_q:   (N, 1, q_maxlen)
            #   mask_c:   (N, c_maxlen, 1)
            #   S_:       (N, c_maxlen, q_maxlen)
            #   S_T:      (N, q_maxlen, c_maxlen)
            #   self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
            #   self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)

            # Memory-optimized trilinear similarity (optimization from
            # jasonwbw); it replaces the earlier tiled formulation
            #   S = trilinear([C, Q, C*Q]) with C, Q tiled to
            #   (N, c_maxlen, q_maxlen, d).
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)

            # Row-wise softmax over question positions.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Column-wise softmax over context positions, then transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            # c * self.q2c is always included (it used to be gated on config.q2c).
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        # NOTE: the "Model_Encoder_Layer" stack (input projection followed by
        # three passes through a shared 7-block "Model_Encoder") is
        # intentionally not built in this variant; the output layer below works
        # directly on the attention outputs.

        with tf.variable_scope("Output_Layer"):
            # Shapes (broadcasting stretches size-1 dimensions to match):
            #   x_emb:              (N, x_maxlen, d)
            #   inputs:             (N, c_maxlen, 4*d)
            #   mask_x:             (N, x_maxlen, 1)
            #   c_proj:             (N, c_maxlen, d)
            #   S_xc/S_xc_:         (N, x_maxlen, c_maxlen)
            #   x2c:                (N, x_maxlen, d)
            #   xp_exp:             (N, x_maxlen, c_maxlen, 1)
            #   c_proj_exp:         (N, 1, c_maxlen, d)
            #   cand_context:       (N, x_maxlen, c_maxlen, d)
            #   cand_context_pool:  (N, x_maxlen, d)
            #   cand_condense:      (N, x_maxlen, d*2)
            #   self.cand_condense: (N, x_maxlen, d)
            #   self.cand_logits:   (N, x_maxlen, 1)
            inputs = tf.concat(attention_outputs, axis=-1)

            # masking candidate embedding
            mask_x = tf.expand_dims(self.x_mask, 2)
            c_proj = conv(inputs, d, name="context_projection")

            S_xc = optimized_trilinear_for_attention([x_emb, c_proj], self.x_maxlen, self.c_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            # NOTE(review): the softmax normalizes over the last (context)
            # axis while the mask is the candidate mask broadcast over that
            # axis, so padded context positions do not appear to be masked
            # here -- confirm this is intended.
            S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask=mask_x))

            self.x2c = tf.matmul(S_xc_, c_proj)

            # Default candidate representation; replaced below when condensing
            # is enabled.
            self.cand_condense = self.x2c

            if self.config.cand_condense_vector:
                # Weight the projected context by self.xp for each candidate.
                xp_exp = tf.expand_dims(self.xp, axis=-1)
                c_proj_exp = tf.expand_dims(c_proj, axis=1)
                cand_context = tf.multiply(c_proj_exp, xp_exp)

                if self.config.cand_condense_conv:
                    # Fold candidates into the batch axis for the 1-D conv,
                    # then restore the candidate axis.
                    cand_context = tf.reshape(cand_context, [N*XL, PL, d])
                    cand_context = conv(cand_context, d, bias=True, activation=tf.nn.relu,
                                        kernel_size=3, name="candidate_from_context")
                    cand_context = tf.reshape(cand_context, [N, XL, -1, d])

                # Pool over the context axis (max or mean).
                if self.config.cand_condense_pool:
                    cand_context_pool = tf.reduce_max(cand_context, axis=-2)
                else:
                    cand_context_pool = tf.reduce_mean(cand_context, axis=-2)

                cand_condense = tf.concat([self.x2c, cand_context_pool], axis=-1)
                self.cand_condense = conv(cand_condense, d, name="candidate_projection")

                if self.config.cand_fuse_vector:
                    raise NotImplementedError

                # DEBUG
                self.debug_ops.extend([xp_exp, c_proj_exp, cand_context, cand_context_pool,
                                       cand_condense, self.cand_condense])

            if not config.max_margin:
                # Softmax cross-entropy over masked candidate logits.
                cand_logits = tf.squeeze(conv(self.cand_condense, 1, bias=False, name="candidate_logits_1"), -1)
                self.cand_logits = mask_logits(cand_logits, mask=self.x_mask)
                loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.cand_logits, labels=self.yx)
                # DEBUG
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx])
            else:
                # Max-margin (hinge) loss between the best-scoring positive
                # (selected by self.yx) and the best-scoring negative
                # (selected by self.yx_inv) candidate.
                cand_logits = conv(self.cand_condense, 1, bias=False, name="candidate_logits_1")
                cand_logits = tf.tanh(cand_logits)
                cand_logits = tf.squeeze(conv(cand_logits, 1, bias=False, name="candidate_logits_2"), -1)
                self.cand_logits = tf.sigmoid(cand_logits)
                pos = tf.multiply(self.cand_logits, self.yx)
                pos = tf.reduce_max(pos, axis=-1)
                negs = tf.multiply(self.cand_logits, self.yx_inv)
                neg = tf.reduce_max(negs, axis=-1)
                loss = tf.maximum(tf.add(tf.subtract(neg, pos), config.margin), 0.0)
                # DEBUG
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx,
                                       pos, negs, neg, self.yx, self.yx_inv])

            self.loss = tf.reduce_mean(loss)

        # NOTE: the span-pointer output layer (start/end logits, outer-product
        # span search producing self.yp1 / self.yp2) is not built in this
        # variant; candidates are scored instead.

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Make every evaluation of the loss also update the moving averages.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # Ops that copy each shadow (averaged) value back into its
                # variable.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # Variables without a shadow copy yield None; test for
                    # that explicitly instead of relying on the truthiness of
                    # a TensorFlow variable.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 19
0
def darknet(inputs):
    """Build the darknet backbone on top of ``inputs``.

    The network is two 3x3 convolutions followed by five residual groups
    (1, 2, 8, 8 and 4 residual blocks); a downsampling 3x3 convolution sits
    between consecutive groups.

    Returns:
        A 3-tuple ``(route_1, route_2, output)`` where ``route_1`` is the
        output of residual group 2, ``route_2`` the output of residual
        group 3 and ``output`` the output of the last residual group.
    """

    def _downsample_conv(net, name, filters):
        # 3x3 convolution with downsample=True.
        return layers.conv_layer(name=name,
                                 inputs=net,
                                 filters=filters,
                                 kernel_size=3,
                                 downsample=True)

    def _residual_group(net, group, count, num_filters):
        # `count` residual blocks chained one after another inside one group.
        for idx in range(count):
            net = layers.residual_block(
                f"darknet/residual_group_{group}/residual_{idx}",
                net,
                num_filters=num_filters)
        return net

    with tf.name_scope("darknet"):
        # Stem.
        net = layers.conv_layer(name="darknet/conv_0",
                                inputs=inputs,
                                filters=32,
                                kernel_size=3)
        net = _downsample_conv(net, "darknet/conv_1", 64)

        # Group 0.
        net = _residual_group(net, 0, 1, 32)
        net = _downsample_conv(net, "darknet/residual_group_0/conv_2", 128)

        # Group 1.
        net = _residual_group(net, 1, 2, 64)
        net = _downsample_conv(net, "darknet/residual_group_1/conv_3", 256)

        # Group 2; its output is the first route.
        net = _residual_group(net, 2, 8, 128)
        darknet_route_1 = net

        # Group 3 (its downsampling conv is named under group 3); its output
        # is the second route.
        net = _downsample_conv(net, "darknet/residual_group_3/conv_4", 512)
        net = _residual_group(net, 3, 8, 256)
        darknet_route_2 = net

        # Group 4.
        net = _downsample_conv(net, "darknet/residual_group_4/conv_5", 1024)
        net = _residual_group(net, 4, 4, 512)

        return darknet_route_1, darknet_route_2, net
Ejemplo n.º 20
0
    def forward(self, trainable):
        """Build a QANet-style encoder with a 3-way classification head.

        Graph stages, in construction order: input embedding (char-CNN + word
        embeddings + highway), a shared embedding encoder for context and
        question, context-to-query attention, a three-pass model encoder, a
        GRU over the encoded question whose final state weights the model
        encoder outputs, and a linear output layer producing ``self.scores``
        and ``self.predictions``.

        Args:
            trainable: when truthy, additionally build ``self.loss``,
                ``self.accuracy``, the optional EMA ops (``self.var_ema``,
                ``self.assign_vars``), the learning-rate schedule ``self.lr``,
                the optimizer ``self.opt``, ``self.train_op`` and
                ``self.saver``.

        Side effects: accumulates into ``self.l2_loss`` (which must already
        exist on the instance) and prints debugging information to stdout.
        """
        config = self.config
        N, PL, QL, CL, d, dc, nh = (
            config.batch_size, self.c_maxlen, self.q_maxlen,
            config.char_limit, config.hidden, config.char_dim,
            config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # [total number of words, characters per word, per-character
            #  embedding dimension]
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            # Character embeddings use half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv-highway encoder; the calls below produce the
            # feature output of the character convolution. The "char_conv"
            # weights are created for the context and reused for the question.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  # [batch, feature_len, d]
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Take the maximum feature along the character axis; k-max pooling
            # could be tried here instead of this plain max.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Reshape back so the second axis is the sentence length.
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word embeddings with the character features:
            # [batch, sequence_len, combined feature dimension]
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # The highway network acts as a filter over the information and
            # projects the representation down to size d
            # (the original note gives [batch, seq_len, 75]).
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Memory-optimized replacement for the tiled formulation
            #   S = trilinear([C, Q, C*Q]).
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over question positions.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over context positions (axis 1), then transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            # Three passes through the same 7-block encoder; the weights are
            # created on the first pass and reused afterwards.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope('question_rnn'):
            # Summarize the encoded question with a GRU; only the final state
            # is used below.
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)

            # Score every context position of each model-encoder output
            # against the question state, then take the score-weighted sum
            # over the sequence axis.
            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            print(weight_enc1, "ggggggggggggggggg")
            inputs_shape = weight_enc1.get_shape().as_list()
            # One linear 3-class head shared by the three pooled encodings.
            W = tf.get_variable(
                "W",
                shape=[inputs_shape[-1], 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
            self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
            self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
            # Final scores are the average of the three heads' outputs.
            self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
            print(self.scores)
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
            if trainable:
                with tf.name_scope("loss"):
                    print(self.scores, self.input_y, "llllllllllllllll")
                    losses = tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.scores, labels=self.input_y)
                    self.loss = tf.reduce_mean(
                        losses) + self.l2_reg_lambda * self.l2_loss
                # Accuracy
                with tf.name_scope("accuracy"):
                    correct_predictions = tf.equal(self.predictions,
                                                   tf.argmax(self.input_y, 1))
                    self.accuracy = tf.reduce_mean(tf.cast(
                        correct_predictions, "float"),
                                                   name="accuracy")
                if config.decay is not None:
                    self.var_ema = tf.train.ExponentialMovingAverage(
                        config.decay)
                    ema_op = self.var_ema.apply(tf.trainable_variables())
                    # Make every evaluation of the loss also update the
                    # moving averages.
                    with tf.control_dependencies([ema_op]):
                        self.loss = tf.identity(self.loss)

                        # Ops that copy each shadow (averaged) value back into
                        # its variable.
                        self.assign_vars = []
                        for var in tf.global_variables():
                            v = self.var_ema.average(var)
                            # Variables without a shadow copy yield None; test
                            # for that explicitly instead of relying on the
                            # truthiness of a TensorFlow variable.
                            if v is not None:
                                self.assign_vars.append(tf.assign(var, v))
                # Logarithmic warm-up of the learning rate, capped at
                # config.init_lr.
                self.lr = tf.minimum(
                    config.init_lr, 0.001 / tf.log(999.) *
                    tf.log(tf.cast(self.global_step, tf.float32) + 1))
                self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                                  beta1=0.8,
                                                  beta2=0.999,
                                                  epsilon=1e-7)
                # Clip gradients by global norm before applying them.
                grads = self.opt.compute_gradients(self.loss)
                gradients, variables = zip(*grads)
                capped_grads, _ = tf.clip_by_global_norm(
                    gradients, config.grad_clip)
                self.train_op = self.opt.apply_gradients(
                    zip(capped_grads, variables), global_step=self.global_step)
                self.saver = tf.train.Saver(tf.global_variables(),
                                            max_to_keep=3)
Ejemplo n.º 21
0
    def pred(self):
        """Build the span-prediction graph and store its outputs on ``self``.

        One batch is pulled from ``self.iterator``. Question and context are
        embedded, encoded by a residual block whose variables are shared
        between the two, fused by dot-product attention, and pushed through
        three passes of a shared "Model_Encoder" stack. The start/end pointer
        scores (after ``preprocess_softmax``) end up in ``self.pred_start`` and
        ``self.pred_end``; ``self.preds`` holds the transposed pair of their
        argmax over axis 1.

        The static shape of every intermediate tensor is printed while the
        graph is being constructed.
        """

        def show_shape(label, tensor):
            # Construction-time trace of a tensor's static shape.
            print(label, tensor.get_shape().as_list())

        with tf.variable_scope("embedding_layer"):
            batch = self.iterator.get_next()
            (self.questions, q_lens), (self.contexts,
                                       c_lens), self.answers = batch

            # Mask widths come from the fixed train_max_* attributes, not from
            # the per-batch maximum length.
            c_maxlen = self.train_max_context_length
            q_maxlen = self.train_max_question_length

            c_mask = tf.sequence_mask(c_lens, maxlen=c_maxlen)
            q_mask = tf.sequence_mask(q_lens, maxlen=q_maxlen)

            q_emb = tf.nn.embedding_lookup(self.embedding, self.questions)
            c_emb = tf.nn.embedding_lookup(self.embedding, self.contexts)
            show_shape('question_embeddings', q_emb)
            show_shape('context_embeddings', c_emb)

        with tf.variable_scope("embedding_layer"):
            # Settings common to the context and the question encoder call.
            encoder_args = dict(num_blocks=1,
                                num_conv_layers=4,
                                kernel_size=7,
                                num_filters=self.lstm_hidden_size,
                                num_heads=1,
                                scope="Encoder_Residual_Block",
                                bias=False)

            # The context call goes first: it creates the variables.
            c_enc = residual_block(c_emb,
                                   mask=c_mask,
                                   seq_len=c_maxlen,
                                   dropout=1.0 - self.keep_prob,
                                   **encoder_args)
            show_shape('context_output', c_enc)

            # The question call reuses the variables created just above.
            q_enc = residual_block(q_emb,
                                   mask=q_mask,
                                   seq_len=q_maxlen,
                                   reuse=True,
                                   dropout=1.0 - self.keep_prob,
                                   **encoder_args)
            show_shape('question_output', q_enc)

        with tf.variable_scope("attention_layer"):
            # Dot-product similarity between every context/question position.
            q_enc_t = tf.transpose(q_enc, [0, 2, 1])
            sim = tf.matmul(c_enc, q_enc_t)
            show_shape('similarity_matrix', sim)

            # A pair is valid only when both of its positions are unmasked.
            pair_mask = tf.expand_dims(c_mask, 2) & tf.expand_dims(q_mask, 1)
            sim = preprocess_softmax(sim, pair_mask)
            show_shape('similarity_matrix', sim)

            # Context-to-query: normalise over axis 2, then mix question
            # encodings with those weights.
            c2q_weights = tf.nn.softmax(sim, axis=2)
            show_shape('context_to_query_attention_weights', c2q_weights)

            c2q = tf.matmul(c2q_weights, q_enc)
            show_shape('context_to_query', c2q)

            # Query-to-context: best similarity per context position,
            # normalised over axis 1, used to pool the context encodings.
            best_sim = tf.reduce_max(sim, axis=2)
            show_shape('max_col_similarity', best_sim)

            q2c_weights = tf.nn.softmax(best_sim, axis=1)
            show_shape('b', q2c_weights)

            q2c_weights = tf.expand_dims(q2c_weights, 1)
            show_shape('b', q2c_weights)

            q2c = tf.matmul(q2c_weights, c_enc)
            show_shape('query_to_context', q2c)

            c_times_c2q = c_enc * c2q
            show_shape('context_output_with_context_to_query', c_times_c2q)

            c_times_q2c = c_enc * q2c
            show_shape('context_output_with_query_to_context', c_times_q2c)

            attention = tf.concat([c_enc, c2q, c_times_c2q, c_times_q2c],
                                  axis=2)
            show_shape('attention', attention)

        with tf.variable_scope("modeling_layer"):
            projected = conv(attention,
                             self.lstm_hidden_size,
                             name="input_projection")
            self.enc = [projected]
            for step in range(3):
                if step % 2 == 0:  # dropout on every second input
                    self.enc[step] = tf.nn.dropout(self.enc[step],
                                                   self.keep_prob)
                # Pass 0 creates the "Model_Encoder" variables, later passes
                # reuse them.
                stacked = residual_block(self.enc[step],
                                         num_blocks=7,
                                         num_conv_layers=2,
                                         kernel_size=5,
                                         mask=c_mask,
                                         num_filters=self.lstm_hidden_size,
                                         num_heads=1,
                                         seq_len=c_maxlen,
                                         scope="Model_Encoder",
                                         bias=False,
                                         reuse=None if step == 0 else True,
                                         dropout=1.0 - self.keep_prob)
                self.enc.append(stacked)
                show_shape('self.enc[i]', self.enc[step])

        with tf.variable_scope("output_layer_start"):
            start_features = tf.concat([self.enc[1], self.enc[2]], axis=-1)
            start_scores = conv(start_features,
                                1,
                                bias=False,
                                name="start_pointer")
            start_scores = tf.squeeze(start_scores, -1)
            show_shape('pred_start', start_scores)
            self.pred_start = preprocess_softmax(start_scores, c_mask)
            show_shape('self.pred_start', self.pred_start)

        with tf.variable_scope("output_layer_end"):
            end_features = tf.concat([self.enc[1], self.enc[3]], axis=-1)
            end_scores = conv(end_features,
                              1,
                              bias=False,
                              name="end_pointer")
            end_scores = tf.squeeze(end_scores, -1)
            show_shape('pred_end', end_scores)
            self.pred_end = preprocess_softmax(end_scores, c_mask)
            show_shape('self.pred_end', self.pred_end)

            # One (start, end) index pair per example after the transpose.
            best_start = tf.argmax(self.pred_start, axis=1)
            best_end = tf.argmax(self.pred_end, axis=1)
            self.preds = tf.transpose([best_start, best_end])
Ejemplo n.º 22
0
    def forward(self):
        """Build the multiple-choice scoring graph over three alternatives.

        Stages, in construction order:

        1. Character embeddings for context, question and the three
           alternatives go through dropout, a shared "char_conv" convolution
           and a max over axis 1, then are concatenated with word embeddings
           and passed through a shared "highway" layer.
        2. All five streams are encoded by one shared
           "Encoder_Residual_Block".
        3. Context/question attention produces ``self.c2q`` and ``self.q2c``.
        4. Three passes of a shared "Model_Encoder" stack fill ``self.enc``.
        5. A GRU reads the question; its final state seeds a GRU pass over
           each alternative and also weights the encoder outputs.
        6. ``self.cos_sine`` compares each weighted encoder summary with the
           matching alternative state; a softmax over the three results is
           stored in ``self.logits1``.

        Nothing is returned. Besides the attributes above, the method also
        sets ``self.alternati_emb1..3``, ``self.alter_embedding1`` and
        ``self.gru``.

        Order matters: within each shared scope the first call creates the
        variables and the later calls pass ``reuse=True``.
        """
        config = self.config
        # Batch size, sequence limits, hidden/char dims, head count, and the
        # max length of each alternative (the "aletr" spelling is the
        # attribute name used here).
        N, PL, QL, CL, d, dc, nh, AL1,AL2,AL3= config.batch_size,self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads,self.aletr1_maxlen, \
                                               self.aletr2_maxlen,self.aletr3_maxlen

        with tf.variable_scope("Input_Embedding_Layer"):
            # Character lookups, flattened so that every word is one row:
            # [total words, chars per word, char embedding dim].
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])  # [words in the batch, chars per word, char dim]
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # The un-dropped alternative char embeddings are kept on self.
            # The example shapes in the trailing comments are the original
            # author's notes and are unverified.
            self.alternati_emb1 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter1h),
                [N * AL1, CL, dc])  # (875, 25, 20)
            self.alternati_emb2 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter2h),
                [N * AL2, CL, dc])  # (768, 16, 300)
            self.alternati_emb3 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter3h),
                [N * AL3, CL, dc])  # (768, 16, 300)

            # Character embeddings use half the dropout rate that word
            # embeddings get further down.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            alternati_emb1 = tf.nn.dropout(self.alternati_emb1,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb2 = tf.nn.dropout(self.alternati_emb2,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb3 = tf.nn.dropout(self.alternati_emb3,
                                           1.0 - 0.5 * self.dropout)

            # BiDAF-style conv-highway encoder; the calls below produce the
            # convolutional features. The first call creates "char_conv", the
            # rest reuse it.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  #[batch,feature_len,d]
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
            alternati_emb1 = conv(alternati_emb1,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb2 = conv(alternati_emb2,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb3 = conv(alternati_emb3,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)

            ch_emb = tf.reduce_max(
                ch_emb, axis=1)  # max feature along axis 1; k-max pooling could be tried here instead of max pooling
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
            alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
            alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # back to one row per sequence position
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
            # NOTE(review): the alternatives take their last dim from qh_emb,
            # not from themselves; all streams come out of the same "char_conv"
            # so the widths presumably match -- confirm.
            alternati_emb1 = tf.reshape(alternati_emb1,
                                        [N, AL1, qh_emb.shape[-1]])
            alternati_emb2 = tf.reshape(alternati_emb2,
                                        [N, AL2, qh_emb.shape[-1]])
            alternati_emb3 = tf.reshape(alternati_emb3,
                                        [N, AL3, qh_emb.shape[-1]])

            # Word embeddings. Context and question get dropout.
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)
            # NOTE(review): the alternatives' word embeddings get no dropout,
            # unlike c_emb / q_emb above -- confirm this is intended.
            alter_embedding1 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter1)  # original note: "context"
            alter_embedding2 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter2)  # original note: "context"
            alter_embedding3 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter3)  # original note: "context"

            c_emb = tf.concat(
                [c_emb, ch_emb],
                axis=2)  # join word and char features: [batch, sequence_len, combined dim]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)
            alter_embedding1 = tf.concat([alter_embedding1, alternati_emb1],
                                         axis=2)
            alter_embedding2 = tf.concat([alter_embedding2, alternati_emb2],
                                         axis=2)
            alter_embedding3 = tf.concat([alter_embedding3, alternati_emb3],
                                         axis=2)

            # Shared highway layer: the context call creates it, the others
            # reuse it.
            c_emb = highway(
                c_emb,
                size=d,
                scope="highway",
                dropout=self.dropout,
                reuse=None)  # original note: filters the information and reduces the dim to 75, [batch, seq_len, 75] (75 is presumably d)
            # NOTE(review): this stores the *context* highway output under an
            # "alter" attribute name -- confirm it is intended and not a
            # leftover from debugging.
            self.alter_embedding1 = c_emb

            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            alter_embedding1 = highway(alter_embedding1,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding2 = highway(alter_embedding2,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding3 = highway(alter_embedding3,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            # One encoder block shared by all five streams; the context call
            # creates the variables.
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            # NOTE(review): seq_len below reads self.alterh1_len, while the
            # other two alternatives read alter2_len / alter3_len -- confirm
            # the attribute name is right.
            alter1 = residual_block(
                alter_embedding1,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter1_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alterh1_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights with the passage encoder
                bias=False,
                dropout=self.dropout)
            alter2 = residual_block(
                alter_embedding2,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter2_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter2_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights with the passage encoder
                bias=False,
                dropout=self.dropout)
            alter3 = residual_block(
                alter_embedding3,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter3_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter3_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights with the passage encoder
                bias=False,
                dropout=self.dropout)
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            # Similarity between context and question positions; presumably
            # [batch, c_maxlen, q_maxlen] -- TODO confirm against the helper.
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis, with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1, with the context mask applied, then
            # transposed.
            # NOTE(review): `dim` is the older spelling of `axis` in
            # tf.nn.softmax -- check the targeted TF version before changing.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            # Three passes through one shared stack: pass 0 creates the
            # variables, passes 1 and 2 reuse them. self.enc ends with four
            # entries (projection + three pass outputs).
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))
        with tf.variable_scope('question_rnn'):
            # One GRU cell object is used for the question and for all three
            # alternatives.
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)
            # self.qandc=tf.concat([self.q2c,self.c2q],axis=2)
            # self.qandc=dense(self.qandc,d)
            # output,state=tf.nn.dynamic_rnn(self.gru,self.qandc,initial_state=initstate)#(32,?,75)
            # Each alternative is read starting from the question's final
            # state; only the final states (state1..3) are used later.
            output1, state1 = tf.nn.dynamic_rnn(self.gru,
                                                alter1,
                                                initial_state=state)
            output2, state2 = tf.nn.dynamic_rnn(self.gru,
                                                alter2,
                                                initial_state=state)
            output3, state3 = tf.nn.dynamic_rnn(self.gru,
                                                alter3,
                                                initial_state=state)

            # Question state as a column so it can be matrix-multiplied with
            # the encoder outputs, giving one weight per context position
            # (presumably [batch, c_maxlen, 1] -- TODO confirm).
            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            # Weighted sum of encoder outputs over axis 1, one summary per
            # model-encoder pass.
            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            # start_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1, bias=False, name="start_pointer"), -1)
            # end_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1, bias=False, name="end_pointer"), -1)
            # self.logits = [mask_logits(start_logits, mask=self.c_mask),
            #                mask_logits(end_logits, mask=self.c_mask)]
            #
            # logits1, logits2 = [l for l in self.logits]
            #
            # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
            #                   tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            # Score alternative k by self.cos_sine (defined elsewhere;
            # presumably a cosine similarity) between encoder summary k and
            # the GRU state of alternative k.
            similary1 = tf.expand_dims(self.cos_sine(weight_enc1, state1),
                                       axis=1)
            similary2 = tf.expand_dims(self.cos_sine(weight_enc2, state2),
                                       axis=1)
            similary3 = tf.expand_dims(self.cos_sine(weight_enc3, state3),
                                       axis=1)
            # Despite the name, logits1 holds the softmax output over the
            # three alternatives, not raw logits.
            self.logits1 = tf.nn.softmax(
                tf.concat([similary1, similary2, similary3], axis=1))
            # Debug trace emitted at graph-construction time.
            print(self.logits1, "lllllllllllllllllllllllllllllllllllll")
    def __init__(self, config, batch, word_mat=None,char_mat=None,  filter_sizes=None, embedding_size=None,num_filters=None,trainable=True, l2_reg_lambda=0.0, keep_prob=0.9, graph=None):

        # Placeholders for input, output and dropout
        self.config = config
        self.graph = graph if graph is not None else tf.Graph()
        self.trainable = trainable
        gru = cudnn_gru if config.use_cudnn else native_gru
        self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=True)
        if trainable == True:
            self.input_x, self.input_x1, self.ch, self.qh, self.input_y, self.qa_id,self.alternatives_tokens = batch.get_next()  # self.y1 is (64, 3)self.alterh batch_size is[batch,3,alternative_len,chara_len]
        else:
            self.input_x, self.input_x1, self.ch, self.qh,self.alternatives_tokens= batch.get_next()  # self.y1 is (64, 3)self.alterh batch_size is[batch,3,alternative_len,chara_len]
        self.dropout_keep_prob =keep_prob
        self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                           initializer=tf.constant_initializer(0), trainable=False)
        self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
        self.c_mask = tf.cast(self.input_x, tf.bool)  # 这里是判断出每一个数据集的context对应实际句子长度的位置(64,400)
        self.q_mask = tf.cast(self.input_x1, tf.bool)  # 同上(64,50)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)  # 每一个训练数据集实际长度
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)  # 每一个问题的实际长度
        self.ch_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
        self.qh_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])
        # Embedding layer
        N, PL, QL, CL, d, dc,dg,nh= config.batch_size,config.para_limit,config.ques_limit,config.char_limit,\
                                    config.hidden, config.char_dim,config.char_hidden,config.num_heads
        with tf.variable_scope("Input_Embedding_Layer"):
            self.char_mat = tf.get_variable("char_mat", initializer=tf.constant(char_mat, dtype=tf.float32),trainable=True)
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            cell_fw = tf.contrib.rnn.GRUCell(dg)  # 按照字符有多少个gru神经单元
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len,
                dtype=tf.float32)  # self.ch_len表示训练数据集所有字符平摊之后,实际字符的长度,sequence_length=[bacth_size] is N * PL, because
            # char_hidden is 100 so state_fw and state_bw is [N * PL,100]
            ch_emb = tf.concat([state_fw, state_bw], axis=1)  # [N * PL,200]
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, qh_emb, self.qh_len,dtype=tf.float32)  # state_* [N*QL]
            qh_emb = tf.concat([state_fw, state_bw], axis=1)  # question_emd is [,200]

            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])  # [batch_size,que_len,200]
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])  # 以上过程对应了论文里边的 the character-level embedding are generate by ...in the token
                                                   #这样就把每一个单词的字符转化为单词的字符级别embedding信息,tf.reshape(ch_emb, [N, PL, 2 * dg])
                                                    # 从这里可以看出作者最后那字符的state状态作为字符信息与原始单词embedding进行连接,那么是否可以用拼音
                                                    # 作为汉语的字符级别信息呢,可以尝试
            print(qh_emb,"llllllllllllll")
        with tf.name_scope("embedding"):

            self.W = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32),
                                            trainable=True)
            self.c_mask = tf.cast(self.input_x, tf.bool)  # self.c为填充之后的长度是一致的,用0进行填充
            self.q_mask = tf.cast(self.input_x1, tf.bool)
            if trainable:
                self.c_maxlen, self.q_maxlen, = config.para_limit, config.ques_limit,
            else:
                self.c_maxlen, self.q_maxlen = config.test_para_limit, config.test_ques_limit
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
            c_emb = tf.concat([self.embedded_chars, ch_emb], axis=2)
            q_emb= tf.concat([self.embedded_chars1, qh_emb], axis=2)
            # self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
            # self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)
        with tf.variable_scope("cnn_predict"):
            pooled_outputs = []
            c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)  # 相当于对信息进行一次筛选,并且让表示的维度降低到75,[batch,sql_len,75]
            q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.q_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,  # Share the weights between passage and question
                               bias=False,
                               dropout=self.dropout)
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)  # 这个函数实现的是公式(4)中的所有
            rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)  # this is 公式(3) #[batch,c_maxlen,150]
            print(att,"111111111111111111111111")
            c_emb_expanded_shape=att.get_shape().as_list()
            c_emb_expanded=tf.expand_dims(att, -1)
            for i, filter_size in enumerate(filter_sizes):
                with tf.name_scope("conv-maxpool-%s" % filter_size):
                    # Convolution Layer
                    filter_shape = [filter_size,c_emb_expanded_shape[-1], 1, num_filters]
                    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                    l2_loss += tf.nn.l2_loss(W)
                    b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                    l2_loss += tf.nn.l2_loss(b)
                    conv_ouput = tf.nn.conv2d(
                        c_emb_expanded,
                        W,
                        strides=[1, 1, 1, 1],
                        padding="VALID",
                        name="conv")
                    # Apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
                    # Maxpooling over the outputs
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1,  c_emb_expanded_shape[1]- filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool")
                    print(pooled,"222222222222222222222")
                    pooled_outputs.append(pooled)

            # Combine all the pooled features
            num_filters_total = num_filters * len(filter_sizes)
            self.h_pool = tf.concat(pooled_outputs, 3)
            self.h_pool_flat_cnn = tf.reshape(self.h_pool, [-1, num_filters_total])
        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)  #input_size对应embedding的长度,此过程是初始化一个gru,双向lstm,包括他们的初始状态
            c = rnn(c_emb, seq_len=self.c_len) #上下文编码输出为batch ,c_maxlen,以及lstm输出长度 [batch_size,sequncen_length,150*3] num_layers is 3 so concat each layers
                                                    #each layer is 150 because each layers has back_forword and feed_forword(75+75)
            q = rnn(q_emb, seq_len=self.q_len) #问题编码
        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)  # 这个函数实现的是公式(4)中的所有公式
            rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)  # this is 公式(3) #[batch,c_maxlen,150]
        # Create a convolution + maxpool layer for each filter size
        input_shape=att.get_shape().as_list()
        print(att,"rrrr")
        att=tf.expand_dims(att,-1)
        print(att,"hhhhhhhhhhhh")
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, input_shape[-1], 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                l2_loss += tf.nn.l2_loss(W)
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                l2_loss += tf.nn.l2_loss(b)
                conv_ouput = tf.nn.conv2d(
                    att,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, config.para_limit - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                print(pooled,"3333333333333333333333333")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop_lstm = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
            self.h_drop_cnn=tf.nn.dropout(self.h_pool_flat_cnn, self.dropout_keep_prob)
            self.h_drop=tf.concat([self.h_drop_lstm,self.h_drop_cnn],axis=-1)
        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total*2, 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        if trainable:
            with tf.name_scope("loss"):
                print(self.scores,self.input_y, "llllllllllllllll")
                losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
                self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
            # if config.decay is not None:
            #     self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            #     ema_op = self.var_ema.apply(tf.trainable_variables())
            #     with tf.control_dependencies([ema_op]):
            #         self.loss = tf.identity(self.loss)
            #
            #         self.assign_vars = []
            #         for var in tf.global_variables():
            #             v = self.var_ema.average(var)
            #             if v:
            #                 self.assign_vars.append(tf.assign(var, v))
            self.lr = tf.minimum(config.init_lr,
                                 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(
                gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
Ejemplo n.º 24
0
    def forward(self):
        """Build the forward graph and the training loss.

        Layers are created, in order, under the variable scopes
        ``Input_Embedding_Layer``, ``Embedding_Encoder_Layer``,
        ``Context_to_Query_Attention_Layer``, ``Model_Encoder_Layer`` and
        ``Output_Layer``.

        Reads ``self.config``, ``self.demo``, the input tensors ``self.c``,
        ``self.q``, ``self.ch``, ``self.qh``, the masks/lengths
        ``self.c_mask``, ``self.q_mask``, ``self.c_len``, ``self.q_len``,
        the embedding matrices ``self.word_mat`` / ``self.char_mat``, the
        labels ``self.y1``, ``self.y2``, ``self.no_answer`` and the
        parameters ``self.weights1``, ``self.weights2``, ``self.MAX_PL``.

        Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
        ``self.yp1``, ``self.yp2`` and ``self.loss``; when ``config.decay``
        is not None it also sets ``self.var_ema`` and ``self.assign_vars``.

        ``regularizer`` is not defined in this method and is expected to be
        available from the enclosing module when ``config.l2_norm`` is set.
        """
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            # Flatten (batch, words) into one axis so the character conv runs
            # over every word independently.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            # shape = (?, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            # d(hidden_size) = 96
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            # reuse=True: the question shares the context's char_conv weights.
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
            # shape = (?, 12, 96)

            # Max over the character axis gives one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            # shape = (?, 96)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
            # shape = (32, ?, 96)

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Word embedding and char-derived embedding side by side.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # S_ normalises the similarity over the last axis with the
            # question mask; S_T normalises over axis 1 with the context mask
            # and is then transposed.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through the same "Model_Encoder" stack; passes
            # after the first reuse its variables.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # self.enc[1] = (32, ?, 96)
            conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                         1,
                         bias=False,
                         name="start_pointer")
            # tf.shape(conv1) = (32, ?, 1)
            start_logits = tf.squeeze(conv1, -1)
            # tf.shape(start_logits) = (32, ?)
            conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                         1,
                         bias=False,
                         name="end_pointer")
            end_logits = tf.squeeze(conv2, -1)

            # Apply the context mask to both pointer logits
            # (mask_logits presumably suppresses padded positions - confirm).
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # shape = (32, ?) -> cause the context length is variable
            # matmul([32, ?, 1] x [32, 1, ?])
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # outer = (32, ?, ?)
            # Keep only (start, end) pairs with 0 <= end - start <= ans_limit.
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)

            reduced1 = tf.reduce_max(outer, axis=2)
            reduced2 = tf.reduce_max(outer, axis=1)
            # tf.shape(reduced) = (32, ?)

            # ###############################################
            # Bring the context axis (axis 1) to exactly MAX_PL columns.
            # The pad width must come from axis 1, not from the batch axis;
            # it is clamped at zero so an over-long context is handled by the
            # slice below instead of producing a negative padding.
            pad_len = tf.maximum(self.MAX_PL - tf.shape(reduced1)[1], 0)
            paddings = [[0, 0], [0, pad_len]]

            reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
            reduced2 = tf.pad(reduced2, paddings, "CONSTANT")

            reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
            reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])
            # tf.shape(reduced) = (N, MAX_PL)

            # no answer flag: (no_answer, answer_exist)
            # TODO add additinal layer
            # TODO dimenstion between reduced and weight
            # NOTE(review): presumably weights1/weights2 have MAX_PL rows so
            # the matmul is well-formed - confirm where they are created.
            na_flag1 = tf.cast(
                tf.argmax(tf.matmul(reduced1, self.weights1), axis=1),
                tf.float32)
            na_flag2 = tf.cast(
                tf.argmax(tf.matmul(reduced2, self.weights2), axis=1),
                tf.float32)

            self.yp1 = tf.argmax(reduced1, axis=1)
            self.yp2 = tf.argmax(reduced2, axis=1)

            # Per-example loss: where no_answer > 0 use the flag-weighted sum
            # of the reduced scores, otherwise the pointer cross-entropy.
            losses = tf.where(
                self.no_answer > 0,
                tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                           labels=self.y1))

            losses2 = tf.where(
                self.no_answer > 0,
                tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                           labels=self.y2))

            #################################################
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Make evaluating the loss also run the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a shadow;
                    # compare against None rather than truth-testing a TF
                    # object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 25
0
    def build_model(self):
        """Build the model graph, the span outputs and the training loss.

        Layers are created, in order, under the variable scopes
        ``Input_Embedding_Layer``, ``Embedding_Encoder_Layer``,
        ``Context_to_Query_Attention_Layer``, ``Model_Encoder_Layer`` and
        ``Output_Layer``; ``Topk_Layer`` is added when ``self.use_topk`` is
        truthy.

        Sets ``self.logits1``, ``self.logits2``, ``self.loss``,
        ``self.output1`` and ``self.output2``.  With ``self.use_topk`` it also
        sets ``self.yp1``, ``self.yp2``, ``self.yp3``, ``self.y_mask``,
        ``self.y_d``, ``self.Lambda1`` and (with ``self.diversity_loss``)
        ``self.Lambda2``.  With ``self.decay`` not None it sets
        ``self.var_ema`` and ``self.assign_vars``.

        ``regularizer`` is not defined in this method and is expected to be
        available from the enclosing module when ``self.l2_norm`` is set.

        Raises:
            ValueError: if ``self.use_topk`` is truthy and ``self.topk_loss``
                is neither ``'f1'`` nor ``'em'``.
        """
        PL, QL, CL, d, dc, nh = self.c_maxlen, self.q_maxlen, self.char_limit, self.filters, self.char_dim, self.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            # Flatten (batch, words) into one axis so the character conv runs
            # over every word independently.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=None)
            # reuse=True: the question shares the context's char_conv weights.
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=True)

            # Max over the character axis gives one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [-1, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.contw_input), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.quesw_input), 1.0 - self.dropout)

            # Word embedding and char-derived embedding side by side.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            if self.use_elmo:
                c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
                q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

            c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # reuse=True: the question encoder shares the context encoder's
            # variables.
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.ques_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            # S_ normalises the similarity over the last axis with the
            # question mask; S_T normalises over axis 1 with the context mask
            # and is then transposed.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            # `axis=` instead of the deprecated `dim=`, matching the other
            # softmax calls in this method.
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_inputs = tf.concat(attention_outputs, axis=-1)
            enc = [conv(attention_inputs, d, name="input_projection")]
            # Three passes through the same "Model_Encoder" stack; passes
            # after the first reuse its variables.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
                enc.append(residual_block(enc[i],
                                          num_blocks=7,
                                          num_conv_layers=2,
                                          kernel_size=5,
                                          mask=self.c_mask,
                                          num_filters=d,
                                          num_heads=nh,
                                          seq_len=self.cont_len,
                                          scope="Model_Encoder",
                                          bias=False,
                                          reuse=True if i > 0 else None,
                                          dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # Until the conv below, start_logits / end_logits hold the
            # concatenated encoder features, not logits yet.
            start_logits = tf.concat([enc[1], enc[2]], axis=-1)
            end_logits = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
                end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)
            start_logits = tf.squeeze(conv(start_logits, 1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(conv(end_logits, 1, bias=False, name="end_pointer"), -1)
            # 2.0 Dataset
            # unanswer_bias = tf.get_variable("unanswer_bias", [1],
            #                                 regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            #                                 initializer=tf.zeros_initializer())
            # unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
            # self.logits1 = tf.concat((unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
            # self.logits2 = tf.concat((unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)
            self.logits1 = mask_logits(start_logits, mask=self.c_mask)
            self.logits2 = mask_logits(end_logits, mask=self.c_mask)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)

            # output
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            # Keep only (start, end) pairs with 0 <= end - start <= ans_limit.
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        if self.use_topk:
            with tf.variable_scope("Topk_Layer"):
                top_size = 3
                # Flatten the (start, end) score matrix so top_k ranks spans;
                # a flat index is split back into start (//) and end (%).
                outer = tf.reshape(outer, [self.batch_size, -1])
                outer_inds = tf.nn.top_k(outer, top_size).indices  # [N,top_size]
                self.yp1 = outer_inds // tf.shape(self.logits1)[-1]
                self.yp2 = outer_inds % tf.shape(self.logits2)[-1]

                def sen_mask(tensor):
                    """Map each (start, end_exclusive) row of `tensor` to a 0/1 mask."""
                    def sen_mask_(a, b, filters):
                        # Rows [a, b) are ones, all other rows zeros; the mask
                        # has tf.shape(self.logits1)[-1] rows and `filters`
                        # columns.  tf.zeros accepts a == 0, so no fallback
                        # is needed for an empty leading segment.
                        mata = tf.zeros([a, filters], tf.int32)
                        matb = tf.ones([b - a, filters], tf.int32)
                        matc = tf.zeros([tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                        mat = tf.concat((mata, matb, matc), axis=0)
                        return mat

                    return tf.map_fn(lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

                # yp3 is the exclusive end index (end + 1) used by the mask.
                self.yp3 = self.yp2 + 1
                self.yp1 = tf.expand_dims(self.yp1, -1)
                self.yp2 = tf.expand_dims(self.yp2, -1)
                self.yp3 = tf.expand_dims(self.yp3, -1)
                self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
                self.y_mask = tf.map_fn(sen_mask, self.y_mask)

                # answer
                c = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
                c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c)
                W1 = tf.get_variable("W1", initializer=tf.ones([1, 1, 1, self.filters]))
                W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
                alpha1 = tf.nn.softmax(tf.matmul(W1, c_topk, transpose_b=True), axis=2)
                answer = tf.matmul(alpha1, c_topk)  # [32,top_size,1,128]

                # question
                W2 = tf.get_variable("W2", initializer=tf.ones([1, 1, self.filters]))
                W2 = tf.tile(W2, [self.batch_size, 1, 1])
                alpha2 = tf.nn.softmax(tf.matmul(W2, q, transpose_b=True), axis=1)
                ques = tf.matmul(alpha2, q)
                ques = tf.tile(tf.expand_dims(ques, 1), [1, top_size, 1, 1])  # [32,top_size,1,128]

                # question & answer
                W3 = tf.get_variable("W3", initializer=tf.ones([1, 1, self.filters, self.filters]))
                W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
                # NOTE: despite the name these are sigmoid outputs in (0, 1);
                # clip_for_sigmoid below converts them back to logits.
                y_topk_logits = tf.nn.sigmoid(tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True))) # [32,top_size,1,1]
                y_topk_logits = tf.squeeze(y_topk_logits)  # [32,top_size]

                self.yp1 = tf.squeeze(self.yp1)
                self.yp2 = tf.squeeze(self.yp2)
                coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1) # [32,top_size,400] one-hot
                coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
                # [0,1,0,0,0][0,0,0,1,0]->[0,1,1,1,1]-[0,0,0,1,1]->[0,1,1,0,0]+[0,0,0,1,0]->[0,1,1,1,0]
                coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
                coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
                self.y_d = coeff1_topk_cumsum - coeff2_topk_cumsum + coeff2_topk # [32, top_size, 400]

                def clip_for_sigmoid(output):
                    """Clip probabilities away from 0/1 and return log(p / (1 - p))."""
                    _epsilon = tf.convert_to_tensor(1e-7, dtype=output.dtype.base_dtype)
                    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                    output = tf.log(output / (1 - output))
                    return output

                if self.topk_loss=='f1':
                    # f1 loss
                    y_start_ind = tf.cumsum(self.y_start, axis=-1)
                    y_end_ind = tf.cumsum(self.y_end, axis=-1)
                    y_gtd = y_start_ind - y_end_ind + self.y_end # [32, 400]
                    def cal_num_same(y_pred, y_truth): # [top_size, 400] [400,]
                        def cal_num_same_(y_pred_, y_truth): # [400,] [400,]
                            return tf.reduce_sum(tf.cast(tf.logical_and(tf.cast(y_pred_, tf.bool), tf.cast(y_truth, tf.bool)), tf.float32),axis=-1)
                        # The same result is returned twice and the caller
                        # keeps element [0]; presumably this mirrors the
                        # two-element `elems` structure tf.map_fn expects when
                        # no dtype is given - confirm before simplifying.
                        return [tf.map_fn(lambda x:cal_num_same_(x,y_truth),y_pred),tf.map_fn(lambda x:cal_num_same_(x,y_truth),y_pred)]
                    num_same = tf.map_fn(lambda x:cal_num_same(x[0], x[1]), [self.y_d, y_gtd])[0] # [32, top_size]
                    y_precision = num_same / (tf.cast(tf.reduce_sum(self.y_d, axis=-1),tf.float32) + 1e-8) # [32, top_size]
                    y_recall = num_same / tf.expand_dims(tf.cast(tf.reduce_sum(y_gtd, axis=-1),tf.float32) + 1e-8, axis=-1) # [32, top_size]
                    y_f1 = (2.0 * y_precision * y_recall) / (tf.cast(y_precision + y_recall,tf.float32) + 1e-8) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_f1))

                elif self.topk_loss=='em':
                    # em loss
                    start_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                                        tf.cast(self.yp1, tf.int32))  # [32, top_size]
                    end_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                                      tf.cast(self.yp2, tf.int32))  # [32, top_size]
                    y_em = tf.cast(tf.logical_and(start_em, end_em), tf.float32) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_em))

                else:
                    # Fail with a clear message instead of an unbound
                    # `topk_loss` name a few lines further down.
                    raise ValueError("topk_loss must be 'f1' or 'em', got %r" % (self.topk_loss,))

                # final loss
                self.Lambda1 = tf.get_variable("Lambda1", initializer=tf.constant([0.9]), trainable=False)
                self.loss = tf.reduce_mean(self.Lambda1 * (start_loss + end_loss) + (1 - self.Lambda1) * topk_loss)

                # output
                # `outer` was flattened above, so the banded score matrix is
                # rebuilt here before taking the argmax again.
                outer_topk = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                                  tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
                outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
                self.output1 = tf.argmax(tf.reduce_max(outer_topk, axis=2), axis=1)
                self.output2 = tf.argmax(tf.reduce_max(outer_topk, axis=1), axis=1)

                # diversity loss
                if self.diversity_loss:
                    self.Lambda2 = tf.get_variable("Lambda2", initializer=tf.constant([0.1]),trainable=False)
                    diversity_loss = tf.reduce_mean(tf.reduce_prod(self.y_d, axis=1),axis=-1) # [32,top_size,400]->[32,400]->[32,]
                    self.loss = self.loss + tf.reduce_mean(self.Lambda2 * diversity_loss)


        if self.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if self.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Make evaluating the loss also run the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a shadow.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 26
0
    def forward(self):
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit  # 16
        d = config.hidden  # 96
        dc = config.char_dim  # 64
        nh = config.num_heads  # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            '''
                self.ch : (N, c_maxlen, 16)
                self.qh : (N, q_maxlen, 16)
            '''
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])  # (N*c_maxlen, 16, 64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])  # (N*q_maxlen, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)  # (N*q_maxlen, 16-5+1, 96)

            ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb,
                                [N, QL, ch_emb.shape[-1]])  # (N, q_maxlen, 96)
            '''
                self.c : (N, c_maxlen)
                self.q : (N, q_maxlen)
            '''
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.c),
                                  1.0 - self.dropout)  # (N, c_maxlen, 300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.q),
                                  1.0 - self.dropout)  # (N, q_maxlen, 300)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)  # (N, c_maxlen, 96)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)  # (N, q_maxlen, 96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            '''
                -> positional encoding 
                -> layer_normalization 
                -> depth-wise separable convolution 
                -> self attention 
                -> feed forward network
                In the paper: The total number of encoder blocks is 1
            '''
            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            '''
                tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times. 
                    The output tensor's i'th dimension has input.dims(i) * multiples[i] elements, 
                    and the values of input are replicated multiples[i] times along the 'i'th dimension.
                Paper: The layer parameters are the same as the Embedding Encoder Layer 
                       except that convolution layer number is 2 within a block 
                       and the total number of blocks is 7
            '''
            '''
                c:        (N, c_maxlen, d)
                q:        (N, q_maxlen, d)
                ch_emb:   (N, c_maxlen, d)
                qh_emb:   (N, q_maxlen, d)
                C:        (N, c_maxlen, q_maxlen, d)
                Q:        (N, c_maxlen, q_maxlen, d)
                S:        (N, c_maxlen, q_maxlen)
                mask_q:   (N, 1, q_maxlen)
                mask_c:   (N, c_maxlen, 1)
                S_:       (N, c_maxlen, q_maxlen)
                S_T:      (N, q_maxlen, c_maxlen)
                self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
                self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
            '''
            C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
            Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)

            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            attention_outputs = [c, self.c2q, c * self.c2q]
            if config.q2c:
                attention_outputs.append(c * self.q2c)

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d,
                             name="input_projection")]  # d=hidden=96
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            '''
                tf.matrix_band_part: Copy a tensor setting everything outside a central band 
                                     in each innermost matrix to zero.
                self.enc[i]:  (N, c_maxlen, d)
                start_logits: (N, c_maxlen)
                end_logits:   (N, c_maxlen)
                logits1:      (N, c_maxlen)
                logits2:      (N, c_maxlen)
                outer:        (N, c_maxlen, c_maxlen)
                yp1, yp2, losses, losses2: (N,)  
            '''
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

            # find max-score span
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            #DEBUG
            self.debug_ops.extend([
                self.enc[1], start_logits, end_logits, logits1, logits2, outer,
                self.yp1, self.yp2, losses, losses2, self.loss
            ])

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Ejemplo n.º 27
0
    def forward(self):
        """Build the model graph from the input tensors stored on ``self``.

        Layers are created in this order, each inside its own variable scope:
        input embedding, embedding encoder, context-to-query attention,
        model encoder and output layer.

        Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
        ``self.yp1``, ``self.yp2`` and ``self.loss``. When ``config.decay``
        is not None it also sets ``self.var_ema`` and ``self.assign_vars``.

        Raises:
            ValueError: if ``config.type`` is neither ``"all"`` nor ``"char"``.
        """
        config = self.config
        # N: batch size (forced to 1 in demo mode), PL/QL: max passage and
        # question lengths, CL: characters per word, d: hidden size,
        # dc: character embedding size, nh: number of attention heads.
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL = config.char_limit
        d, dc, nh = config.hidden, config.tw_char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            if config.type == "all":
                # Character embeddings, flattened so each word is one row.
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                # Characters get half the dropout rate used for words.
                ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
                qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

                # Bidaf style conv-highway encoder; passage and question
                # share the "char_conv" weights through reuse=True.
                ch_emb = conv(ch_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=None)
                qh_emb = conv(qh_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=True)

                # Max-pool over the character axis: one vector per word.
                ch_emb = tf.reduce_max(ch_emb, axis=1)
                qh_emb = tf.reduce_max(qh_emb, axis=1)

                # Restore the (batch, words, channels) layout. Each tensor
                # uses its own channel count instead of borrowing ch_emb's.
                ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
                qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.q),
                    1.0 - self.dropout)

                # Word and character features side by side on the last axis.
                c_emb = tf.concat([c_emb, ch_emb], axis=2)
                q_emb = tf.concat([q_emb, qh_emb], axis=2)
            elif config.type == 'char':
                # Character-only mode: self.c / self.q index char_mat directly.
                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.q),
                    1.0 - self.dropout)
            else:
                # Fail here with a clear message rather than with a NameError
                # on c_emb further down.
                raise ValueError(
                    "Unsupported config.type: {!r} (expected 'all' or 'char')"
                    .format(config.type))

            # Both modes finish with the same weight-shared highway network.
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Similarity scores between passage and question positions;
            # presumably (N, c_maxlen, q_maxlen) -- TODO confirm against the
            # helper, which is defined outside this method.
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis, with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the passage mask, then transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through one weight-shared stack: the first call
            # creates the variables, later calls reuse them.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # Start scores use encoder passes 1 and 2; end scores use 1 and 3.
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            # guess : mask the padding part pad in the end of the passage
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # outer[b, i, j] = P(start = i) * P(end = j). The band keeps
            # only entries with 0 <= j - i <= config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # Row index of the best entry is the start, column index the end.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                                labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # `regularizer` is not defined in this method; it is expected to
            # be available from the enclosing module.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Ops created in this context run after the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # One assign op per tracked variable, copying its average
                # back into it. average() returns None for untracked
                # variables, so test for None explicitly instead of relying
                # on the truthiness of a variable object.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Ejemplo n.º 28
0
    def forward(self):
        """Build the model graph from the input tensors stored on ``self``.

        Layers are created in this order, each inside its own variable scope:
        input embedding, embedding encoder, context-to-query attention,
        model encoder and output layer.

        Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
        ``self.yp1``, ``self.yp2`` and ``self.loss``. When ``config.decay``
        is not None it also sets ``self.var_ema`` and ``self.assign_vars``.
        """
        config = self.config
        # N:  batch size (forced to 1 in demo mode)
        # PL: maximum passage length
        # QL: maximum question length
        # CL: maximum number of characters per word
        # d:  number of output channels (hidden size)
        # dc: character embedding dimension
        # nh: number of self-attention heads
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL = config.char_limit
        d, dc, nh = config.hidden, config.char_dim, config.num_heads

        # Embedding layer: concatenation of word and character vectors.
        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embedding.
            # 1. Look up a vector for every character of every word.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # Characters get half the dropout rate used for words.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # 2. Encode each word's character matrix with a convolution.
            # Presumably ch_emb becomes [N * PL, CL-5+1, d] and qh_emb
            # [N * QL, CL-5+1, d]; this depends on the padding used by
            # `conv` -- TODO confirm.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max-pool over the character axis: one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            # Restore the (batch, words, channels) layout. Each tensor uses
            # its own channel count instead of borrowing ch_emb's.
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            # Word embedding looked up from the pretrained word matrix.
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word and character vectors on the last axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Pass both through the same weight-shared highway network.
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        # Embedding encoder: one block of 4 convolution layers
        # (kernel size 7) with d filters.
        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        # Context-query attention.
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Similarity scores between passage and question positions;
            # presumably [batch, n_c, n_q] -- TODO confirm against the
            # helper, which is defined outside this method.
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # NOTE(review): `dim=` is the older spelling of `axis=` for
            # tf.nn.softmax; kept as-is for the TF version this code targets.
            mask_q = tf.expand_dims(self.q_mask, 1)
            # Softmax along the question axis.
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
            mask_c = tf.expand_dims(self.c_mask, 2)
            # Softmax along the passage axis, then transposed.
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            # Four feature tensors, concatenated on the channel axis below.
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        # Model encoder: a stack of 7 blocks with 2 convolution layers each
        # (d filters), applied three times with shared weights.
        with tf.variable_scope("Model_Encoder_Layer"):
            # Merge c, c2q, c * c2q and c * q2c along the channel axis.
            inputs = tf.concat(attention_outputs, axis=-1)
            # Project the concatenation down to d channels.
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(
                        self.enc[i],
                        num_blocks=7,
                        num_conv_layers=2,
                        kernel_size=5,
                        mask=self.c_mask,
                        num_filters=d,
                        num_heads=nh,
                        seq_len=self.c_len,
                        scope="Model_Encoder",
                        bias=False,
                        # First pass creates the weights; later passes
                        # reuse them.
                        reuse=True if i > 0 else None,
                        dropout=self.dropout))

        # Output layer.
        with tf.variable_scope("Output_Layer"):
            # Start scores: first and second encoder outputs concatenated on
            # the channel axis, projected to one channel and squeezed.
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            # End scores: same, using the first and third encoder outputs.
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)

            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # outer[b, i, j] = P(start = i) * P(end = j).
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only entries with 0 <= j - i <= config.ans_limit (end not
            # before start, span no longer than the limit); zero the rest.
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # Row index of the maximum: predicted start position.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # Column index of the maximum: predicted end position.
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            # NOTE(review): later TF 1.x releases deprecate this op in
            # favour of softmax_cross_entropy_with_logits_v2; kept as-is.
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # L2 regularization.
        if config.l2_norm is not None:
            # `regularizer` is not defined in this method; it is expected to
            # be available from the enclosing module.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Ops passed to control_dependencies run before the ops created
            # inside the `with` block.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # One assign op per tracked variable, copying its average
                # back into it. average() returns None for untracked
                # variables, so test for None explicitly instead of relying
                # on the truthiness of a variable object.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))