def __init__(self, label, image_size, channel_num, kernel_num, z_size, device):
    """Set up the "ae_vine" autoencoder: encoder, decoder and latent maps.

    The constructor arguments are stored on the instance unchanged.
    Submodules are created in this order: a three-stage `_conv` encoder,
    a three-stage `_deconv` decoder ending in a sigmoid, then the two
    linear layers mapping between the flattened encoder features and a
    `z_size`-wide code. `self.vine` is initialised to None and is not
    assigned again inside this constructor.
    """
    super().__init__()

    # configuration
    self.model_name = "ae_vine"
    self.label = label
    self.image_size = image_size
    self.channel_num = channel_num
    self.kernel_num = kernel_num
    self.z_size = z_size
    self.device = device
    self.vine = None

    # channel widths used by both the encoder and the decoder
    quarter_width = kernel_num // 4
    half_width = kernel_num // 2

    # encoder: channel_num -> kernel_num/4 -> kernel_num/2 -> kernel_num
    encoder_stages = [
        _conv(channel_num, quarter_width),
        _conv(quarter_width, half_width),
        _conv(half_width, kernel_num),
    ]
    self.encoder = nn.Sequential(*encoder_stages)

    # encoded feature's side length and flattened volume
    # (presumably each `_conv` stage halves the resolution, hence // 8
    # -- confirm against `_conv`)
    side = image_size // 8
    self.feature_size = side
    self.feature_volume = kernel_num * (side ** 2)

    # decoder: mirror of the encoder, squashed by a sigmoid
    decoder_stages = [
        _deconv(kernel_num, half_width),
        _deconv(half_width, quarter_width),
        _deconv(quarter_width, channel_num),
        nn.Sigmoid(),
    ]
    self.decoder = nn.Sequential(*decoder_stages)

    # linear maps: code -> feature volume, and feature volume -> code
    flat_features = self.feature_volume
    self.project = _linear(z_size, flat_features, relu=False)
    self.q_layer = _linear(flat_features, z_size, relu=False)
def __init__(self, label, image_size, channel_num, kernel_num, z_size, device):
    """Set up the "cvae" model: encoder, posterior heads, projection, decoder.

    The constructor arguments are stored on the instance unchanged.
    Submodules are created in this order: a three-stage `_conv` encoder,
    three linear heads on the flattened encoder features (`q_mean`,
    `q_logvar`, `q_atanhcor`), a linear projection from the `z_size`-wide
    code back to the feature volume, and a three-stage `_deconv` decoder
    ending in a sigmoid.
    """
    super().__init__()

    # configurations
    self.model_name = "cvae"
    self.label = label
    self.image_size = image_size
    self.channel_num = channel_num
    self.kernel_num = kernel_num
    self.z_size = z_size
    self.device = device

    # channel widths used by both the encoder and the decoder
    quarter_width = kernel_num // 4
    half_width = kernel_num // 2

    # encoder: channel_num -> kernel_num/4 -> kernel_num/2 -> kernel_num
    encoder_stages = [
        _conv(channel_num, quarter_width),
        _conv(quarter_width, half_width),
        _conv(half_width, kernel_num),
    ]
    self.encoder = nn.Sequential(*encoder_stages)

    # encoded feature's side length and flattened volume
    side = image_size // 8
    self.feature_size = side
    self.feature_volume = kernel_num * (side ** 2)
    flat_features = self.feature_volume

    # q: heads on the flattened encoder features
    self.q_mean = _linear(flat_features, z_size, relu=False)
    self.q_logvar = _linear(flat_features, z_size, relu=False)
    # z_size * (z_size - 1) / 2 is the number of unordered pairs of latent
    # dimensions (presumably one atanh-correlation per pair -- confirm
    # against the code that consumes `q_atanhcor`)
    pair_count = int(z_size * (z_size - 1) / 2)
    self.q_atanhcor = _linear(flat_features, pair_count, relu=False)

    # projection: code -> feature volume
    self.project = _linear(z_size, flat_features, relu=False)

    # decoder: mirror of the encoder, squashed by a sigmoid
    decoder_stages = [
        _deconv(kernel_num, half_width),
        _deconv(half_width, quarter_width),
        _deconv(quarter_width, channel_num),
        nn.Sigmoid(),
    ]
    self.decoder = nn.Sequential(*decoder_stages)
def forward(self) -> None:
    """Build the TensorFlow graph: embeddings, encoders, attention, decoder, pointer.

    Nothing is returned; the results are stored on ``self``:

    * ``c2q`` / ``q2c`` -- context-to-query and query-to-context attention.
    * ``enc`` -- list of four tensors: the input projection followed by the
      outputs of three passes through the weight-shared "Model_Encoder".
    * ``gen_loss`` -- value returned by ``self._compute_loss`` for the
      decoder outputs.
    * ``symbols`` -- symbols collected from ``self.loop_function``; stays an
      empty list when ``self.loop_function`` is None.
    * ``logits``, ``yp1``, ``yp2`` -- masked start/end logits and the
      argmax start/end positions.
    * ``loss`` -- ``gen_loss``, plus the L2 term when ``config.l2_norm`` is
      set. The start/end cross-entropy is computed but then overwritten.
    * ``var_ema``, ``assign_vars`` -- created only when ``config.decay`` is
      not None.

    Inputs are read from attributes set elsewhere (``config``, ``ch``,
    ``qh``, ``c``, ``q``, ``a``, ``y1``, ``y2``, the masks and lengths,
    ``char_mat``, ``word_mat``, ``dropout``, ``cell``, ``loop_function``,
    ``beam_size``). The helpers ``conv``, ``highway``, ``residual_block``,
    ``optimized_trilinear_for_attention``, ``mask_logits``,
    ``multihead_attention``, ``_linear`` and the name ``regularizer`` are
    not defined in this method and must come from the enclosing module.
    """
    config = self.config
    # Short aliases for the sizes used below. N is the test batch size
    # whenever a loop_function is supplied, the training batch size otherwise.
    N, PL, QL, CL, d, dc, nh, dw = config.test_batch_size if self.loop_function else config.batch_size, self.c_maxlen, self.q_maxlen, \
        config.char_limit, config.hidden, config.char_dim, config.num_heads, config.glove_dim

    with tf.variable_scope("Input_Embedding_Layer"):
        # Character embeddings, flattened so every word is one row of
        # CL characters with dc features each.
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
        # Character embeddings use half the dropout rate of the word embeddings.
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # Bidaf style conv-highway encoder
        # Same "char_conv" variables for context and question (reuse=True on
        # the second call).
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=True)

        # Max over axis 1 (the character axis of the reshape above) leaves
        # one vector per word.
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)

        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        # Uses ch_emb's last dimension for the question as well; both come
        # from the same "char_conv" call with d filters.
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        # Word embeddings, then concatenate word and character features.
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # Shared highway network for context and question.
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=2,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=2,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # Share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # Un-optimized equivalent kept for reference:
        # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
        # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
        # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
        # S is the context/question similarity matrix
        # (presumably shaped [N, c_maxlen, q_maxlen] -- TODO confirm in helper).
        S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                              input_keep_prob=1.0 - self.dropout)
        # Softmax over the last axis with the question mask broadcast on axis 1.
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        # Softmax over axis 1 with the context mask broadcast on axis 2,
        # then swap the last two axes.
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        # Three passes through the same "Model_Encoder" variables; pass i
        # consumes enc[i] and appends enc[i + 1].
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                # Replaces enc[i] in place, so the stored enc[0] and enc[2]
                # are the dropped-out versions.
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=2,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Decoder_Layer"):
        # Attention memory: the three model-encoder outputs concatenated on
        # the feature axis.
        memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]], axis=-1)
        # One slice per answer position along axis 1 of self.a.
        oups = tf.split(self.a, [1] * self.a_maxlen, 1)
        # Initial decoder states from the mean of the memory over axis 1.
        h = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1),
                    output_size=d,
                    bias=False,
                    scope="h_initial"))
        # NOTE: this rebinds `c`; from here on it is the initial cell state,
        # no longer the context encoding used above.
        c = tf.tanh(
            _linear(tf.reduce_mean(memory, axis=1),
                    output_size=d,
                    bias=False,
                    scope="c_initial"))
        state = (c, h)
        outputs = []
        prev = None  # previous step's output; only set when loop_function is given
        prev_probs = [0.0]
        symbols = []
        for i, inp in enumerate(oups):
            einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp), [N, dw])
            # Every step after the first reuses the variables of step 0.
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # With a loop_function the input from self.a is replaced by
            # whatever loop_function derives from the previous output
            # (skipped on the first step, where prev is still None).
            if self.loop_function is not None and prev is not None:
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_probs, index, prev_symbol = self.loop_function(
                        prev, prev_probs, self.beam_size, i)
                    # Reorder everything accumulated so far by `index`.
                    h = tf.gather(h, index)  # update prev state
                    state = tuple(tf.gather(s, index)
                                  for s in state)  # update prev state
                    for j, symbol in enumerate(symbols):
                        symbols[j] = tf.gather(
                            symbol, index)  # update prev symbols
                    for j, output in enumerate(outputs):
                        outputs[j] = tf.gather(
                            output, index)  # update prev outputs
                    symbols.append(prev_symbol)

            # Attend over the memory with the current hidden state as query.
            attn = tf.reshape(
                multihead_attention(tf.expand_dims(h, 1),
                                    units=d,
                                    num_heads=nh,
                                    memory=memory,
                                    mask=self.c_mask,
                                    bias=False), [-1, nh * d])

            cinp = tf.concat([einp, attn], 1)
            h, state = self.cell(cinp, state)

            with tf.variable_scope("AttnOutputProjection"):
                # Project to 2 * dw, view as dw pairs, keep the max of each pair.
                output = _linear([h] + [cinp],
                                 output_size=dw * 2,
                                 bias=False,
                                 scope="output")
                output = tf.reshape(output, [-1, dw, 2])
                output = tf.reduce_max(output, 2)  # maxout
                outputs.append(output)

            if self.loop_function is not None:
                prev = output

        if self.loop_function is not None:
            # process the last symbol
            # NOTE(review): relies on `i` and `prev` left over from the loop;
            # both would be unusable if `oups` were empty (a_maxlen == 0).
            einp, prev_probs, index, prev_symbol = self.loop_function(
                prev, prev_probs, self.beam_size, i + 1)
            for j, symbol in enumerate(symbols):
                symbols[j] = tf.gather(symbol, index)  # update prev symbols
            for j, output in enumerate(outputs):
                outputs[j] = tf.gather(output, index)  # update prev outputs
            symbols.append(prev_symbol)

            # output the final best result of beam search
            # Keeps row 0 of every step; outputs get a leading axis of size 1
            # back, symbols do not.
            for k, symbol in enumerate(symbols):
                symbols[k] = tf.gather(symbol, 0)
            for k, output in enumerate(outputs):
                outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

        self.gen_loss = self._compute_loss(outputs, oups, N)
        self.symbols = symbols

    with tf.variable_scope("Output_Layer"):
        # Start pointer from enc[1] and enc[2]; end pointer from enc[1] and enc[3].
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        logits1, logits2 = [l for l in self.logits]

        # Outer product of start and end probabilities; the band keeps only
        # pairs with 0 <= end - start <= config.ans_limit.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                         labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                          labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)
        # NOTE(review): the span loss assigned on the previous line is
        # discarded here; only the generation loss is kept. Confirm this is
        # intentional.
        self.loss = self.gen_loss

    if config.l2_norm is not None:
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        # `regularizer` is not defined in this method (presumably a
        # module-level object -- verify).
        l2_loss = tf.contrib.layers.apply_regularization(
            regularizer, variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        # Route the loss through an identity op that depends on ema_op, so
        # evaluating the loss also runs the moving-average update.
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            # Ops that copy each averaged value back onto its variable.
            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # NOTE(review): truthiness test on the object returned by
                # average(); `is not None` looks like the intent -- confirm
                # this behaves as expected on the TF version in use.
                if v:
                    self.assign_vars.append(tf.assign(var, v))