def vgg_fc7(input_batch, name, apply_dropout, reuse=None):
    """Stack the fc6 and fc7 layers on top of the pool5 output.

    Args:
        input_batch: input passed unchanged to ``vgg_pool5``.
        name: variable scope name; also handed to ``vgg_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each of the
            two ``fc_relu`` layers.
        reuse: forwarded both to ``vgg_pool5`` and to ``tf.variable_scope``.

    Returns:
        The output of the ``'fc7'`` layer (``output_dim=4096``), after the
        optional dropout.
    """
    features = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # Layers 6 and 7 are built identically: fc_relu, then optional drop.
        for layer_name in ('fc6', 'fc7'):
            features = fc_relu(layer_name, features, output_dim=4096)
            if apply_dropout:
                features = drop(features, 0.5)
    return features
def deeplab_fc8(input_batch, name, apply_dropout=False):
    """Build the fc6 -> fc7 -> fc8 head on top of ``deeplab_pool5``.

    Args:
        input_batch: input passed unchanged to ``deeplab_pool5``.
        name: variable scope name; also handed to ``deeplab_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows fc6 and fc7.

    Returns:
        The output of the ``'fc8'`` layer (``output_dim=1000``), built with
        ``fc`` rather than ``fc_relu``.
    """
    hidden = deeplab_pool5(input_batch, name)
    with tf.variable_scope(name):
        # Two identical 1024-unit fc_relu layers with optional dropout.
        for layer_name in ('fc6', 'fc7'):
            hidden = fc_relu(layer_name, hidden, output_dim=1024)
            if apply_dropout:
                hidden = drop(hidden, 0.5)
        # The final layer uses `fc` and is never followed by dropout.
        return fc('fc8', hidden, output_dim=1000)
def vgg_fc7(input_batch, name, apply_dropout):
    """Stack the fc6 and fc7 layers on top of the pool5 output.

    Args:
        input_batch: input passed unchanged to ``vgg_pool5``.
        name: variable scope name; also handed to ``vgg_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each of the
            two ``fc_relu`` layers.

    Returns:
        The output of the ``'fc7'`` layer (``output_dim=4096``), after the
        optional dropout.
    """
    activations = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # (layer 6, layer 7): same construction, only the layer name differs.
        for layer_name in ('fc6', 'fc7'):
            activations = fc_relu(layer_name, activations, output_dim=4096)
            if apply_dropout:
                activations = drop(activations, 0.5)
    return activations
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout,
                       mlp_dropout):
    """Score a (text, image crop, spatial feature) triple with a small MLP.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``; only element ``[0]`` of its result is used.
        imcrop_batch: forwarded to ``vgg_net.vgg_fc8`` (scope ``'vgg_local'``).
        spatial_batch: concatenated as-is (not normalized) with the other
            two features along axis 1.
        mlp_hidden_dims: ``output_dim`` of the hidden ``'mlp_l1'`` layer.
        vgg_dropout: ``apply_dropout`` flag for ``vgg_net.vgg_fc8``.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows ``'mlp_l1'``.

    Returns:
        The output of the ``'mlp_l2'`` layer (``output_dim=1``).
    """
    # Language branch, then visual branch.
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]
    vis_feat = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local',
                               apply_dropout=vgg_dropout)

    # Only the language and visual features are L2-normalized (along axis 1).
    normalized = [tf.nn.l2_normalize(lang_feat, 1),
                  tf.nn.l2_normalize(vis_feat, 1)]
    joint = tf.concat(axis=1, values=normalized + [spatial_batch])

    # Two-layer classifier over the concatenated feature.
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        return fc('mlp_l2', hidden, output_dim=1)
def vgg_roi_fc7_from_conv5(conv5, roi_batch, name, apply_dropout, reuse=None):
    """ROI-pool ``conv5`` and run the result through fc6 and fc7.

    Args:
        conv5: feature map handed to ``roi_pool``.
        roi_batch: regions of interest handed to ``roi_pool``.
        name: variable scope name.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each of the
            two ``fc_relu`` layers.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The output of the ``'fc7'`` layer (``output_dim=4096``), after the
        optional dropout.
    """
    with tf.variable_scope(name, reuse=reuse):
        # roi_pool returns a pair; only its first element is used here.
        features, _unused = roi_pool(
            conv5,
            roi_batch,
            pooled_height=7,
            pooled_width=7,
            spatial_scale=1. / 16,
            name='roi_pool5',
        )
        # Layers 6 and 7 are built identically: fc_relu, then optional drop.
        for layer_name in ('fc6', 'fc7'):
            features = fc_relu(layer_name, features, output_dim=4096)
            if apply_dropout:
                features = drop(features, 0.5)
    return features
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, deeplab_dropout,
                       mlp_dropout):
    """Score a (text, image crop, spatial feature) triple with a small MLP.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``; its result is used directly.
        imcrop_batch: forwarded to ``deeplab.deeplab_fc8_full_conv`` (scope
            ``'deeplab'``); the output is flattened to 2-D before use.
        spatial_batch: concatenated as-is (not normalized) along axis 1.
        mlp_hidden_dims: ``output_dim`` of the hidden ``'mlp_l1'`` layer.
        deeplab_dropout: ``apply_dropout`` flag for the DeepLab network.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows ``'mlp_l1'``.

    Returns:
        The output of the ``'mlp_l2'`` layer (``output_dim=1``).
    """
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    vis_map = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
                                            apply_dropout=deeplab_dropout)
    # Collapse every non-leading static dimension into one feature axis.
    flat_dim = 1
    for size in vis_map.get_shape().as_list()[1:]:
        flat_dim *= size
    vis_flat = tf.reshape(vis_map, [-1, flat_dim])

    # Only the language and visual features are L2-normalized (along axis 1).
    normalized = [tf.nn.l2_normalize(lang_feat, 1),
                  tf.nn.l2_normalize(vis_flat, 1)]
    joint = tf.concat(axis=1, values=normalized + [spatial_batch])

    # Two-layer classifier over the concatenated feature.
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        return fc('mlp_l2', hidden, output_dim=1)
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    """Two-layer head: ``fc_relu`` -> ``drop(0.5)`` -> ``fc`` with 2 outputs.

    Args:
        input_batch: input to the ``'layer1'`` ``fc_relu`` layer.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of ``'layer1'``.
        reuse: when equal to ``True``, the scope is switched to reuse its
            existing variables before the layers are built.

    Returns:
        The output of the ``'layer2'`` layer (``output_dim=2``).
    """
    with tf.variable_scope(name):
        # Explicit `== True` is kept so that only True (or 1) enables reuse,
        # exactly as before.
        if reuse == True:
            # Call form of print: identical output on Python 2, and valid
            # syntax on Python 3 (the statement form is a SyntaxError there).
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        # Dropout is applied unconditionally in this variant.
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=2)
    return outputs
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    """Two-layer head: ``fc_relu`` -> ``drop(0.5)`` -> ``fc`` with 4 outputs.

    Args:
        input_batch: input to the ``'layer1'`` ``fc_relu`` layer.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of ``'layer1'``.
        reuse: when equal to ``True``, the scope is switched to reuse its
            existing variables before the layers are built.

    Returns:
        The output of the ``'layer2'`` layer (``output_dim=4``).
    """
    with tf.variable_scope(name):
        # Explicit `== True` is kept so that only True (or 1) enables reuse,
        # exactly as before.
        if reuse == True:
            # Call form of print: identical output on Python 2, and valid
            # syntax on Python 3 (the statement form is a SyntaxError there).
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        # Dropout is applied unconditionally in this variant.
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
def __init__(self, images, q_encoding, image_valid_batch, num_choices,
             scope='single_hop', reuse=None):
    """Build a single attention hop over image features and a VQA classifier.

    The result is stored on ``self.logits``; nothing is returned.

    Args:
        images: passed to ``self.loc_init`` to obtain the local features.
        q_encoding: question encoding; projected twice with ``fc``
            (``'fc_q_map1'`` for attention, ``'fc_q_map2'`` for the output).
        image_valid_batch: reshaped to the attention-score shape and used as
            a multiplicative validity mask
            (presumably 1 for valid locations, 0 otherwise -- TODO confirm).
        num_choices: ``output_dim`` of the final ``'fc_scores'`` layer.
        scope: variable scope name.
        reuse: forwarded to ``self.loc_init`` and ``tf.variable_scope``.
    """
    x_loc = self.loc_init(images, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        dyn_shape = tf.shape(x_loc)
        B = dyn_shape[0]
        H = dyn_shape[1]
        W = dyn_shape[2]
        dim = x_loc.get_shape().as_list()[-1]  # static shape

        # --- attention over x_loc ---
        # `ax` inserts two singleton axes so the projected question
        # broadcasts against x_loc (presumably tf.newaxis -- TODO confirm).
        q_for_att = fc('fc_q_map1', q_encoding, output_dim=dim)[:, ax, ax, :]
        fused = tf.nn.l2_normalize(x_loc * q_for_att, axis=-1)
        scores = conv('conv_att_score', fused, kernel_size=1, stride=1,
                      output_dim=1)
        scores = tf.reshape(scores, to_T([B, H * W]))  # (N, H*W)

        # Masked-out positions get -1e18 so softmax assigns them ~0 weight.
        mask = tf.reshape(image_valid_batch, tf.shape(scores))
        scores = scores * mask - 1e18 * (1 - mask)
        att = tf.nn.softmax(scores, axis=-1)  # (N, H*W)

        # --- collect attended image feature ---
        att_row = tf.reshape(att, to_T([B, 1, H * W]))
        loc_flat = tf.reshape(x_loc, to_T([B, H * W, dim]))
        x_att = tf.matmul(att_row, loc_flat)  # (N, 1, D_kb)
        x_att = tf.reshape(x_att, to_T([B, dim]))  # (N, D_kb)

        # --- VQA classification ---
        eQ = fc('fc_q_map2', q_encoding, output_dim=dim)
        parts = [x_att, eQ]
        if cfg.OUT_QUESTION_MUL:
            # Optionally append the element-wise product as a third feature.
            parts.append(x_att * eQ)
        features = tf.concat(parts, axis=-1)

        hidden = fc_relu('fc_hidden', features,
                         output_dim=cfg.OUT_CLASSIFIER_DIM)
        self.logits = fc('fc_scores', hidden, output_dim=num_choices)
def vs_multilayer(input_batch, name, middle_layer_dim=1000,
                  output_layer_dim=21 * 3, dropout=True, reuse=False):
    """Two-layer head: ``fc_relu`` -> optional ``drop(0.5)`` -> ``fc``.

    Args:
        input_batch: input to the ``'layer1'`` ``fc_relu`` layer.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of ``'layer1'``.
        output_layer_dim: ``output_dim`` of ``'layer2'`` (default ``21 * 3``).
        dropout: when truthy, ``drop(..., 0.5)`` follows ``'layer1'``.
        reuse: when equal to ``True``, the scope is switched to reuse its
            existing variables before the layers are built.

    Returns:
        The output of the ``'layer2'`` layer.
    """
    with tf.variable_scope(name):
        # Explicit `== True` is kept so that only True (or 1) enables reuse,
        # exactly as before.
        if reuse == True:
            # Call form of print: identical output on Python 2, and valid
            # syntax on Python 3 (the statement form is a SyntaxError there).
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        # (4096*3)--->1000
        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            layer1 = drop(layer1, 0.5)
        # 1000---->21*3
        sim_score = fc('layer2', layer1, output_dim=output_layer_dim)
    return sim_score
def vs_multilayer(input_batch, name, middle_layer_dim=1000, class_num=20,
                  dropout=False, reuse=False):
    """Two-layer head producing ``(class_num + 1) * 3`` outputs.

    This function is inherited from CBR project
    (https://github.com/jiyanggao/CBR).

    Args:
        input_batch: input to the ``'layer1'`` ``fc_relu`` layer.
        name: variable scope name; also the prefix of the status message.
        middle_layer_dim: ``output_dim`` of ``'layer1'``.
        class_num: determines the output width, ``(class_num + 1) * 3``.
        dropout: when truthy, ``drop(..., 0.5)`` follows ``'layer1'``.
        reuse: when equal to ``True``, the scope reuses existing variables.

    Returns:
        The output of the ``'layer2'`` layer.
    """
    print('--I am using vs_multilayer--')

    with tf.variable_scope(name):
        should_reuse = reuse == True
        suffix = " reuse variables" if should_reuse else " doesn't reuse variables"
        print(name + suffix)
        if should_reuse:
            tf.get_variable_scope().reuse_variables()

        hidden = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            hidden = drop(hidden, 0.5)

        out_dim = (class_num + 1) * 3
        return fc('layer2', hidden, output_dim=out_dim)
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout,
                       mlp_dropout):
    """Score a (text, image crop, spatial feature) triple with a small MLP.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``; its result is used directly.
        imcrop_batch: forwarded to ``vgg_net.vgg_fc8`` (scope ``'vgg_local'``).
        spatial_batch: concatenated as-is (not normalized) along axis 1.
        mlp_hidden_dims: ``output_dim`` of the hidden ``'mlp_l1'`` layer.
        vgg_dropout: ``apply_dropout`` flag for ``vgg_net.vgg_fc8``.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows ``'mlp_l1'``.

    Returns:
        The output of the ``'mlp_l2'`` layer (``output_dim=1``).
    """
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)
    vis_feat = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local',
                               apply_dropout=vgg_dropout)

    # Only the language and visual features are L2-normalized (along axis 1).
    pieces = [
        tf.nn.l2_normalize(lang_feat, 1),
        tf.nn.l2_normalize(vis_feat, 1),
        spatial_batch,
    ]
    # NOTE(review): positional (axis, values) order matches the pre-1.0
    # tf.concat signature -- confirm the TensorFlow version this targets.
    joint = tf.concat(1, pieces)

    # Two-layer classifier over the concatenated feature.
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        return fc('mlp_l2', hidden, output_dim=1)
def question_prior_net(encoder_states, num_choices, qpn_dropout,
                       hidden_dim=500, scope='question_prior_net',
                       reuse=None):
    """Predict answer scores from the LSTM encoder states alone.

    Args:
        encoder_states: tuple of ``tf.contrib.rnn.LSTMStateTuple`` (both
            checked with ``assert``); the ``.h`` of every entry is used.
        num_choices: ``output_dim`` of the final ``'fc2'`` layer.
        qpn_dropout: when truthy, ``drop(..., 0.5)`` is applied to the
            concatenated states and again after ``'fc1'``.
        hidden_dim: ``output_dim`` of the ``'fc1'`` layer.
        scope: variable scope name.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The output of the ``'fc2'`` layer.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Validate the container first, then every element.
        assert (isinstance(encoder_states, tuple))
        for state in encoder_states:
            assert (isinstance(state, tf.contrib.rnn.LSTMStateTuple))

        # Concatenate the hidden states from all layers:
        # shape [N, D_lstm1 + ... + D_lstm_n]
        features = tf.concat([state.h for state in encoder_states], axis=1)
        if qpn_dropout:
            features = drop(features, 0.5)

        hidden = fc_relu('fc1', features, output_dim=hidden_dim)
        if qpn_dropout:
            hidden = drop(hidden, 0.5)

        return fc('fc2', hidden, output_dim=num_choices)
# Language feature (LSTM hidden state)
lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

# Local image feature
fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

# L2-normalize the features (except for spatial_batch)
# and concatenate them along axis 1 (feature dimension).
# The classifier input is built from the *_batch tensors (defined outside
# this section), not from lstm_top / fc8_crop directly.
# NOTE(review): positional (axis, values) order matches the pre-1.0
# tf.concat signature -- confirm the TensorFlow version this targets.
feat_all = tf.concat(1, [tf.nn.l2_normalize(lstm_top_batch, 1),
                         tf.nn.l2_normalize(fc8_crop_batch, 1),
                         spatial_batch])

# Outputs
# MLP Classifier over concatenate feature
with tf.variable_scope('classifier'):
    mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
    mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
scores = mlp_l2

# Load pretrained model
snapshot_saver = tf.train.Saver()
sess = tf.Session()
snapshot_saver.restore(sess, pretrained_model)

################################################################################
# Load annotations and bounding box proposals
################################################################################

# Each file is read inside a `with` block so its handle is closed right
# after parsing instead of being left open.
with open(query_file) as _json_file:
    query_dict = json.load(_json_file)
with open(bbox_file) as _json_file:
    bbox_dict = json.load(_json_file)
with open(imcrop_file) as _json_file:
    imcrop_dict = json.load(_json_file)
# Language feature
lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

# Local image feature
fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

# L2-normalize the features (except for spatial_batch)
# and concatenate them along axis 1 (feature dimension).
# The classifier input is built from the *_batch tensors (defined outside
# this section), not from lstm_top / fc8_crop directly.
# NOTE(review): positional (axis, values) order matches the pre-1.0
# tf.concat signature -- confirm the TensorFlow version this targets.
feat_all = tf.concat(1, [
    tf.nn.l2_normalize(lstm_top_batch, 1),
    tf.nn.l2_normalize(fc8_crop_batch, 1),
    spatial_batch
])

# Outputs
# MLP Classifier over concatenate feature
with tf.variable_scope('classifier'):
    mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
    mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
scores = mlp_l2

# Load pretrained model
snapshot_saver = tf.train.Saver()
sess = tf.Session()
snapshot_saver.restore(sess, pretrained_model)

################################################################################
# Load annotations and bounding box proposals
################################################################################

# Each file is read inside a `with` block so its handle is closed right
# after parsing instead of being left open.
with open(query_file) as _json_file:
    query_dict = json.load(_json_file)
with open(bbox_file) as _json_file:
    bbox_dict = json.load(_json_file)
with open(imcrop_file) as _json_file:
    imcrop_dict = json.load(_json_file)
def main(args):
    """Score image/text pairs with the pretrained model and print the results.

    Builds the evaluation graph, restores ``pretrained_model``, assembles
    positive and negative (image, text, label) samples from the COCO
    annotations, runs them through the network in batches of ``N`` and prints
    a running and a final ratio of correct predictions.

    Args:
        args: namespace providing ``concat``, ``multicrop``, ``classes`` and
            ``GPU_ID``.

    Reads the module-level names ``pretrained_model``, ``T``, ``N``,
    ``D_text``, ``D_im``, ``num_vocab``, ``embed_dim``, ``lstm_dim``,
    ``mlp_hidden_dims``, ``query_file``, ``caption_file`` and ``vocab_dict``.

    Raises:
        AssertionError: if the flag combination is unsupported.
    """
    ################################################################################
    # Validate input arguments
    ################################################################################
    assert not (args.concat and (not args.multicrop)), \
        "Cannot test concatenated labels on single image crop per batch."
    assert not (args.classes and args.concat), \
        "Cannot test concatenated labels when using image classes"
    assert not (args.classes and (not args.multicrop)), \
        "Cannot test on single image per batch when using image classes"

    # Initialize GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.GPU_ID

    # print mode
    print()
    print("Model:", pretrained_model)
    print("All crops per batch - True | First crop per batch - False:",
          args.multicrop)
    print("Concatenated captions - True | Simple captions - False:",
          args.concat)
    print("Image Classes - True | Image Descriptions - False:", args.classes)
    print()

    ################################################################################
    # Evaluation network
    ################################################################################

    # Inputs
    text_seq_batch = tf.placeholder(tf.int32, [T, N])
    imcrop_batch = tf.placeholder(tf.float32, [N, 224, 224, 3])
    lstm_top_batch = tf.placeholder(tf.float32, [N, D_text])
    fc8_crop_batch = tf.placeholder(tf.float32, [N, D_im])

    # Language feature (LSTM hidden state)
    lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                 lstm_dim)

    # Local image feature
    fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

    # L2-normalize the features and concatenate them along axis 1 (feature
    # dimension). The classifier reads the two feature placeholders, so the
    # features are extracted first and fed back in (see the batch loop).
    feat_all = tf.concat(axis=1, values=[
        tf.nn.l2_normalize(lstm_top_batch, 1),
        tf.nn.l2_normalize(fc8_crop_batch, 1)
    ])

    # Outputs
    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
    scores = mlp_l2

    # Load pretrained model
    snapshot_restorer = tf.train.Saver(None)
    sess = tf.Session()
    snapshot_restorer.restore(sess, pretrained_model)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################
    coco = COCO(query_file)
    coco_captions = COCO(caption_file)
    imgid_list = coco.getImgIds()
    catid_list = coco.getCatIds()

    ################################################################################
    # Load testing data
    ################################################################################
    testing_samples_pos = []
    testing_samples_neg = []
    num_imcrop = len(imgid_list)

    def random_other_caption(current_idx):
        """Return one random caption of a random image other than current_idx."""
        # NOTE(review): loops forever when there is only a single image.
        false_idx = current_idx
        while false_idx == current_idx:
            false_idx = randint(0, num_imcrop - 1)
        desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
        desc_idx = randint(0, len(desc_ids) - 1)
        return coco_captions.loadAnns(
            desc_ids[desc_idx])[0]['caption'].strip()

    # Gather a testing example per full image.
    for n_imcrop, img_id in enumerate(imgid_list):
        # get the descriptions of the image
        caption_ids = coco_captions.getAnnIds(imgIds=img_id)
        captions = [
            x['caption'].strip() for x in coco_captions.loadAnns(caption_ids)
        ]

        if args.concat:
            # append two positive captions; one with itself if only one present
            pos_desc = captions[0] + ' and ' + captions[-1]
            testing_samples_pos.append((img_id, pos_desc, 1))

            # form negative examples from captions of two randomly chosen
            # images that are not the current image
            neg_desc1 = random_other_caption(n_imcrop)
            neg_desc2 = random_other_caption(n_imcrop)

            # negative example: append two negative captions
            testing_samples_neg.append(
                (img_id, neg_desc1 + ' and ' + neg_desc2, 0))

            # negative example: one negative and one positive caption, in
            # both orders (captions are already stripped above)
            testing_samples_neg.append(
                (img_id, neg_desc1 + ' and ' + captions[0], 0))
            testing_samples_neg.append(
                (img_id, captions[0] + ' and ' + neg_desc1, 0))

        elif args.classes:
            # NOTE(review): upstream pycocotools `getCatIds` takes
            # catNms/supNms/catIds -- confirm the COCO class used here
            # accepts `imgIds`.
            img_catids = coco.getCatIds(imgIds=img_id)
            img_cat_names = [cat['name'] for cat in coco.loadCats(img_catids)]
            # NOTE(review): one negative is drawn per positive category;
            # confirm this nesting matches the intended sampling scheme.
            for category in img_cat_names:
                testing_samples_pos.append((img_id, category, 1))

                # form one negative example by choosing random category that
                # img is not in
                false_catid = img_catids[0]
                while false_catid in img_catids:
                    false_catid = catid_list[randint(0, len(catid_list) - 1)]
                false_cat_name = coco.loadCats(false_catid)[0]['name']
                testing_samples_neg.append((img_id, false_cat_name, 0))

        else:
            # NOTE(review): one negative is drawn per positive caption;
            # confirm this nesting matches the intended sampling scheme.
            for caption in captions:
                # append one positive sample per description
                testing_samples_pos.append((img_id, caption, 1))

                # form one negative example from a random caption of a
                # random image that is not the current image
                false_cap = random_other_caption(n_imcrop)
                testing_samples_neg.append((img_id, false_cap, 0))

    # Combine samples
    print('#pos=', len(testing_samples_pos))
    print('#neg=', len(testing_samples_neg))

    # TODO: Not exactly sure what your multicrop is testing here? Just removes the
    # positive examples from being tested? How is this useful?
    if args.multicrop:
        testing_samples = testing_samples_pos + testing_samples_neg
    else:
        testing_samples = testing_samples_neg

    print('#total testing examples=', len(testing_samples))
    # Trailing samples that do not fill a whole batch are not evaluated.
    num_batch = len(testing_samples) // N
    print('total batch number: %d' % num_batch)

    ################################################################################
    # Testing
    ################################################################################

    # Pre-allocate arrays
    imcrop_val = np.zeros((N, 224, 224, 3), dtype=np.float32)
    text_seq_val = np.zeros((T, N), dtype=np.int32)
    lstm_top_val = np.zeros((N, D_text))
    # np.bool_ instead of the `np.bool` alias, which NumPy 1.24 removed.
    label_val = np.zeros((N, 1), dtype=np.bool_)

    correct_predictions = 0
    total_predictions = 0

    # optimization for faster image loading
    last_img_id = -100
    last_imcrop = None

    for n_batch in range(num_batch):
        print('batch %d / %d' % (n_batch + 1, num_batch))
        batch_begin = n_batch * N
        batch_end = (n_batch + 1) * N

        # load and preprocess last image from previous batch
        first_img_id = testing_samples[max(batch_begin - 1, 0)][0]
        first_imname = coco.loadImgs(first_img_id)[0]['coco_url']
        first_im = skimage.io.imread(first_imname)
        first_imcrop = skimage.img_as_ubyte(
            skimage.transform.resize(first_im, [224, 224]))
        # A non-3-D (e.g. grayscale) reference image skips the whole batch.
        if len(np.shape(first_im)) != 3:
            continue

        batch_samples = testing_samples[batch_begin:batch_end]
        for idx, (img_id, description, label) in enumerate(batch_samples):
            # Preprocess image and caption
            if args.multicrop:
                # Optimization: do not reload image if it is the same as the
                # last one
                if img_id == last_img_id:
                    imcrop = last_imcrop
                else:
                    imname = coco.loadImgs(img_id)[0]['coco_url']
                    im = skimage.io.imread(imname)

                    # ignore grayscale images
                    # NOTE(review): the skipped slot keeps whatever the
                    # previous batch wrote there and is still scored below.
                    if len(np.shape(im)) != 3:
                        continue

                    imcrop = skimage.img_as_ubyte(
                        skimage.transform.resize(im, [224, 224]))
                    last_img_id = img_id
                    last_imcrop = imcrop
            else:
                imcrop = first_imcrop

            text_seq = text_processing.preprocess_sentence(
                description, vocab_dict, T)

            # Form batch
            text_seq_val[:, idx] = text_seq
            imcrop_val[idx, ...] = imcrop - vgg_net.channel_mean
            label_val[idx] = label

        # Extract visual feature
        fc8_crop_val = sess.run(fc8_crop, feed_dict={imcrop_batch: imcrop_val})

        # Extract language feature
        lstm_top_val[...] = sess.run(lstm_top,
                                     feed_dict={text_seq_batch: text_seq_val})

        # Compute scores per proposal
        scores_val = sess.run(scores,
                              feed_dict={
                                  lstm_top_batch: lstm_top_val,
                                  fc8_crop_batch: fc8_crop_val
                              })
        # Exactly the batch's rows (the former `+ 1` overshot by one).
        scores_val = scores_val[:batch_end - batch_begin, ...].reshape(-1)

        # Evaluate on bounding labels: a positive score predicts label 1.
        # Counting with plain ints keeps the running total a scalar instead
        # of a shape-(1,) array.
        hits = (scores_val > 0) == label_val[:len(scores_val), 0]
        correct_predictions += int(np.count_nonzero(hits))
        total_predictions += len(scores_val)

        print("%d correct predictions out of %d" %
              (correct_predictions, total_predictions))
        print(correct_predictions / total_predictions)

    # The session is no longer needed once all batches have been scored.
    sess.close()

    print('Final results on the whole test set')
    # Avoid ZeroDivisionError when no batch was evaluated.
    if total_predictions:
        final_ratio = float(correct_predictions) / total_predictions
    else:
        final_ratio = float('nan')
    result_str = 'recall = %0.4f \n' % final_ratio
    print(result_str)
def model_structure(self, sen_data, enc_data, dec_data, msk_data, vis_data,
                    batch_size, is_train, dropout=None):
    """Build the phrase encoder, proposal attention and phrase decoder graph.

    Args:
        sen_data: token ids looked up in the ``'word_embedding'`` table and
            encoded by the ``'enc_lstm'`` LSTM.
        enc_data: token ids looked up in the ``'enc_embedding'`` table and
            fed (with the attended visual feature) to the ``'dec_lstm'`` LSTM.
        dec_data: not used in this method.
        msk_data: summed over axis 1 to obtain the sequence lengths.
        vis_data: visual features of the proposals
            (presumably [batch_size, num_prop, img_feat_size] -- TODO confirm).
        batch_size: not used; ``self.batch_size`` is read instead.
        is_train: boolean tensor selecting between ``self.dropout`` and 1.0
            as the LSTM ``keep_prob``.
        dropout: ignored; it is overwritten from ``is_train`` immediately.

    Returns:
        Tuple ``(att_scores_t, dec_logits, vis_data)``: the attention scores
        reshaped to [batch_size, num_prop] (before the ReLU), the decoder
        logits of shape (batch_size*phrase_len) x vocab_size, and the
        ``vis_data`` argument unchanged.
    """
    n_batch = self.batch_size
    n_prop = self.num_prop
    d_lstm = self.lstm_dim
    d_img = self.img_feat_size
    d_hidden = self.hidden_size

    # keep_prob is self.dropout while training and 1.0 otherwise.
    dropout = tf.cond(is_train,
                      lambda: tf.cast(self.dropout, tf.float32),
                      lambda: tf.cast(1.0, tf.float32))

    seq_length = tf.reduce_sum(msk_data, 1)

    # ---- phrase encoder ----
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        word_table = tf.get_variable(
            "embedding", [self.vocab_size, d_lstm],
            tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(word_table, sen_data)

    # we encode phrase based on the last step of hidden states
    _enc_outputs, enc_states = lstm(
        'enc_lstm',
        embedded_seq,
        None,
        seq_length,
        output_dim=d_lstm,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=True,
        keep_prob=dropout,
        concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    sen_feat = tf.nn.l2_normalize(enc_states[-1].h, dim=1)

    # ---- attention over proposals ----
    vis_flat = tf.reshape(vis_data, [n_batch * n_prop, d_img])
    sen_4d = tf.reshape(sen_feat, [n_batch, 1, 1, d_lstm])
    vis_4d = tf.reshape(vis_flat, [n_batch, n_prop, 1, d_img])

    # Repeat the sentence feature for every proposal, then join on the
    # channel axis.
    sen_tiled = tf.tile(sen_4d, [1, n_prop, 1, 1])
    joint = tf.concat([sen_tiled, vis_4d], 3)

    proj_init = msr_init([1, 1, d_lstm + d_img, d_hidden])
    projected = conv("feat_proj", joint, 1, 1, d_hidden,
                     weights_initializer=proj_init)
    activated = tf.nn.relu(projected)

    score_init = msr_init([1, 1, d_hidden, 1])
    score_map = conv("att_conv", activated, 1, 1, 1,
                     weights_initializer=score_init)

    # Generate the visual attention feature
    att_scores_t = tf.reshape(score_map, [n_batch, n_prop])
    # ReLU is used as the attention weighting (a softmax alternative was
    # left disabled in the original code).
    att_weights = tf.nn.relu(att_scores_t)
    att_scores = tf.reshape(att_weights, [n_batch, n_prop, 1])

    # Weight every proposal feature and sum over the proposal axis.
    tiled_weights = tf.tile(att_scores, [1, 1, d_img])
    weighted = tf.multiply(vis_data, tiled_weights)
    vis_att_feat = tf.reduce_sum(weighted, 1)

    vis_encoded = fc_relu(
        "vis_enc",
        vis_att_feat,
        d_lstm,
        weights_initializer=tf.random_uniform_initializer(minval=-0.002,
                                                          maxval=0.002))
    vis_step = tf.reshape(vis_encoded, [n_batch, 1, d_lstm])

    # ---- phrase decoder ----
    # embedded_enc: batch_size x phrase_len x lstm_dim
    with tf.variable_scope('enc_embedding'), tf.device("/cpu:0"):
        dec_table = tf.get_variable(
            "embedding", [self.vocab_size, d_lstm],
            tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        embedded_enc = tf.nn.embedding_lookup(dec_table, enc_data)

    # The attended visual feature occupies the first step only; the
    # remaining phrase_len - 1 steps are zero-filled.
    zero_steps = tf.zeros((n_batch, self.phrase_len - 1, d_lstm))
    vis_steps = tf.concat([vis_step, zero_steps], 1)
    # dec_vis_embed = batch_size x phrase_len x (2*lstm_dim)
    dec_vis_embed = tf.concat([embedded_enc, vis_steps], 2)

    # dec_outputs: batch_size x phrase_len x lstm_dim
    dec_outs, _ = lstm(
        'dec_lstm',
        dec_vis_embed,
        None,
        seq_length,
        output_dim=d_lstm,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=True,
        keep_prob=dropout,
        concat_output=True,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    dec_flat = tf.reshape(dec_outs, [n_batch * self.phrase_len, d_lstm])

    # dec_logits: (batch_size*phrase_len) x vocab_size
    dec_logits = fc(
        'dec_logits',
        dec_flat,
        self.vocab_size,
        weights_initializer=tf.contrib.layers.xavier_initializer(
            uniform=True))

    return att_scores_t, dec_logits, vis_data