Example #1
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer and gt_layout are in the data
        self.load_answer = ('answer' in self.imdb[0]) and (self.imdb[0]['answer'] is not None)
        self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and (self.imdb[0]['gt_layout_tokens'] is not None)
        if 'load_gt_layout' in data_params:
            self.load_gt_layout = data_params['load_gt_layout']
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(data_params['vocab_answer_file'])
        if not self.load_answer:
            print('imdb does not contain answers')
        if self.load_gt_layout:
            self.T_decoder = data_params['T_decoder']
            self.assembler = data_params['assembler']
            self.prune_filter_module = (data_params['prune_filter_module']
                                        if 'prune_filter_module' in data_params
                                        else False)
        else:
            print('imdb does not contain ground-truth layout')

        # load one feature map to peek its size
        feats = np.load(self.imdb[0]['feature_path'])
        self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
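For orientation, a hedged sketch of the inputs this constructor expects; the entry fields are the ones peeked above, while the paths, layout module names, and the DataReader class name are placeholders.

# Hypothetical inputs; all values below are placeholders.
imdb = [{
    'question_tokens': ['is', 'there', 'a', 'red', 'cube'],
    'answer': 'yes',
    'gt_layout_tokens': ['_Find', '_Exist'],   # illustrative module names
    'feature_path': './resnet152_c5_7x7/train2014/000000000139.npy',
}]
data_params = {
    'vocab_question_file': 'vocab_questions.txt',
    'vocab_answer_file': 'vocab_answers.txt',
    'T_encoder': 26,      # max question length in tokens
    'T_decoder': 13,      # max layout length, used since gt layout is present
    'assembler': None,    # the real code passes an Assembler instance
}
# reader = DataReader(imdb, data_params)   # 'DataReader' is an assumed name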
Example #2
    def __init__(self, scene_graph_file, vocab_name_file, vocab_attr_file,
                 max_num):
        print('Loading scene graph from %s' % scene_graph_file)
        with open(scene_graph_file) as f:
            self.SGs = json.load(f)
        print('Done')
        self.name_dict = text_processing.VocabDict(vocab_name_file)
        self.attr_dict = text_processing.VocabDict(vocab_attr_file)
        self.num_name = self.name_dict.num_vocab
        self.num_attr = self.attr_dict.num_vocab
        self.max_num = max_num
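A hedged usage sketch for this loader; the class name SceneGraphFeatureLoader is taken from the later examples in this section, and the file paths are placeholders.

loader = SceneGraphFeatureLoader('train_sceneGraphs.json',
                                 'vocab_names.txt', 'vocab_attrs.txt',
                                 max_num=100)
print(loader.num_name, loader.num_attr)  # vocab sizes via VocabDict.num_vocab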
Example #3
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer and gt_layout are in the data
        self.load_answer = ('answer' in self.imdb[0]
                            and self.imdb[0]['answer'] is not None)
        self.load_bbox = ('bbox' in self.imdb[0]
                          and self.imdb[0]['bbox'] is not None)
        self.load_gt_layout = (
            ('load_gt_layout' in data_params and data_params['load_gt_layout'])
            and ('gt_layout_tokens' in self.imdb[0]
                 and self.imdb[0]['gt_layout_tokens'] is not None))

        # Jiangnan: image understanding
        self.load_question = 'load_question' in self.imdb[0]
        # Jiangnan: question understanding
        self.load_sent_percent = 'sent_percent' in self.imdb[0]

        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        self.layout_dict = text_processing.VocabDict(
            data_params['vocab_layout_file'])
        if not self.load_answer:
            print('imdb does not contain answers')
        if not self.load_bbox:
            print('imdb does not contain bounding boxes')
        if self.load_gt_layout:
            self.T_decoder = data_params['T_decoder']
            # Prune multiple filter modules by default
            self.prune_filter_module = (data_params['prune_filter_module']
                                        if 'prune_filter_module' in data_params
                                        else True)
        else:
            print('imdb does not contain ground-truth layout')

        # load one feature map to peek its size
        feats = np.load(self.imdb[0]['feature_path'])
        self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
        if self.load_bbox:
            self.img_H = data_params['img_H']
            self.img_W = data_params['img_W']
            self.stride_H = self.img_H * 1. / self.feat_H
            self.stride_W = self.img_W * 1. / self.feat_W
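To make the stride computation concrete, a self-contained sketch that maps a pixel-space box onto the feature grid; all numbers are illustrative.

img_H, img_W = 448, 448
feat_H, feat_W = 14, 14
stride_H = img_H * 1. / feat_H      # 32.0 image pixels per feature cell
stride_W = img_W * 1. / feat_W      # 32.0
x1, y1, x2, y2 = 96, 64, 224, 160   # ground-truth box in pixels
grid_box = (int(x1 / stride_W), int(y1 / stride_H),
            int(x2 / stride_W), int(y2 / stride_H))  # (3, 2, 7, 5)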
Example #4
def main(FLAGS):
  """Main function.

  1. Extracts vocabularies from questions and answers.
  2. Creates and saves image dialog databases for train | valid | test splits.

  Args:
    FLAGS: Command-line options.
  """

  # Read the dataset.
  with open(FLAGS.json_path) as file_id:
    data = json.load(file_id)

  # Extract vocabulary and answer list.
  save_vocabularies(data['trainExamples'], FLAGS)

  # Extract mean and std of train images.
  save_mean_std_image(FLAGS)

  # Read the vocabulary files (questions | answers) and create objects
  vocab = text_processing.VocabDict(FLAGS.vocab_save_path)

  with open(FLAGS.answers_save_path, 'r') as file_id:
    ans_list = [ii.strip('\n') for ii in file_id.readlines()]

  # data splits
  for split in ['train', 'valid', 'test']:
    imdb_split = build_imdb(data, split, vocab, ans_list, FLAGS)
    save_path = os.path.join(FLAGS.imdb_save_path, 'imdb_%s.npy' % split)
    print('Saving imdb build: %s' % save_path)
    np.save(save_path, np.array(imdb_split))
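Since each split is saved as an object array of dicts, reading it back needs allow_pickle; the path below is a placeholder.

import numpy as np

imdb_train = np.load('imdb_train.npy', allow_pickle=True)
print(len(imdb_train), sorted(imdb_train[0].keys()))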
Example #5
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer is in the data
        self.load_answer = ('answer' in self.imdb[0])
        # peek one example to see whether bbox is in the data
        self.load_bbox = ('bbox' in self.imdb[0])
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        if not (self.load_answer or self.load_bbox):
            print('imdb has no answer labels or bbox. Using dummy labels.\n\n'
                  '**The final accuracy will be zero (no labels provided)**\n')

        # positional encoding
        self.add_pos_enc = data_params.get('add_pos_enc', False)
        self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
        assert self.pos_enc_dim % 4 == 0, \
            'positional encoding dim must be a multiple of 4'
        self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

        self.load_spatial_feature = data_params['load_spatial_feature']
        if self.load_spatial_feature:
            spatial_feature_dir = data_params['spatial_feature_dir']
            self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
            # load one feature map to peek its size
            x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
            self.spatial_D, self.spatial_H, self.spatial_W = x.shape
            # positional encoding
            self.pos_enc = self.pos_enc_scale * get_positional_encoding(
                self.spatial_H, self.spatial_W, self.pos_enc_dim)

        if self.load_bbox:
            self.img_H = data_params['img_H']
            self.img_W = data_params['img_W']
            self.stride_H = self.img_H * 1. / self.spatial_H
            self.stride_W = self.img_W * 1. / self.spatial_W
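The % 4 constraint exists because the encoding splits into (sin, cos) pairs along both spatial axes; below is a minimal sketch of such an encoding, assuming the repo's get_positional_encoding behaves similarly (its exact shape and frequency scheme may differ).

import numpy as np

def positional_encoding_sketch(H, W, dim):
    # (sin, cos) x (row, col) components each take dim // 4 channels
    assert dim % 4 == 0, 'dim must be a multiple of 4'
    c = dim // 4
    freqs = 1. / (10000. ** (np.arange(c) / c))   # (c,) geometric frequencies
    rows = np.arange(H)[:, None, None] * freqs    # (H, 1, c)
    cols = np.arange(W)[None, :, None] * freqs    # (1, W, c)
    rows = np.broadcast_to(rows, (H, W, c))
    cols = np.broadcast_to(cols, (H, W, c))
    return np.concatenate([np.sin(rows), np.cos(rows),
                           np.sin(cols), np.cos(cols)], axis=-1)  # (H, W, dim)

pos_enc = positional_encoding_sketch(14, 14, 64)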
Example #6
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer and gt_layout are in the data
        self.load_answer = ('valid_answers' in self.imdb[0]
                            and self.imdb[0]['valid_answers'])
        self.load_gt_layout = (
            ('load_gt_layout' in data_params and data_params['load_gt_layout'])
            and ('gt_layout_tokens' in self.imdb[0]
                 and self.imdb[0]['gt_layout_tokens'] is not None))
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        if not self.load_answer:
            print('imdb does not contain answers')
        self.T_decoder = data_params['T_decoder']
        self.layout_dict = text_processing.VocabDict(
            data_params['vocab_layout_file'])
        if self.load_gt_layout:
            # Prune multiple filter modules by default
            self.prune_filter_module = (data_params['prune_filter_module']
                                        if 'prune_filter_module' in data_params
                                        else True)
        else:
            print('imdb does not contain ground-truth layout')
        # Whether to load soft scores (targets for sigmoid regression)
        self.load_soft_score = ('load_soft_score' in data_params
                                and data_params['load_soft_score'])

        # load one feature map to peek its size
        feats = np.load(self.imdb[0]['feature_path'])
        self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
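A hedged sketch of an imdb entry this reader could consume; the soft-score fields mirror what extract_answers produces in Example #8 below, but the exact field names here are assumptions.

entry = {  # values are illustrative
    'feature_path': './resnet152_c5_7x7/train2014/000000000139.npy',
    'valid_answers': ['black', 'black', 'black and white'],
    'soft_score_inds': [12, 407],         # assumed indices into answer_dict
    'soft_score_target': [1.0, 1 / 3.],   # min(1, count / 3)
}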
Example #7
    def __init__(self, num_vocab, num_choices):
        super().__init__()
        if cfg.INIT_WRD_EMB_FROM_FILE:
            embeddingsInit = np.load(cfg.WRD_EMB_INIT_FILE)  # 2956 * 300
            assert embeddingsInit.shape == (num_vocab - 1, cfg.WRD_EMB_DIM)
        else:
            embeddingsInit = np.random.randn(num_vocab - 1, cfg.WRD_EMB_DIM)
        self.num_vocab = num_vocab  # 2957
        self.num_choices = num_choices  # 1845

        self.tokenizer = BertTokenizer.from_pretrained(
            '/home/xdjf/bert_config/bert-base-uncased')
        self.model = BertModel.from_pretrained(
            '/home/xdjf/bert_config/bert-base-uncased')
        self.name_dict = text_processing.VocabDict(cfg.VOCAB_NAME_FILE)
        name_embedding = self.reset_name_embedding()

        self.encoder = Encoder(embeddingsInit, name_embedding)
        self.lcgn = LCGN()
        #self.sema_lcgn = SemanLCGN()
        self.single_hop = SingleHop()
        self.classifier = Classifier(num_choices)
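A hedged sketch of what the tokenizer and model above do with one question, using the public checkpoint name instead of the local path (assumes a recent Hugging Face transformers).

from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
inputs = tokenizer('is the man wearing a hat?', return_tensors='pt')
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, 768)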
Example #8
import json
import os

import sys; sys.path.append('../../')  # NOQA
from util import text_processing
from collections import Counter

vocab_answer_file = './answers_vqa.txt'
annotation_file = '../vqa_dataset/Annotations/mscoco_%s_annotations.json'
question_file = '../vqa_dataset/Questions/OpenEnded_mscoco_%s_questions.json'
gt_layout_file = './gt_layout_%s_new_parse.npy'

image_dir = '../coco_dataset/images/%s/'
feature_dir = './resnet152_c5_7x7/%s/'

answer_dict = text_processing.VocabDict(vocab_answer_file)
valid_answer_set = set(answer_dict.word_list)


def extract_answers(q_answers):
    all_answers = [answer["answer"] for answer in q_answers]
    valid_answers = [a for a in all_answers if a in valid_answer_set]
    # build soft scores
    soft_score_inds = []
    soft_score_target = []
    valid_answer_counter = Counter(valid_answers)
    for k, v in valid_answer_counter.items():
        soft_score_inds.append(answer_dict.word2idx(k))
        soft_score_target.append(min(1., v / 3.))
    return all_answers, valid_answers, soft_score_inds, soft_score_target
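A worked example of the soft-score rule, assuming the vocabulary files exist and contain both 'yes' and 'no':

q_answers = [{'answer': 'yes'}] * 4 + [{'answer': 'no'}]
_, valid, inds, target = extract_answers(q_answers)
# 'yes' occurs 4 times -> min(1., 4 / 3.) == 1.0
# 'no' occurs once     -> min(1., 1 / 3.) ~= 0.33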
Example #9
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer and gt_layout are in the data
        self.load_answer = ('valid_answers' in self.imdb[0]
                            and self.imdb[0]['valid_answers'] is not None)
        self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]
                               and self.imdb[0]['gt_layout_tokens'] is not None)
        if 'load_gt_layout' in data_params:
            self.load_gt_layout = data_params['load_gt_layout']
        # decide whether or not to load gt textatt
        self.load_gt_txtatt = ('gt_txtatt' in self.imdb[0]
                               and self.imdb[0]['gt_txtatt'] is not None)
        if 'load_gt_txtatt' in data_params:
            self.load_gt_txtatt = data_params['load_gt_txtatt']

        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        self.num_choices = self.answer_dict.num_vocab
        if not self.load_answer:
            print('imdb does not contain answers')
        else:
            self.load_binary_labels = ('load_binary_labels' in data_params) \
                and data_params['load_binary_labels']
            if self.load_binary_labels:
                print('loading softmax and binary classification labels.')
            else:
                print('loading softmax labels (but not binary labels).')
        # if 'overriding_layout' is set in data_params, force self.load_gt_layout
        # to True and override the ground-truth layout
        self.overriding_layout = None
        if 'overriding_layout' in data_params:
            print('"overriding_layout" key is set in data_params')
            print('overriding all layout with:',
                  data_params['overriding_layout'])
            self.load_gt_layout = True
            self.load_gt_txtatt = False
            self.overriding_layout = data_params['overriding_layout']

        if self.load_gt_layout:
            self.T_decoder = data_params['T_decoder']
            self.assembler = data_params['assembler']
            # self.prune_filter_module = (data_params['prune_filter_module']
            #                             if 'prune_filter_module' in data_params
            #                             else False)
        else:
            print('imdb does not contain ground-truth layout, and'
                  ' "overriding_layout" is not set')

        if data_params.get('use_count_module'):
            print('Use Count module: all "how many" questions will use Count'
                  ' for the answer')
            self.use_count_module = True
        else:
            print('Not using Count module')
            self.use_count_module = False

        # load one feature map to peek its size
        feats = np.load(self.imdb[0]['feature_path'])
        self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
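A hedged sketch of the extra keys that drive the overriding branch above; '_Find' appears elsewhere in these examples, while '_Describe' is an illustrative module name.

data_params['overriding_layout'] = ['_Find', '_Describe']
data_params['T_decoder'] = 13          # required once load_gt_layout is forced
data_params['assembler'] = assembler   # assumed to be built elsewhere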
Example #10
  def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it.
    """

    self.imdb = imdb
    self.params = params
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')
    # Load answers
    with open(params['args']['answer_list_path'], 'r') as file_id:
      choices = [ii.strip('\n') for ii in file_id.readlines()]
      self.num_choices = len(choices)
      self.choices2ind = {ii: index for index, ii in enumerate(choices)}
      self.ind2choices = {index: ii for index, ii in enumerate(choices)}

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
      self.load_gt_layout = params['load_gt_layout']

    if self.load_gt_layout:
      self.T_decoder = params['max_dec_len']
      self.assembler = params['assembler']

    # load the mean of the images
    load_path = params['path'].split('/')[:-1] + ['train_image_mean.npy']
    load_path = '/'.join(load_path)
    print('Loading training image stats from: ' + load_path)
    img_stats = np.load(load_path, allow_pickle=True)[()]
    mean_img = img_stats['mean_img'].reshape([1, 1, -1])
    std_img = img_stats['std_img'].reshape([1, 1, -1])

    # read all the images
    images = {}
    print('Reading images..')
    # TODO: Change this back! (only every 3rd datum is read below)
    for datum in progressbar(self.imdb['data'][::3]):
      img_path = datum['image_path']

      if img_path not in images:
        cur_img = support.load_image(img_path)
        cur_img = (cur_img - mean_img) / std_img
        images[img_path] = cur_img

    self.images = images

    # get the shape from an arbitrary image
    self.img_size = next(iter(self.images.values())).shape

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
      self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
      self._construct_fact()
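The img_stats = np.load(load_path, allow_pickle=True)[()] idiom above unwraps a dict stored with np.save; a self-contained illustration (the 512 channels are illustrative):

import numpy as np

stats = {'mean_img': np.zeros(512), 'std_img': np.ones(512)}
np.save('train_image_mean.npy', stats)  # dict is wrapped in a 0-d object array
loaded = np.load('train_image_mean.npy', allow_pickle=True)[()]  # dict again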
Example #11
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']
        self.N_encoder = data_params['N_encoder']
        self.O_encoder = data_params['O_encoder']
        # peek one example to see whether answer is in the data
        self.load_answer = ('answer' in self.imdb[0])
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        if not self.load_answer:
            print('imdb has no answer labels. Using dummy labels.\n\n'
                  '**The final accuracy will be zero (no labels provided)**\n')

        #self.nlp = spacy.load('en_core_web_lg')
        # positional encoding
        self.add_pos_enc = data_params.get('add_pos_enc', False)
        self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
        assert self.pos_enc_dim % 4 == 0, \
            'positional encoding dim must be a multiple of 4'
        self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

        self.load_spatial_feature = False
        self.load_objects_feature = False
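        # note: unlike the two flags above, this one starts as True, so the
        # scene graph loader below is always constructed; the flag is cleared
        # further down when feature_type is not 'scene_graph'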
        self.load_scene_graph_feature = True
        feature_type = data_params['feature_type']
        if feature_type == 'spatial':
            self.load_spatial_feature = True
        elif feature_type == 'objects':
            self.load_objects_feature = True
        elif feature_type == 'scene_graph':
            self.load_scene_graph_feature = True
        else:
            raise ValueError('Unknown feature type: %s' % feature_type)

        if self.load_spatial_feature:
            spatial_feature_dir = data_params['spatial_feature_dir']
            self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
            # load one feature map to peek its size
            x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
            self.spatial_D, self.spatial_H, self.spatial_W = x.shape
            # positional encoding
            self.pos_enc = self.pos_enc_scale * get_positional_encoding(
                self.spatial_H, self.spatial_W, self.pos_enc_dim)

        if self.load_objects_feature:
            objects_feature_dir = data_params['objects_feature_dir']
            self.objects_loader = ObjectsFeatureLoader(objects_feature_dir)
            # load one feature map to peek its size
            self.objects_M = data_params.get('objects_max_num', 100)
            x, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
            _, self.objects_D = x.shape

        if self.load_scene_graph_feature:
            scene_graph_file = data_params['scene_graph_file']
            vocab_name_file = data_params['vocab_name_file']
            vocab_attr_file = data_params['vocab_attr_file']
            self.objects_M = data_params.get('objects_max_num', 100)
            self.scene_graph_loader = SceneGraphFeatureLoader(
                scene_graph_file,
                vocab_name_file,
                vocab_attr_file,
                max_num=self.objects_M)
            if feature_type == 'scene_graph':
                # load one feature map to peek its size
                x, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
                    self.imdb[0]['imageId'])
                _, self.objects_D = x.shape
            else:
                self.load_scene_graph_feature = False

        self.se_max_len = -1
        self.se_zero_len = 0
        self.se_count = Counter()
        self.stop_words = ['of', 'the', 'to', 'on', 'in', 'at', 'a', 'and']
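For reference, a hedged sketch of the feature-selection keys this reader accepts (one of three types; the directory name is a placeholder):

data_params.update({
    'feature_type': 'objects',   # or 'spatial' / 'scene_graph'
    'objects_feature_dir': './objects_feats/',
    'objects_max_num': 100,
})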
Example #12
  def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it.
    """

    self.imdb = imdb
    self.params = params
    self.fetch_options = self.params.get('fetch_options', False)
    self.preload_features = params['preload_features']
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # check if vgg features are to be used
    self.use_vgg = 'vgg' in self.params['feature_path']

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
      self.load_gt_layout = params['load_gt_layout']

    # decide whether or not to load gt textatt
    self.supervise_attention = params['supervise_attention']
    self.T_decoder = params['max_dec_len']
    self.assembler = params['assembler']

    # load one feature map to peek its size
    feats = np.load(self._adjust_image_dir(test_data['feature_path']))
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    if 'prog' in self.params['model']:
      # preload features
      if self.preload_features:
        img_paths = set([ii['feature_path'] for ii in self.imdb['data']])
        self.img_feats = {ii:np.load(ii) for ii in progressbar(img_paths)}

      # if VGG is to be used
      if self.use_vgg:
        # inform the dataloader to use self.img_feats
        self.preload_features = True
        img_paths = set([ii['feature_path'] for ii in self.imdb['data']])

        # first read the index file
        index_file = os.path.join(self.params['input_img'], 'img_id.json')
        with open(index_file, 'r') as file_id:
          index_data = json.load(file_id)

        # get the split -- either train / val (peek at any one path)
        split = next(iter(img_paths)).split('/')[-2][:-4]

        # read the features for that particular split
        self.img_index = {img_id: index for index, img_id
                          in enumerate(index_data[split])}
        feature_file = os.path.join(self.params['input_img'],
                                    'data_img_%s.h5' % split)
        key = 'images_test' if split == 'val' else 'images_train'
        self.img_feats = h5py.File(feature_file, 'r')[key]

        # check if all the images in img_paths are in img_index
        count = 0
        for ii in img_paths:
          img_id = '/'.join(ii.split('/')[-2:])
          if img_id.replace('npy', 'jpg') not in self.img_index:
            count += 1
        print('Missing: %d image features' % count)

        # adjust the feature sizes
        self.feat_H, self.feat_W, self.feat_D = self.img_feats.shape[1:]
        self.zero_feature = np.zeros((1,) + self.img_feats.shape[1:])

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
      self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
      self._construct_fact()
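A minimal sketch of reading the per-split feature file used in the VGG branch above; the file name follows 'data_img_%s.h5', and the val split maps to key 'images_test'.

import h5py

with h5py.File('data_img_val.h5', 'r') as fid:
    feats = fid['images_test']   # shape (N, feat_H, feat_W, feat_D)
    first = feats[0]             # feature map for one image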
Example #13
    def __init__(self, imdb, data_params):
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer is in the data
        self.load_answer = ('answer' in self.imdb[0])
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        if not self.load_answer:
            print('imdb does not contain answers')

        self.load_spatial_feature = False
        self.load_objects_feature = False
        self.load_scene_graph_feature = False
        feature_type = data_params['feature_type']
        if feature_type == 'spatial':
            self.load_spatial_feature = True
        elif feature_type == 'objects':
            self.load_objects_feature = True
        elif feature_type == 'scene_graph':
            self.load_scene_graph_feature = True
        else:
            raise ValueError('Unknown feature type: %s' % feature_type)

        if self.load_spatial_feature:
            spatial_feature_dir = data_params['spatial_feature_dir']
            self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
            # load one feature map to peek its size
            x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
            self.spatial_D, self.spatial_H, self.spatial_W = x.shape
            # positional encoding
            self.spatial_pos_enc_dim = data_params['spatial_pos_enc_dim']
            self.pos_enc = get_positional_encoding(self.spatial_H,
                                                   self.spatial_W,
                                                   self.spatial_pos_enc_dim)

        if self.load_objects_feature:
            objects_feature_dir = data_params['objects_feature_dir']
            self.objects_M = data_params['objects_max_num']
            self.objects_loader = ObjectsFeatureLoader(objects_feature_dir)
            # load one feature map to peek its size
            x, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
            _, self.objects_D = x.shape
            self.bbox_tile_num = data_params['bbox_tile_num']

        if self.load_scene_graph_feature:
            scene_graph_file = data_params['scene_graph_file']
            vocab_name_file = data_params['vocab_name_file']
            vocab_attr_file = data_params['vocab_attr_file']
            self.objects_M = data_params['objects_max_num']
            self.scene_graph_loader = SceneGraphFeatureLoader(
                scene_graph_file,
                vocab_name_file,
                vocab_attr_file,
                max_num=self.objects_M)
            # load one feature map to peek its size
            x, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
                self.imdb[0]['imageId'])
            _, self.objects_D = x.shape
            self.bbox_tile_num = data_params['bbox_tile_num']
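A hedged peek at the scene-graph features, mirroring the constructor above; the file paths, the imageId, and the names of the two extra return values are assumptions.

loader = SceneGraphFeatureLoader('val_sceneGraphs.json', 'vocab_names.txt',
                                 'vocab_attrs.txt', max_num=100)
feats, normalized_bbox, valid = loader.load_feature_normalized_bbox('2375429')
num_objects, objects_D = feats.shape   # at most max_num objects per image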
Example #14
def build_imdb(FLAGS):
  """Method to construct and save the image-database for the dataset
  """

  print('Building imdb for visdial split: %s' % FLAGS.visdial_file)
  qid2layout_dict = np.load(FLAGS.ques_prog_file, allow_pickle=True)[()]

  ques_att_file = FLAGS.ques_prog_file.replace('.layout', '.attention')
  ques_prog_att = np.load(ques_att_file, allow_pickle=True)[()]

  cap_progs = np.load(FLAGS.cap_prog_file, allow_pickle=True)[()]
  cap_att_file = FLAGS.cap_prog_file.replace('.layout', '.attention')
  cap_prog_att = np.load(cap_att_file, allow_pickle=True)[()]
  vocab = text_processing.VocabDict(FLAGS.vocab_file)

  # load the data
  with open(FLAGS.visdial_file, 'r') as file_id:
    vd_data = json.load(file_id)

  # load the reference data
  with open(FLAGS.coreference_file, 'r') as file_id:
    references = json.load(file_id)
    references = references['data']['dialogs']

  # coco_name = img_split + '2014'
  # img_root = os.path.abspath(image_dir % coco_name)
  # feat_root = os.path.abspath(feature_dir % coco_name)
  # img_name_format = 'COCO_' + coco_name + '_%012d'

  # process and tokenize all questions and answers
  tokenizer = lambda x, suff: [vocab.word2idx(ii) for ii in
                               word_tokenize(clean.clean_non_ascii(x + suff))]

  print('Tokenizing captions')
  caption_list = [ii['caption'] for ii in vd_data['data']['dialogs']]
  clean_cap = [tokenizer(cap, '') for cap in progressbar(caption_list)]
  max_cap_len = max([len(ii) for ii in clean_cap])

  cap_tokens = np.zeros((len(clean_cap), max_cap_len)).astype('int32')
  cap_tokens.fill(vocab.word2idx('<pad>'))
  cap_lens = np.zeros(len(clean_cap)).astype('int32')

  for q_id, tokens in progressbar(enumerate(clean_cap)):
    cap_lens[q_id] = len(tokens)
    cap_tokens[q_id, :cap_lens[q_id]] = np.array(tokens)

  print('Tokenizing questions')
  question_list = vd_data['data']['questions']
  clean_ques = [tokenizer(ques, '?') for ques in progressbar(question_list)]
  max_ques_len = max([len(ii) for ii in clean_ques])

  ques_tokens = np.zeros((len(clean_ques), max_ques_len)).astype('int32')
  ques_tokens.fill(vocab.word2idx('<pad>'))
  ques_lens = np.zeros(len(clean_ques)).astype('int32')

  for q_id, tokens in progressbar(enumerate(clean_ques)):
    ques_lens[q_id] = len(tokens)
    ques_tokens[q_id, :ques_lens[q_id]] = np.array(tokens)

  print('Tokenizing answers')
  answer_list = vd_data['data']['answers']
  clean_ans = [tokenizer(ans, '') for ans in progressbar(answer_list)]
  max_ans_len = max([len(ii) for ii in clean_ans])

  ans_tokens = np.zeros((len(clean_ans), max_ans_len)).astype('int32')
  ans_tokens.fill(vocab.word2idx('<pad>'))
  ans_lens = np.zeros(len(clean_ans)).astype('int32')

  ans_in = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
  ans_out = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
  ans_in.fill(vocab.word2idx('<pad>'))
  ans_out.fill(vocab.word2idx('<pad>'))
  start_token_id = vocab.word2idx('<start>')
  end_token_id = vocab.word2idx('<end>')
  ans_in[:, 0] = start_token_id

  for a_id, tokens in progressbar(enumerate(clean_ans)):
    ans_lens[a_id] = len(tokens)
    answer = np.array(tokens)
    ans_tokens[a_id, :ans_lens[a_id]] = answer
    ans_in[a_id, 1:ans_lens[a_id]+1] = answer
    ans_out[a_id, :ans_lens[a_id]] = answer
    ans_out[a_id, ans_lens[a_id]] = end_token_id

  ans_lens += 1

  imdb = {}
  # number of entries in the database
  num_dialogs = len(vd_data['data']['dialogs'])
  imdb['data'] = [None] * num_dialogs
  imdb['ans'], imdb['ans_len'] = ans_tokens, ans_lens
  imdb['ans_in'], imdb['ans_out'] = ans_in, ans_out
  imdb['ques'], imdb['ques_len'] = ques_tokens, ques_lens
  imdb['cap'], imdb['cap_len'] = cap_tokens, cap_lens
  imdb['cap_prog'], imdb['cap_prog_att'] = cap_progs, np.array(cap_prog_att)

  for dialog_id, datum in progressbar(enumerate(vd_data['data']['dialogs'])):
    img_id = datum['image_id']
    img_path = FLAGS.image_path_format % img_id
    feat_path = FLAGS.feature_path % img_id

    # compact bundle with all the information
    bundle = {'image_name': img_id, 'image_path': img_path,
              'feature_path': feat_path, 'caption_ind': dialog_id,
              'question_id': [], 'question_ind': [], 'answer_ind': [],
              'option_ind': [], 'gt_ind' : [], 'gt_layout_tokens': [],
              'gt_layout_att': []}

    # reference datum
    refer_datum = references[dialog_id]
    assert(refer_datum['image_id'] == img_id)
    # for each cluster, get the first mention
    clusters = {}
    caption_clusters = (refer_datum['caption_reference_clusters'] +
                        refer_datum['caption_coref_clusters'])
    for ii in caption_clusters:
      c_id = ii['cluster_id']
      clusters[c_id] = clusters.get(c_id, 'c')

    # each round
    for r_id in range(10): # assuming 10 rounds for now
      referrer = refer_datum['dialog'][r_id]
      for ii in referrer['question_reference_clusters']:
        c_id = ii['cluster_id']
        clusters[c_id] = clusters.get(c_id, 'q%d' % r_id)

      for ii in referrer['answer_reference_clusters']:
        c_id = ii['cluster_id']
        # to distinguish answer
        clusters[c_id] = clusters.get(c_id, 'a%d' % r_id)

    # bundle as questions in a conversation together
    num_refers = 0
    for r_id, round_data in enumerate(datum['dialog']):
      q_id = img_id * 10 + r_id

      bundle['question_id'].append(q_id)
      bundle['question_ind'].append(round_data['question'])
      bundle['answer_ind'].append(round_data['answer'])
      bundle['option_ind'].append(round_data['answer_options'])
      bundle['gt_ind'].append(round_data['gt_index'])

      # gt attention for parsed layout
      attention = np.array(ques_prog_att[round_data['question']])

      # check if references is non-empty and replace with _Refer
      layout = copy.deepcopy(list(qid2layout_dict[q_id]))
      referrer = refer_datum['dialog'][r_id]['question_referrer_clusters']
      if len(referrer) > 0:
        refer = referrer[0]
        # pick _Find module with max attention overlap
        max_overlap = (0, 0)
        for pos, token in enumerate(layout):
          if token == '_Find':
            start = max(attention[pos][0], refer['start_word'])
            end = min(attention[pos][1], refer['end_word'])
            overlap = max(0, end - start)
            if max_overlap[1] < overlap: max_overlap = (pos, overlap)

        # reset it to _Refer
        pos, _ = max_overlap
        layout[pos] = '_Refer'
        attention[pos] = [refer['start_word'], refer['end_word']]

        # get that cluster id, and corresponding history attention
        num_refers += 1
      bundle['gt_layout_tokens'].append(layout)

      # check for the words attending to
      ques_tokens = imdb['ques'][round_data['question']]
      ques_words = [vocab.idx2word(ii) for ii in ques_tokens]
      for index, pos in enumerate(attention):
        # zero out attention that spans a single stop word ('the', 'a', 'of', ...)
        try:
          if (pos[1] - pos[0]) == 1 and ques_words[pos[0]] in stop_words:
            attention[index] = [0, 0]
        except: pdb.set_trace()
      bundle['gt_layout_att'].append(attention)

    # record
    imdb['data'][dialog_id] = bundle
  return imdb
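A worked check of the span-overlap rule fixed in the loop above: question spans [2, 6) and [4, 9) share two tokens.

start = max(2, 4)              # 4
end = min(6, 9)                # 6
overlap = max(0, end - start)  # 2; disjoint spans give 0, never negative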