Ejemplo n.º 1
0
 def tokenize(self, max_length=config.max_question_len):
     """ Tokenize every question in the dataset.

         Adds a 'q_token' list of exactly max_length ids to each entry.
         Short sequences are padded at the FRONT with the dictionary's
         padding index; -1 represents nil and should be treated as
         padding_idx in the embedding layer.
     """
     for entry in self.entries:
         q_tokens = self.dictionary.tokenize(entry['question'], False)[:max_length]
         pad_len = max_length - len(q_tokens)
         if pad_len > 0:
             # Note: padding is prepended, not appended.
             q_tokens = [self.dictionary.padding_idx] * pad_len + q_tokens
         utils.assert_eq(len(q_tokens), max_length)
         entry['q_token'] = q_tokens
Ejemplo n.º 2
0
def _load_dataset(dataroot, name, img_id2val, label2ans):
    """Load entries for one dataset split.

    Args:
        dataroot: root path of the dataset.
        name: split name, one of 'train', 'val', 'test'.
        img_id2val: dict {img_id -> val}; val can be used to retrieve the
            image or its features.
        label2ans: answer-label-to-string mapping, forwarded to is_howmany
            when COUNTING_ONLY filtering is enabled.

    Returns:
        List of entry dicts produced by _create_entry, with samples and
        answers aligned by qid.
    """
    data_path = os.path.join(dataroot, name + 'set.json')
    # Use context managers so the file handles are closed deterministically
    # instead of leaking until garbage collection.
    with open(data_path) as f:
        samples = json.load(f)
    samples = sorted(samples, key=lambda x: x['qid'])

    answer_path = os.path.join(dataroot, 'cache', '%s_target.pkl' % name)
    with open(answer_path, 'rb') as f:
        answers = cPickle.load(f)
    answers = sorted(answers, key=lambda x: x['qid'])

    utils.assert_eq(len(samples), len(answers))
    entries = []
    for sample, answer in zip(samples, answers):
        # Both lists were sorted by qid above; verify they stayed aligned.
        utils.assert_eq(sample['qid'], answer['qid'])
        utils.assert_eq(sample['image_name'], answer['image_name'])
        img_id = sample['image_name']
        # When COUNTING_ONLY is set, keep only "how many ..." questions.
        if not COUNTING_ONLY or is_howmany(sample['question'], answer,
                                           label2ans):
            entries.append(_create_entry(img_id2val[img_id], sample, answer))

    return entries
Ejemplo n.º 3
0
def _load_dataset(cache_path, name, img_id2val):
    """Load entries for one dataset split.

    Args:
        cache_path: directory containing the '<name>_target.json' answer
            cache files.
        name: split name, one of 'train', 'val'; anything else is treated
            as the test split.
        img_id2val: dict {img_id -> val}; val can be used to retrieve the
            image or its features.

    Returns:
        List of entry dicts produced by _create_entry, with questions and
        answers aligned by question_id (dummy answers for the test split).
    """
    train, val, test = False, False, False
    if name == 'train':
        train = True
    elif name == 'val':
        val = True
    else:
        test = True
    question_path = utils.path_for(train=train,
                                   val=val,
                                   test=test,
                                   question=True)
    # Use context managers so the file handles are closed deterministically
    # instead of leaking until garbage collection.
    with open(question_path, 'r') as f:
        questions = json.load(f)
    if not config.cp_data:
        questions = questions['questions']
    questions = sorted(questions, key=lambda x: x['question_id'])
    if test:  # will be ignored anyway
        # Test split ships no targets; fabricate empty placeholder answers.
        answers = [{
            'image_id': 0,
            'question_id': 0,
            'question_type': '',
            'labels': [],
            'scores': []
        } for _ in range(len(questions))]
    else:
        answer_path = os.path.join(cache_path, '{}_target.json'.format(name))
        with open(answer_path, 'r') as f:
            answers = json.load(f)
        answers = sorted(answers, key=lambda x: x['question_id'])
        utils.assert_eq(len(questions), len(answers))

    entries = []
    for question, answer in zip(questions, answers):
        if not test:
            # Both lists were sorted by question_id; verify alignment.
            utils.assert_eq(question['question_id'], answer['question_id'])
            utils.assert_eq(question['image_id'], answer['image_id'])
        img_id = question['image_id']
        entries.append(_create_entry(img_id2val[img_id], question, answer))
    return entries