Esempio n. 1
0
    def ans_tokenize(self, max_length=6):
        """Tokenize the ground-truth and multiple-choice answers of each entry.

        For every entry in ``self.entries`` this stores:

        * ``entry['ans_gt_token']`` -- the tokens of ``entry['ans_gt']``
        * ``entry['ans_mc_token']`` -- one token list per answer in
          ``entry['ans_mc']``, in the same order

        Each token list is cut to ``max_length`` items and, when shorter,
        extended at the end with ``self.dictionary.padding_idx``.

        Args:
            max_length: number of tokens kept for every answer.
        """
        def _fit(text):
            # Tokenize one answer and bring it to exactly max_length tokens.
            tokens = self.dictionary.tokenize(text, False)[:max_length]
            shortfall = max_length - len(tokens)
            if shortfall > 0:
                # Padding is appended after the tokens, not placed in front.
                tokens = tokens + [self.dictionary.padding_idx] * shortfall
            # Check the final length against max_length.
            utils.assert_eq(len(tokens), max_length)
            return tokens

        for entry in self.entries:
            entry['ans_gt_token'] = _fit(entry['ans_gt'])
            entry['ans_mc_token'] = [_fit(ans) for ans in entry['ans_mc']]
Esempio n. 2
0
def make_json(logits, qIds, dataloader):
    """Build one result record per row of ``logits``.

    Args:
        logits: object exposing ``size(0)`` and integer indexing; row ``i``
            is handed to ``get_answer``.
        qIds: question ids, one per row of ``logits``.
        dataloader: forwarded unchanged to ``get_answer``.

    Returns:
        A list of dicts with the keys ``'question_id'`` (``int``) and
        ``'answer'`` (whatever ``get_answer`` returns), in row order.
    """
    num_rows = logits.size(0)
    # Every row needs exactly one question id.
    utils.assert_eq(num_rows, len(qIds))
    return [
        {
            'question_id': int(qIds[row]),
            'answer': get_answer(logits[row], dataloader),
        }
        for row in range(num_rows)
    ]
Esempio n. 3
0
    def tokenize(self, max_length=14):
        """Tokenize every question into a fixed-length token list.

        Each entry in ``self.entries`` receives ``entry['q_token']``: the
        tokens of ``entry['question']`` cut to ``max_length`` items and, when
        shorter, extended at the end with ``self.dictionary.padding_idx``.

        Args:
            max_length: number of tokens kept for every question.
        """
        for item in self.entries:
            q_tokens = self.dictionary.tokenize(item['question'], False)
            q_tokens = q_tokens[:max_length]
            missing = max_length - len(q_tokens)
            if missing > 0:
                # Fill the tail of the sequence with the padding index.
                q_tokens = q_tokens + [self.dictionary.padding_idx] * missing
            utils.assert_eq(len(q_tokens), max_length)
            item['q_token'] = q_tokens
Esempio n. 4
0
def _load_TDIUC_dataset(dataroot, name, img_id2val, label2ans, teacher_logits):
    """Load the TDIUC question/answer entries of one split.

    Questions are read from ``<dataroot>/TDIUC_<name>_questions.json`` and
    answers from ``<dataroot>/cache/<name>_target.pkl``. Both lists are sorted
    by ``question_id`` and paired one-to-one.

    Args:
        dataroot: root path of the dataset.
        name: split name, 'train' or 'val'.
        img_id2val: dict {img_id -> val}; val can be used to retrieve the
            image or its features.
        label2ans: forwarded to ``is_howmany`` when ``COUNTING_ONLY`` is set.
        teacher_logits: container indexed by ``question_id``. When it is
            empty, ``None`` is passed to ``_create_entry`` instead.

    Returns:
        A list with the result of ``_create_entry`` for every kept question.
    """
    question_path = os.path.join(dataroot, 'TDIUC_%s_questions.json' % name)
    # Context managers make sure both files are closed after reading.
    with open(question_path) as question_file:
        questions = json.load(question_file)['questions']
    questions = sorted(questions, key=lambda x: x['question_id'])

    answer_path = os.path.join(dataroot, 'cache', '%s_target.pkl' % name)
    # NOTE: unpickling runs arbitrary code; only load caches you generated.
    with open(answer_path, 'rb') as answer_file:
        answers = cPickle.load(answer_file)
    answers = sorted(answers, key=lambda x: x['question_id'])

    utils.assert_eq(len(questions), len(answers))
    entries = []
    for question, answer in zip(questions, answers):
        # After sorting, the two lists must line up entry by entry.
        utils.assert_eq(question['question_id'], answer['question_id'])
        utils.assert_eq(question['image_id'], answer['image_id'])
        img_id = question['image_id']

        if not COUNTING_ONLY or is_howmany(question['question'], answer,
                                           label2ans):
            if len(teacher_logits) > 0:
                teacher = teacher_logits[question['question_id']]
            else:
                teacher = None
            entries.append(
                _create_entry(img_id2val[img_id], question, answer, teacher))

    return entries
Esempio n. 5
0
def _load_dataset(dataroot, name, img_id2val, label2ans, teacher_logits,
                  max_entries=10000):
    """Load the VQA v2 entries of one split.

    Questions are read from
    ``<dataroot>/v2_OpenEnded_mscoco_<split>_questions.json`` where ``<split>``
    is ``name + '2014'`` for train/val and ``name`` itself for test splits.
    For train/val, answers are read from ``<dataroot>/cache/<name>_target.pkl``
    and paired with the questions after sorting both by ``question_id``.

    Args:
        dataroot: root path of the dataset.
        name: 'train', 'val', 'test-dev2015' or 'test2015'. Any name starting
            with 'test' is handled as a test split (no answers are loaded).
        img_id2val: dict {img_id -> val}; val can be used to retrieve the
            image or its features.
        label2ans: sequence/mapping from answer label to answer string.
        teacher_logits: container indexed by ``question_id``. When it is
            empty, ``None`` is passed to ``_create_entry`` instead. Only used
            for train/val.
        max_entries: upper bound on the number of train/val question/answer
            pairs that are processed. The default keeps the previous
            hard-coded limit of 10000; pass ``None`` to process all of them.
            Test splits are never truncated.

    Returns:
        A list with the result of ``_create_entry`` for every kept question.
    """
    is_test = name.startswith('test')
    split = name if is_test else name + '2014'
    question_path = os.path.join(
        dataroot, 'v2_OpenEnded_mscoco_%s_questions.json' % split)
    # Context managers make sure the files are closed after reading.
    with open(question_path) as question_file:
        questions = json.load(question_file)['questions']
    questions = sorted(questions, key=lambda x: x['question_id'])

    entries = []
    if is_test:
        for question in questions:
            img_id = question['image_id']
            if not COUNTING_ONLY or is_howmany(question['question'], None,
                                               None):
                # NOTE(review): four arguments here versus five in the
                # train/val call below -- confirm that _create_entry has a
                # default for its last parameter.
                entries.append(
                    _create_entry(img_id2val[img_id], question, None, None))
        return entries

    # train, val
    answer_path = os.path.join(dataroot, 'cache', '%s_target.pkl' % name)
    with open(answer_path, 'rb') as answer_file:
        answers = cPickle.load(answer_file)
    answers = sorted(answers, key=lambda x: x['question_id'])

    # The length check runs on the full lists, before any truncation.
    utils.assert_eq(len(questions), len(answers))
    if max_entries is not None:
        questions = questions[:max_entries]
        answers = answers[:max_entries]

    for question, answer in zip(questions, answers):
        # After sorting, the two lists must line up entry by entry.
        utils.assert_eq(question['question_id'], answer['question_id'])
        utils.assert_eq(question['image_id'], answer['image_id'])
        img_id = question['image_id']
        if answer['scores']:
            # Use the answer string of the highest-scoring label.
            idx_ans = np.argmax(answer['scores'])
            ans = label2ans[answer['labels'][idx_ans]]
        else:
            ans = ''
        if not COUNTING_ONLY or is_howmany(question['question'], answer,
                                           label2ans):
            if len(teacher_logits) > 0:
                teacher = teacher_logits[question['question_id']]
            else:
                teacher = None
            entries.append(_create_entry(img_id2val[img_id], question, answer,
                                         ans, teacher))

    return entries
Esempio n. 6
0
def make_json_with_logits(logits, qIds):
    """Map every question id to its logits row converted to float16.

    Args:
        logits: tensor-like object exposing ``size(0)`` and integer indexing;
            each row must support ``detach().cpu().numpy()``.
        qIds: question ids, one per row of ``logits``.

    Returns:
        A dict ``{int(question_id): np.float16(row)}``. If a question id
        occurs more than once, the last row wins.
    """
    num_rows = logits.size(0)
    # Every row needs exactly one question id.
    utils.assert_eq(num_rows, len(qIds))
    results = {}
    for i in range(num_rows):
        # .cpu() is required before .numpy() for tensors living on another
        # device; it is a no-op for tensors that are already in CPU memory.
        row = logits[i].detach().cpu().numpy()
        results[int(qIds[i])] = np.float16(row)
    return results