Exemple #1
0
 def __init__(self, directory, parts=None):
     """Set up the dataset from the CLEVR files found under ``directory``.

     The first training image supplies the world size, and a single pass
     over the training questions supplies the longest question length, the
     longest answer length and the sorted vocabulary that are handed to the
     base-class constructor.  Finally one ``clevr_util.clevr`` object is
     created per mode and stored in ``self.clevr``.
     """
     first_image = next(clevr_util.images_iter(directory=directory, parts=parts, mode='train'))
     world_size = tuple(first_image.shape[:2])

     self.question_size = 0
     self.answer_size = 0
     vocabulary = set()
     training_questions = clevr_util.questions_iter(directory=directory, parts=parts, mode='train')
     for _, raw_question, _, raw_answer in training_questions:
         question_tokens = util.string2tokens(string=raw_question)
         answer_tokens = util.string2tokens(string=raw_answer)
         # Track the longest sequences seen so far.
         if len(question_tokens) > self.question_size:
             self.question_size = len(question_tokens)
         if len(answer_tokens) > self.answer_size:
             self.answer_size = len(answer_tokens)
         vocabulary.update(question_tokens)
         vocabulary.update(answer_tokens)

     vector_sizes = {'question': self.question_size, 'answer': self.answer_size}
     super(CLEVRDataset, self).__init__(world_size=world_size, vectors=vector_sizes, words=sorted(vocabulary))

     # Build the complete mapping first so the attribute is assigned in one step.
     per_mode = dict()
     for mode in ('train', 'validation', 'test'):
         per_mode[mode] = clevr_util.clevr(directory=directory, parts=parts, mode=mode)
     self.clevr = per_mode
Exemple #2
0
 def realize(self, captions):
     """Turn caption objects into token lists via an external realizer.

     Every caption is converted to a DMRS with ``self.clause_dmrs``,
     paraphrased with ``self.post_processing`` and stripped of
     underspecifications.  The resulting MRS strings are written, one per
     line, to the stdin of the program at ``self.ace_path`` (started with
     the grammar at ``self.erg_path``).  Each non-empty stdout line is
     tokenized and written back into ``captions`` at the same position.

     Args:
         captions: Mutable sequence of captions; overwritten in place.

     Returns:
         The same ``captions`` object, now holding token sequences.

     Raises:
         AssertionError: If a stderr line does not match ``self.regex`` or
             the number of realized strings differs from ``len(captions)``.
         Exception: Whatever ``subprocess.Popen`` raised, re-raised after
             diagnostics have been printed.
     """
     try:
         ace = subprocess.Popen([self.ace_path, '-g', self.erg_path, '-1Te', '-r', 'root_strict'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except Exception as e:
         import sys
         from datetime import datetime
         print(datetime.now().strftime('%H:%M:%S'))
         # Only OSError carries ``strerror``; fall back to the exception
         # itself so the original error is re-raised instead of being
         # replaced by an AttributeError.
         print(getattr(e, 'strerror', e))
         print(sys.exc_info()[0])
         raise
     dmrs_list = list()
     mrs_list = list()
     for caption in captions:
         dmrs = self.clause_dmrs(caption)
         dmrs.apply_paraphrases(self.post_processing.values())
         dmrs.remove_underspecifications()
         dmrs_list.append(dmrs)
         mrs_list.append(dmrs.get_mrs() + '\n')
     stdout_data, stderr_data = ace.communicate(''.join(mrs_list).encode())
     stderr_data = stderr_data.decode('utf-8').splitlines()
     stdout_data = stdout_data.decode('utf-8').splitlines()

     def failure_report():
         # Built lazily: only evaluated when the assertion below fails.
         details = '\n'.join(
             '{}\n{}\n{}\n'.format(line, dmrs.dumps_xml().decode(), mrs)
             for line, dmrs, mrs in zip(stderr_data, dmrs_list, mrs_list)
             if not self.regex.match(line))
         try:
             # The count is parsed from the second-to-last stderr line
             # (presumably the realizer's success summary -- confirm).
             summary = stderr_data[-2]
             failures = len(captions) - int(summary[16:summary.index(' ', 16)])
         except (IndexError, ValueError):
             # Summary line missing or malformed: keep the report usable.
             failures = 'unknown'
         return '\n\n' + details + '\nFailures: {}\n'.format(failures)

     assert all(self.regex.match(line) for line in stderr_data), failure_report()
     caption_strings = [line for line in stdout_data if line]
     assert len(caption_strings) == len(captions)
     for n, caption in enumerate(caption_strings):
         captions[n] = util.string2tokens(string=caption)
     return captions
Exemple #3
0
    def realize(self, captions):
        """Turn caption objects into token lists via an external realizer.

        Every caption is converted to a DMRS with ``self.caption_dmrs``,
        paraphrased with ``self.post_processing`` and stripped of
        underspecifications.  The resulting MRS strings are written, one
        per line, to the stdin of the program at ``self.ace_path`` (started
        with the grammar at ``self.erg_path``).  stderr is scanned to count
        failures, and each non-empty stdout line is tokenized and written
        back into ``captions`` at the same position.

        Args:
            captions: Mutable sequence of captions; overwritten in place.

        Returns:
            The same ``captions`` object, now holding token sequences.

        Raises:
            AssertionError: If a trailing stderr line does not match
                ``self.final_regex`` or the number of realized strings
                differs from ``len(captions)``.
            SystemExit: If at least one caption was reported unsuccessful.
            Exception: Whatever ``subprocess.Popen`` raised, re-raised
                after diagnostics have been printed.
        """
        try:
            ace = subprocess.Popen(
                [self.ace_path, '-g', self.erg_path, '-1e', '-r', 'root_strict'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        except Exception as e:
            import sys
            from datetime import datetime
            print(datetime.now().strftime('%H:%M:%S'))
            # Only OSError carries ``strerror``; fall back to the exception
            # itself so the original error is re-raised instead of being
            # replaced by an AttributeError.
            print(getattr(e, 'strerror', e))
            print(sys.exc_info()[0])
            raise
        dmrs_list = list()
        mrs_list = list()
        for caption in captions:
            dmrs = self.caption_dmrs(caption=caption)
            dmrs = dmrs.apply_paraphrases(self.post_processing.values())
            dmrs.remove_underspecifications()
            dmrs_list.append(dmrs)
            mrs_list.append(dmrs.get_mrs() + '\n')
        stdout_data, stderr_data = ace.communicate(
            input=''.join(mrs_list).encode())
        stderr_data = stderr_data.decode('utf-8').splitlines()
        stdout_data = stdout_data.decode('utf-8').splitlines()

        failures = 0
        n = 0  # index of the caption the next status line refers to
        unexpected = False
        for line in stderr_data:
            if n == len(captions):
                # Every caption is accounted for; only final lines may follow.
                assert self.final_regex.match(line), line
                continue
            if self.successful_regex.match(line):
                if unexpected:
                    # An unrecognized line preceded this status line: show
                    # the input it belongs to.
                    print(dmrs_list[n].dumps_xml().decode())
                    print(mrs_list[n])
                    unexpected = False
                n += 1
            elif self.unsuccessful_regex.match(line):
                print(dmrs_list[n].dumps_xml().decode())
                print(mrs_list[n])
                failures += 1
                n += 1
            else:
                print('Unexpected: ' + line)
                unexpected = True
        if failures > 0:
            print('Failures: {}'.format(failures))
            # NOTE(review): terminates with status 0 even though captions
            # failed -- confirm this is intended.
            exit(0)

        caption_strings = [line for line in stdout_data if line]
        # Both outputs are lists of lines at this point, so join them before
        # concatenating; ``list + str`` would raise TypeError and hide the
        # assertion.
        assert len(caption_strings) == len(captions), (
            '\n'.join(stdout_data) + '\n' + '\n'.join(stderr_data))
        for n, caption in enumerate(caption_strings):
            captions[n] = util.string2tokens(string=caption)
        return captions
Exemple #4
0
 def __init__(self, directory):
     """Set up the dataset from the NLVR files found under ``directory``.

     The first training item yielded by ``nlvr_util.images_iter`` supplies
     the world size, and a single pass over the training descriptions
     supplies the longest description length and the sorted vocabulary
     that are handed to the base-class constructor.  Finally one
     ``nlvr_util.nlvr`` object is created per mode and stored in
     ``self.nlvr``.
     """
     first_item = next(nlvr_util.images_iter(directory=directory, mode='train'))
     world_size = tuple(first_item[1][0].shape[:2])

     self.description_size = 0
     vocabulary = set()
     training_descriptions = nlvr_util.descriptions_iter(directory=directory, mode='train')
     for _, _, raw_description, _ in training_descriptions:
         tokens = util.string2tokens(string=raw_description)
         # Track the longest description seen so far.
         if len(tokens) > self.description_size:
             self.description_size = len(tokens)
         vocabulary.update(tokens)

     vector_sizes = {'description': self.description_size}
     super(NLVRDataset, self).__init__(world_size=world_size, vectors=vector_sizes, words=sorted(vocabulary))

     # Build the complete mapping first so the attribute is assigned in one step.
     per_mode = dict()
     for mode in ('train', 'validation', 'test'):
         per_mode[mode] = nlvr_util.nlvr(directory=directory, mode=mode)
     self.nlvr = per_mode
def descriptions_iter(directory, mode):
    """Yield one parsed record per line of ``<directory>/<mode>/<mode>.json``.

    The mode ``'validation'`` is read from the ``dev`` split.  Each line is
    a JSON object; the generator yields
    ``(identifier, (world_model1, world_model2, world_model3), description,
    agreement)`` where ``description`` is the lower-cased sentence (with a
    trailing ``'.'`` appended if missing) passed through
    ``util.string2tokens`` and ``agreement`` is ``True`` for the label
    ``'true'``.

    Raises:
        AssertionError: If a record violates one of the format checks.
    """
    if mode == 'validation':
        mode = 'dev'
    path = os.path.join(directory, mode, mode + '.json')
    # The train split is expected to carry one eval per record, others five.
    expected_evals = 1 if mode == 'train' else 5
    with open(path, 'r') as filehandle:
        for raw_line in filehandle:
            record = json.loads(s=raw_line.strip())
            identifier = record['identifier']
            assert identifier[-2:] in ('-0', '-1', '-2', '-3')
            world_model1, world_model2, world_model3 = record['structured_rep']
            sentence = record['sentence'].lower()
            if sentence[-1] != '.':
                sentence += '.'
            tokens = util.string2tokens(string=sentence)
            label = record['label']
            assert label in ('true', 'false')
            assert len(record['evals']) == expected_evals
            assert len(record) == 5
            world_models = (world_model1, world_model2, world_model3)
            yield identifier, world_models, tokens, label == 'true'
Exemple #6
0
def _skip_past(filehandle, terminator):
    """Consume characters from ``filehandle`` up to and including ``terminator``.

    Raises:
        EOFError: If the file ends before ``terminator`` is seen.  At end of
            file ``read(1)`` keeps returning ``''``, so an unguarded
            ``while read(1) != terminator`` loop would never finish.
    """
    while True:
        char = filehandle.read(1)
        if char == terminator:
            return
        if not char:
            raise EOFError(
                'unexpected end of file while looking for {!r}'.format(
                    terminator))


def questions_iter(directory, mode, parts=None):
    """Yield the questions of one CLEVR split from its JSON file.

    The file ``<directory>/questions/CLEVR_<split>_questions.json`` is read
    incrementally: top-level entries before ``"questions"`` are skipped
    character by character, the question list itself is consumed through
    ``json_list_generator`` (defined elsewhere; presumably yields one dict
    per list element -- confirm), and entries after the list are skipped
    likewise.

    Args:
        directory: Root directory containing the ``questions`` folder.
        mode: ``'train'``, ``'validation'`` (read from the ``val`` split) or
            ``'test'``.
        parts: Optional mapping from mode to a suffix appended to the split
            name.

    Yields:
        ``(image_index, question, question_model, answer)`` where
        ``question`` and ``answer`` went through ``util.string2tokens``.  In
        test mode ``question_model`` is empty and the answer is built from
        the placeholder ``'[UNKNOWN]'``.

    Raises:
        AssertionError: If the file deviates from the expected layout.
        EOFError: If the file ends while a skipped entry is being scanned.
    """
    split = 'val' if mode == 'validation' else mode
    if parts is not None:
        split += parts[mode]
    path = os.path.join(directory, 'questions',
                        'CLEVR_{}_questions.json'.format(split))
    with open(path, 'r') as filehandle:
        chars = filehandle.read(2)
        assert chars == '{"'
        chars = filehandle.read(1)
        # Skip top-level ``"key": {...}, `` entries until the key starts
        # with 'q' (the "questions" list).
        while chars != 'q':
            _skip_past(filehandle, '"')
            chars = filehandle.read(3)
            assert chars == ': {'
            _skip_past(filehandle, '}')
            chars = filehandle.read(3)
            assert chars == ', "'
            chars = filehandle.read(1)
        chars = filehandle.read(11)
        assert chars == 'uestions": '
        image_index = 0
        for n, question_dict in enumerate(json_list_generator(fp=filehandle)):
            # Image indices may only stay the same or advance by one.
            if image_index != question_dict['image_index']:
                image_index += 1
                assert image_index == question_dict['image_index']
            question = question_dict['question'].lower()
            if question[-1] != '?':
                question += '?'
            question = util.string2tokens(string=question)
            if mode == 'test':
                question_model = dict()
                answer = '[UNKNOWN]'
            else:
                family = question_dict['question_family_index']
                program = question_dict['program']
                question_model = dict(family=family, program=program)
                answer = question_dict['answer'].lower()
            # ``numbers`` is defined elsewhere; answers found in it are
            # replaced by the mapped value before tokenization.
            if answer in numbers:
                answer = numbers[answer]
            answer = util.string2tokens(string=answer)
            assert question_dict['question_index'] == n
            assert question_dict['split'] == split
            assert question_dict[
                'image_filename'] == 'CLEVR_{}_{:0>6}.png'.format(
                    split, image_index)
            assert len(question_dict) == 8 or (mode == 'test'
                                               and len(question_dict) == 5)
            yield image_index, question, question_model, answer
        # Skip any ``, "key": {...}`` entries that follow the question list.
        chars = filehandle.read(1)
        while chars == ',':
            chars = filehandle.read(2)
            assert chars == ' "'
            _skip_past(filehandle, '"')
            chars = filehandle.read(3)
            assert chars == ': {'
            _skip_past(filehandle, '}')
            chars = filehandle.read(1)
        assert chars == '}'
        chars = filehandle.read()
        assert not chars