Example #1
    def _read(self, file_path: str):
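        # Read the TriviaQA question file out of the tarball, attach the matching
        # web or Wikipedia evidence documents, and yield one instance per
        # paragraph in which a valid answer span can be located.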
        logger.info("Opening base tarball file at %s", self._base_tarball_path)
        base_tarball = tarfile.open(cached_path(self._base_tarball_path), 'r')
        if 'unfiltered' in file_path:
            logger.info("Opening unfiltered tarball file at %s", self._unfiltered_tarball_path)
            unfiltered_tarball = tarfile.open(cached_path(self._unfiltered_tarball_path), 'r')
            logger.info("Loading question file from tarball")
            data_json = json.loads(unfiltered_tarball.extractfile(file_path).read().decode('utf-8'))
        else:
            logger.info("Loading question file from tarball")
            path = os.path.join('qa', file_path)
            data_json = json.loads(base_tarball.extractfile(path).read().decode('utf-8'))

        logger.info("Reading the dataset")
        for question_json in data_json['Data']:
            question_text = question_json['Question']
            question_tokens = self._tokenizer.tokenize(question_text)

            evidence_files: List[List[str]] = []  # contains lines from each evidence file
            if 'web' in file_path:
                for result in question_json['SearchResults']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(os.path.join("evidence", "web", filename))
                    evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])
            else:
                for result in question_json['EntityPages']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(os.path.join("evidence", "wikipedia", filename))
                    evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])

            answer_json = question_json['Answer']
            human_answers = [util.normalize_text(answer) for answer in answer_json.get('HumanAnswers', [])]
            answer_texts = answer_json['NormalizedAliases'] + human_answers
            for paragraph in self.pick_paragraphs(evidence_files, question_text, answer_texts):
                paragraph_tokens = self._tokenizer.tokenize(paragraph)
                token_spans = util.find_valid_answer_spans(paragraph_tokens, answer_texts)
                if not token_spans:
                    # For now, we'll just ignore instances that we can't find answer spans for.
                    # Maybe we can do something smarter here later, but this will do for now.
                    continue
                instance = self.text_to_instance(question_text,
                                                 paragraph,
                                                 token_spans,
                                                 answer_texts,
                                                 question_tokens,
                                                 paragraph_tokens)
                yield instance
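
The snippet above looks like the _read generator of a TriviaQA dataset reader in the AllenNLP style. Below is a minimal sketch of how such a reader might be driven; the class name, import path, constructor argument names, and tarball/file names are assumptions for illustration and are not taken from the example itself.

# Hypothetical driver for the reader shown above. The import path, class name,
# constructor argument names and file names are assumptions; adapt them to the
# actual reader class this method belongs to.
from allennlp.data.dataset_readers import TriviaQaReader  # assumed import path

reader = TriviaQaReader(
    base_tarball_path="triviaqa-rc.tar.gz",                 # assumed argument names
    unfiltered_tarball_path="triviaqa-unfiltered.tar.gz",
)

# _read is a generator: it yields one instance per (question, paragraph) pair
# for which a valid answer span was found. In practice the public read() wrapper
# would normally be used; calling _read directly mirrors the snippet above.
for instance in reader._read("web-train.json"):
    print(instance)
    break  # peek at the first instance only
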
Example #2
    def _read(self, file_path: str):
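        # Same reader as Example #1, but the question loop below is wrapped in a
        # Tqdm progress bar and the source is formatted to a narrower line width.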
        logger.info("Opening base tarball file at %s", self._base_tarball_path)
        base_tarball = tarfile.open(cached_path(self._base_tarball_path), 'r')
        if 'unfiltered' in file_path:
            logger.info("Opening unfiltered tarball file at %s",
                        self._unfiltered_tarball_path)
            unfiltered_tarball = tarfile.open(
                cached_path(self._unfiltered_tarball_path), 'r')
            logger.info("Loading question file from tarball")
            data_json = json.loads(
                unfiltered_tarball.extractfile(file_path).read().decode(
                    'utf-8'))
        else:
            logger.info("Loading question file from tarball")
            path = os.path.join('qa', file_path)
            data_json = json.loads(
                base_tarball.extractfile(path).read().decode('utf-8'))

        logger.info("Reading the dataset")
        for question_json in Tqdm.tqdm(data_json['Data']):
            question_text = question_json['Question']
            question_tokens = self._tokenizer.tokenize(question_text)

            evidence_files: List[List[str]] = []  # contains lines from each evidence file
            if 'web' in file_path:
                for result in question_json['SearchResults']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(
                        os.path.join("evidence", "web", filename))
                    evidence_files.append([
                        line.decode('utf-8')
                        for line in evidence_file.readlines()
                    ])
            else:
                for result in question_json['EntityPages']:
                    filename = result['Filename']
                    evidence_file = base_tarball.extractfile(
                        os.path.join("evidence", "wikipedia", filename))
                    evidence_files.append([
                        line.decode('utf-8')
                        for line in evidence_file.readlines()
                    ])

            answer_json = question_json['Answer']
            human_answers = [
                util.normalize_text(answer)
                for answer in answer_json.get('HumanAnswers', [])
            ]
            answer_texts = answer_json['NormalizedAliases'] + human_answers
            for paragraph in self.pick_paragraphs(evidence_files,
                                                  question_text, answer_texts):
                paragraph_tokens = self._tokenizer.tokenize(paragraph)
                token_spans = util.find_valid_answer_spans(
                    paragraph_tokens, answer_texts)
                if not token_spans:
                    # For now, we'll just ignore instances that we can't find answer spans for.
                    # Maybe we can do something smarter here later, but this will do for now.
                    continue
                instance = self.text_to_instance(question_text, paragraph,
                                                 token_spans, answer_texts,
                                                 question_tokens,
                                                 paragraph_tokens)
                yield instance
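
Both examples rely on the same tarball layout: question JSON files under qa/ (or at the top level of the unfiltered tarball) and evidence documents under evidence/web/ and evidence/wikipedia/. A quick way to inspect that layout, assuming a local copy named triviaqa-rc.tar.gz, is sketched below.

# List the first few members of the base tarball to confirm the qa/ and
# evidence/ directories the readers above expect. The tarball name is an
# assumption for illustration.
import tarfile

with tarfile.open("triviaqa-rc.tar.gz", "r") as tar:
    for member in tar.getmembers()[:10]:
        print(member.name)  # e.g. qa/web-train.json, evidence/web/..., evidence/wikipedia/...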