Exemple #1
0
    def _read(self, file_path: str):
        """Yield one instance per picked paragraph that contains an answer span.

        The question file named by ``file_path`` is loaded from the unfiltered
        tarball when ``'unfiltered'`` occurs in ``file_path``; otherwise it is loaded
        from ``qa/<file_path>`` inside the base tarball.  Evidence documents are
        always read from the base tarball: from ``evidence/web`` (listed under each
        question's ``'SearchResults'``) when ``'web'`` occurs in ``file_path``, and
        from ``evidence/wikipedia`` (listed under ``'EntityPages'``) otherwise.

        Paragraphs returned by ``self.pick_paragraphs`` for which
        ``util.find_valid_answer_spans`` finds no span are skipped.

        Both tarballs and every extracted member are opened in ``with`` blocks, so
        their handles are released as soon as they are no longer needed (for the
        base tarball: when this generator is exhausted or closed).
        """
        logger.info("Opening base tarball file at %s", self._base_tarball_path)
        with tarfile.open(cached_path(self._base_tarball_path), 'r') as base_tarball:
            if 'unfiltered' in file_path:
                logger.info("Opening unfiltered tarball file at %s", self._unfiltered_tarball_path)
                with tarfile.open(cached_path(self._unfiltered_tarball_path), 'r') as unfiltered_tarball:
                    logger.info("Loading question file from tarball")
                    with unfiltered_tarball.extractfile(file_path) as question_file:
                        data_json = json.loads(question_file.read().decode('utf-8'))
            else:
                logger.info("Loading question file from tarball")
                path = os.path.join('qa', file_path)
                with base_tarball.extractfile(path) as question_file:
                    data_json = json.loads(question_file.read().decode('utf-8'))

            # The evidence source depends only on ``file_path``, so choose it once
            # instead of re-testing the path for every question.
            if 'web' in file_path:
                results_key, evidence_dir = 'SearchResults', 'web'
            else:
                results_key, evidence_dir = 'EntityPages', 'wikipedia'

            logger.info("Reading the dataset")
            for question_json in data_json['Data']:
                question_text = question_json['Question']
                question_tokens = self._tokenizer.tokenize(question_text)

                evidence_files: List[List[str]] = []  # contains lines from each evidence file
                for result in question_json[results_key]:
                    member_path = os.path.join("evidence", evidence_dir, result['Filename'])
                    with base_tarball.extractfile(member_path) as evidence_file:
                        evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])

                answer_json = question_json['Answer']
                human_answers = [util.normalize_text(answer) for answer in answer_json.get('HumanAnswers', [])]
                answer_texts = answer_json['NormalizedAliases'] + human_answers
                for paragraph in self.pick_paragraphs(evidence_files, question_text, answer_texts):
                    paragraph_tokens = self._tokenizer.tokenize(paragraph)
                    token_spans = util.find_valid_answer_spans(paragraph_tokens, answer_texts)
                    if not token_spans:
                        # For now, we'll just ignore instances that we can't find answer spans for.
                        # Maybe we can do something smarter here later, but this will do for now.
                        continue
                    yield self.text_to_instance(question_text,
                                                paragraph,
                                                token_spans,
                                                answer_texts,
                                                question_tokens,
                                                paragraph_tokens)
    def _read(self, file_path: str):
        """Yield an ``Instance`` for every record whose pre-computed span passes the filters.

        ``file_path`` is resolved with ``cached_path`` and loaded as JSON.  The file
        at ``self._span_file_path`` is iterated line by line in lockstep with the
        records (``zip`` stops at the shorter of the two); each line must hold four
        space-separated values: ``start_idx end_idx passage_idx rouge_score``.

        A record is dropped when ``start_idx + 5 > end_idx`` or when
        ``rouge_score`` is not greater than 0.7.  Both checks run before any
        tokenization, so dropped records cost nothing beyond parsing their line.

        Each yielded instance carries the fields ``question``, ``passage`` (the
        tokens of all passages concatenated), ``passages_length`` (token count per
        passage), ``span_start``, ``span_end`` and ``correct_passage``.

        The span file is opened in a ``with`` block so its handle is released when
        this generator is exhausted or closed.
        """
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        with open(self._span_file_path) as span_file:
            logger.info("Reading the dataset")
            for data, best_span in zip(dataset, span_file):
                start_text, end_text, passage_text, rouge_text = best_span.strip().split(' ')
                start_idx, end_idx = int(start_text), int(end_text)
                passage_idx = int(passage_text)
                rouge_score = float(rouge_text)

                # Cheap filters first: spans covering fewer than six tokens and
                # spans without a score above 0.7 never produce an instance.
                if start_idx + 5 > end_idx or not rouge_score > 0.7:
                    continue

                tokenized_passages_list = [
                    self._tokenizer.tokenize(util.normalize_text(passage['passage_text']))
                    for passage in data['passages']
                ]
                passages_length = [len(tokens) for tokens in tokenized_passages_list]
                tokenized_passage = [
                    token for sublist in tokenized_passages_list
                    for token in sublist
                ]
                tokenized_question = self._tokenizer.tokenize(
                    util.normalize_text(data['query']))

                # NOTE(review): the index fields are created without a sequence
                # field (``None``); confirm downstream code does not need one.
                fields = {
                    'question': TextField(tokenized_question, self._token_indexers),
                    'passage': TextField(tokenized_passage, self._token_indexers),
                    'passages_length': ArrayField(np.asarray(passages_length)),
                    'span_start': IndexField(start_idx, None),
                    'span_end': IndexField(end_idx, None),
                    'correct_passage': LabelField(passage_idx, skip_indexing=True),
                }
                yield Instance(fields)
    def _read(self, file_path: str):
        """Yield instances built from a predicted span and from gold spans.

        ``file_path`` is resolved with ``cached_path`` and loaded as JSON; the file
        at ``self._span_file_path`` is loaded as JSON too, and its entries are
        paired positionally with the dataset records (``zip`` stops at the shorter
        of the two).  For every record this yields:

        * one instance whose span comes from running the extraction model on the
          question and the concatenation of all tokenized passages, then
        * one instance per gold span entry whose ``'score'`` is greater than 0.5,
          using the passage, start and end recorded in that entry.

        Every instance is assembled from its own fields, so instances that a
        caller keeps are not altered by later ones.

        Raises:
            AssertionError: if the predicted span start lies after its end.
        """
        # Imported here so this method does not rely on a module-level import of
        # ``load_archive`` being present.
        from allennlp.models.archival import load_archive

        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        with open(self._span_file_path) as span_file:
            gold_spans = json.load(span_file)

        # assumes self._extraction_model_path is set elsewhere in the class and
        # names a trained span-extraction archive -- TODO confirm
        archive = load_archive(self._extraction_model_path)
        model = archive.model
        p1_dataset_reader = DatasetReader.from_params(
            archive.config["dataset_reader"])
        p1_token_indexers = p1_dataset_reader._token_indexers

        logger.info("Reading the dataset")
        for data, best_span in zip(dataset, gold_spans):
            answer = data['answers'][0]
            question = data['query']

            tokenized_passages_list = [
                self._tokenizer.tokenize(util.normalize_text(passage['passage_text']))
                for passage in data['passages']
            ]
            cumulative_passages_length = np.cumsum(
                [len(tokens) for tokens in tokenized_passages_list])

            normalized_answer = None
            answer_tokens = []
            if answer is not None:
                normalized_answer = util.normalize_text(answer)
                answer_tokens = self._tokenizer.tokenize(normalized_answer)
            normalized_question = util.normalize_text(question)
            tokenized_question = self._tokenizer.tokenize(normalized_question)

            # A missing answer is represented by the two boundary symbols alone.
            tokenized_answer = [
                Token(START_SYMBOL), *answer_tokens, Token(END_SYMBOL)
            ]
            tokenized_passage = [
                token for sublist in tokenized_passages_list
                for token in sublist
            ]

            p1_instance = Instance({
                'question': TextField(tokenized_question, p1_token_indexers),
                'passage': TextField(tokenized_passage, p1_token_indexers)
            })
            outputs = model.forward_on_instance(p1_instance, -1)
            start_idx = outputs['span_start_idx']
            end_idx = outputs['span_end_idx']

            # First passage whose cumulative length exceeds the predicted start;
            # falls back to the last passage when none does.
            passage_idx = next(
                (position
                 for position, boundary in enumerate(cumulative_passages_length)
                 if start_idx < boundary),
                len(cumulative_passages_length) - 1)
            if passage_idx > 0:
                offset = cumulative_passages_length[passage_idx - 1]
                start_idx = start_idx - offset
                end_idx = end_idx - offset

            assert start_idx <= end_idx, "Span prediction does not make sense!!!"

            # yield instance from predicted span
            # NOTE(review): the span indices are made passage-relative above, yet
            # the concatenated passage is used here; confirm which is intended.
            yield self._build_span_instance(tokenized_question, tokenized_passage,
                                            tokenized_answer, int(start_idx),
                                            int(end_idx), normalized_question,
                                            normalized_answer)

            # yield instances from gold spans
            for item in best_span:
                if item['score'] > 0.5:
                    yield self._build_span_instance(
                        tokenized_question,
                        tokenized_passages_list[item['passage']],
                        tokenized_answer, item['start'], item['end'],
                        normalized_question, normalized_answer)

    def _build_span_instance(self, question_tokens, passage_tokens,
                             answer_tokens, span_start, span_end,
                             question_text, answer_text):
        """Assemble one ``Instance`` for a span, using freshly created fields.

        The evidence stored in the metadata is computed by ``self.get_evidence``
        from the same passage tokens and span boundaries as the index fields.
        """
        passage_field = TextField(passage_tokens, self._token_indexers)
        evidence = self.get_evidence(passage_tokens, int(span_start),
                                     int(span_end))
        return Instance({
            'question': TextField(question_tokens, self._token_indexers),
            'passage': passage_field,
            'span_start': IndexField(span_start, passage_field),
            'span_end': IndexField(span_end, passage_field),
            'answer': TextField(answer_tokens, self._token_indexers),
            'metadata': MetadataField({
                'evidence': evidence,
                'question_text': question_text,
                'answer_text': answer_text
            })
        })
Exemple #4
0
    def _read(self, file_path: str):
        """Yield instances for paragraphs in which an answer span can be located.

        When ``'unfiltered'`` occurs in ``file_path`` the question file is read
        from the unfiltered tarball under that exact name; otherwise it is read
        from ``qa/<file_path>`` in the base tarball.  Evidence documents always
        come from the base tarball: ``evidence/web`` with each question's
        ``'SearchResults'`` when ``'web'`` occurs in ``file_path``, else
        ``evidence/wikipedia`` with ``'EntityPages'``.

        Questions are iterated through ``Tqdm.tqdm``.  Paragraphs returned by
        ``self.pick_paragraphs`` are skipped when
        ``util.find_valid_answer_spans`` finds no span in them.

        The tarballs and the members extracted from them are managed by ``with``
        blocks, so no handle outlives its use (the base tarball is closed when
        this generator is exhausted or closed).
        """
        logger.info("Opening base tarball file at %s", self._base_tarball_path)
        with tarfile.open(cached_path(self._base_tarball_path),
                          'r') as base_tarball:
            if 'unfiltered' in file_path:
                logger.info("Opening unfiltered tarball file at %s",
                            self._unfiltered_tarball_path)
                with tarfile.open(cached_path(self._unfiltered_tarball_path),
                                  'r') as unfiltered_tarball:
                    logger.info("Loading question file from tarball")
                    with unfiltered_tarball.extractfile(
                            file_path) as question_file:
                        data_json = json.loads(
                            question_file.read().decode('utf-8'))
            else:
                logger.info("Loading question file from tarball")
                path = os.path.join('qa', file_path)
                with base_tarball.extractfile(path) as question_file:
                    data_json = json.loads(
                        question_file.read().decode('utf-8'))

            # Which evidence collection to use is fixed by ``file_path``, so it
            # is decided once rather than for every question.
            if 'web' in file_path:
                results_key, evidence_dir = 'SearchResults', 'web'
            else:
                results_key, evidence_dir = 'EntityPages', 'wikipedia'

            logger.info("Reading the dataset")
            for question_json in Tqdm.tqdm(data_json['Data']):
                question_text = question_json['Question']
                question_tokens = self._tokenizer.tokenize(question_text)

                # contains lines from each evidence file
                evidence_files: List[List[str]] = []
                for result in question_json[results_key]:
                    member_path = os.path.join("evidence", evidence_dir,
                                               result['Filename'])
                    with base_tarball.extractfile(
                            member_path) as evidence_file:
                        evidence_files.append([
                            line.decode('utf-8')
                            for line in evidence_file.readlines()
                        ])

                answer_json = question_json['Answer']
                human_answers = [
                    util.normalize_text(answer)
                    for answer in answer_json.get('HumanAnswers', [])
                ]
                answer_texts = answer_json['NormalizedAliases'] + human_answers
                for paragraph in self.pick_paragraphs(evidence_files,
                                                      question_text,
                                                      answer_texts):
                    paragraph_tokens = self._tokenizer.tokenize(paragraph)
                    token_spans = util.find_valid_answer_spans(
                        paragraph_tokens, answer_texts)
                    if not token_spans:
                        # For now, we'll just ignore instances that we can't find answer spans for.
                        # Maybe we can do something smarter here later, but this will do for now.
                        continue
                    yield self.text_to_instance(question_text, paragraph,
                                                token_spans, answer_texts,
                                                question_tokens,
                                                paragraph_tokens)