Example no. 1
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)
        logger.info("Reading the dataset")
        kept_count, skip_count = 0, 0
        for passage_id, passage_info in dataset.items():
            passage_text = passage_info["passage"]
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                answer_annotations = []
                if "answer" in question_answer:
                    answer_annotations.append(question_answer["answer"])
                if "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]

                instance = self.text_to_instance(question_text,
                                                 passage_text,
                                                 question_id,
                                                 passage_id,
                                                 answer_annotations,
                                                 passage_tokens)
                if instance is not None:
                    kept_count += 1
                    yield instance
                else:
                    skip_count += 1
        logger.info(f"Skipped {skip_count} questions, kept {kept_count} questions.")
Example no. 2
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        for passage_id, passage_info in tqdm(dataset.items()):
            passage_text = passage_info["passage"].strip()

            if self.wordpiece_numbers:
                word_tokens = split_tokens_by_hyphen(
                    self.number_tokenizer.tokenize(passage_text))
            else:
                word_tokens = self.tokenizer.tokenize(passage_text)
            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            curr_index = 0
            # Get all passage numbers
            for token in word_tokens:
                number = self.word_to_num(token.text)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                curr_index += num_wordpieces

            # Process questions from this passage
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                answer_annotations = []
                if "answer" in question_answer:
                    answer_type = get_answer_type(question_answer['answer'])
                    if self.answer_type is not None and get_answer_type(
                            answer_type) not in self.answer_type:
                        continue
                    answer_annotations.append(question_answer["answer"])
                if self.question_type is not None and get_question_type(
                        question_text, answer_type, question_answer['answer'],
                        self.max_count) not in self.question_type:
                    continue
                if self.use_validated and "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]
                instance = self.text_to_instance(
                    question_text, passage_text, passage_tokens,
                    numbers_in_passage, number_words, number_indices,
                    number_len, question_id, passage_id, answer_annotations)
                if instance is not None:
                    yield instance
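This reader filters questions by the category returned from get_answer_type, which is also not shown here. A plausible sketch follows, assuming the standard DROP answer dictionary with number, spans, and date fields; the exact category names used by these readers are an assumption.

def get_answer_type_sketch(answer: dict) -> str:
    # Hypothetical classification of a DROP answer annotation by which field is
    # populated; the real get_answer_type may use different category names.
    if answer.get("number", "") != "":
        return "number"
    if answer.get("spans"):
        return "spans" if len(answer["spans"]) == 1 else "multiple_span"
    if any(answer.get("date", {}).values()):
        return "date"
    return "none"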
Example no. 3
    def get_instance(self, passage_info):
        passage_text = passage_info["context"].strip()

        if self.wordpiece_numbers:
            word_tokens = split_tokens_by_hyphen(
                self.number_tokenizer.tokenize(passage_text))
        else:
            word_tokens = self.tokenizer.tokenize(passage_text)
        numbers_in_passage = []
        number_indices = []
        number_words = []
        number_len = []
        passage_tokens = []
        curr_index = 0
        # Get all passage numbers
        for token in word_tokens:
            number = self.word_to_num(token.text, True)
            wordpieces = self.tokenizer.tokenize(token.text)
            num_wordpieces = len(wordpieces)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(curr_index)
                number_words.append(token.text)
                number_len.append(num_wordpieces)
            passage_tokens += wordpieces
            curr_index += num_wordpieces

        # Process questions from this passage
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["qid"]
            question_text = question_answer["question"].strip()
            dataset = question_answer["dataset"]
            if len(question_answer["answers"]
                   ) != 0 and question_answer["answers"][0][0] != "":
                answer_annotations = question_answer["answers"]
            else:
                answer_annotations = None
            instance = self.text_to_instance(question_text, passage_text,
                                             passage_tokens,
                                             numbers_in_passage, number_words,
                                             number_indices, number_len,
                                             question_id, answer_annotations,
                                             dataset)

            yield instance
Example no. 4
def extract_passage_numbers(passage):
    word_tokens = split_tokens_by_hyphen(number_tokenizer.tokenize(passage))
    numbers_in_passage = []
    number_indices = []
    number_words = []
    number_len = []
    passage_tokens = []
    curr_index = 0
    # Get all passage numbers
    for token in word_tokens:
        number = _get_number_from_word(token.text)
        wordpieces = tokenizer.tokenize(token.text)
        num_wordpieces = len(wordpieces)
        if number is not None:
            numbers_in_passage.append(number)
            number_indices.append(token.idx)
            number_words.append(token.text)
            number_len.append(num_wordpieces)
        passage_tokens += wordpieces
        curr_index += num_wordpieces

    return numbers_in_passage, number_indices
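Example no. 4 depends on a module-level _get_number_from_word, and the class-based readers call an analogous self.word_to_num. A hedged sketch of such a converter, assuming the word2number package is used as a fallback for spelled-out numbers; the helper name and exact normalization below are assumptions.

from word2number.w2n import word_to_num as words_to_number


def get_number_from_word_sketch(word: str):
    # Hypothetical converter: try a plain int/float first (allowing thousands
    # separators), then fall back to spelled-out numbers such as "seven".
    text = word.lower().replace(",", "")
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return float(text)
    except ValueError:
        pass
    try:
        return words_to_number(text)
    except ValueError:
        return None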
Example no. 5
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        instances = []
        failed_passages = []

        for passage_id, passage_info in tqdm(dataset.items()):
            passage_text = passage_info["passage"].strip()

            if self.wordpiece_numbers:
                word_tokens = split_tokens_by_hyphen(
                    self.number_tokenizer.tokenize(passage_text))
            else:
                word_tokens = self.tokenizer.tokenize(passage_text)

            try:
                passage_spans = ([] if self.extract_spans is False
                                 else self.span_extractor.extract_passage_spans(
                                     word_tokens, self.spans_labels, self.span_max_length))
            except Exception:
                failed_passages.append(passage_id)
                continue

            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            curr_index = 0
            # Get all passage numbers
            for token in word_tokens:
                number = self.word_to_num(token.text)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if self.extract_spans and num_wordpieces > 1:
                    passage_spans = list(
                        map(
                            lambda span: adapt_span_by_wordpieces(
                                span, curr_index, num_wordpieces),
                            passage_spans))
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                curr_index += num_wordpieces

            # Process questions from this passage
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                answer_annotations = []
                if "answer" in question_answer:
                    answer_type = get_answer_type(question_answer['answer'])
                    if self.answer_type is not None and answer_type not in self.answer_type:
                        continue
                    if self.question_type is not None and get_question_type(
                            question_text, answer_type,
                            question_answer['answer'],
                            self.max_count) not in self.question_type:
                        continue
                    answer_annotations.append(question_answer["answer"])
                if self.use_validated and "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]
                instance = self.text_to_instance(
                    question_text, passage_text, passage_tokens, passage_spans,
                    numbers_in_passage, number_words, number_indices,
                    number_len, question_id, passage_id, answer_annotations)
                if instance is not None:
                    instances.append(instance)

        if self.extract_spans:
            with open(
                    file_path.split('/')[-1].split('.')[0] +
                    "_failed_passages.json", 'w+') as outfile:
                json.dump(failed_passages, outfile)
        return instances
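The span-extraction branch above calls adapt_span_by_wordpieces whenever a word expands into several wordpieces, but its implementation is not shown. Assuming spans are inclusive (start, end) pairs that must be kept aligned with the growing wordpiece sequence, the adjustment could look roughly like the following guess (not the actual helper):

def adapt_span_by_wordpieces_sketch(span, curr_index, num_wordpieces):
    # Guess at the intended behaviour: `span` is an inclusive (start, end) pair
    # over the wordpiece sequence built so far; when the word starting at
    # `curr_index` expands into `num_wordpieces` pieces, boundaries at or after
    # that position shift right by the extra pieces.
    shift = num_wordpieces - 1
    start, end = span
    if start > curr_index:
        start += shift
    if end >= curr_index:
        end += shift
    return start, end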
Example no. 6
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path, encoding="utf8") as dataset_file:
            dataset = json.load(dataset_file)

        if self.standardize_texts and self._is_training:
            dataset = standardize_dataset(dataset)

        instances_count = 0
        for passage_id, passage_info in tqdm(dataset.items()):
            passage_text = passage_info["passage"].strip()

            if self.wordpiece_numbers:
                # In this case we actually first use a basic `WordTokenizer`, where each token is
                # additionally split on any hyphen it contains.
                word_tokens = split_tokens_by_hyphen(self.number_tokenizer.tokenize(passage_text))
            else:
                word_tokens = self.tokenizer.tokenize(passage_text)

            # Auxiliary variables for handling numbers from the passage
            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            curr_index = 0

            # Get all passage numbers
            for token in word_tokens:
                # Wordpiece tokenization is done here.
                # In addition, every token recognized as a number is stored for arithmetic processing.
                number = self.word_to_num(token.text, self.improve_number_extraction)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                curr_index += num_wordpieces
            passage_tokens = fill_token_indices(passage_tokens, passage_text, self._uncased, self.basic_tokenizer, word_tokens)

            # Process questions from this passage
            for qa_pair in passage_info["qa_pairs"]:
                if 0 < self.max_instances <= instances_count:
                    return

                question_id = qa_pair["query_id"]
                question_text = qa_pair["question"].strip()
                
                answer_annotations: List[Dict] = list()
                specific_answer_type = None
                if 'answer' in qa_pair and qa_pair['answer']:
                    answer = qa_pair['answer']

                    specific_answer_type = get_answer_type(answer)
                    if specific_answer_type not in self.answer_types:
                        continue

                    answer_annotations.append(answer)

                if self.use_validated and "validated_answers" in qa_pair and qa_pair["validated_answers"]:
                    answer_annotations += qa_pair["validated_answers"]

                instance = self.text_to_instance(question_text,
                                                 passage_text,
                                                 passage_tokens,
                                                 numbers_in_passage,
                                                 number_words,
                                                 number_indices,
                                                 number_len,
                                                 question_id,
                                                 passage_id,
                                                 answer_annotations,
                                                 specific_answer_type)
                if instance is not None:
                    instances_count += 1
                    yield instance
Example no. 7
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         question_id: str = None,
                         passage_id: str = None,
                         answer_annotations: List[Dict] = None,
                         passage_tokens: List[Token] = None) -> Union[Instance, None]:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[: self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[: self.question_length_limit]

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # Currently we only use the first annotated answer here, but actually this doesn't affect
            # the training, because we only have one annotation for the train set.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

        if self.instance_format == "squad":
            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))
            return make_reading_comprehension_instance(question_tokens,
                                                       passage_tokens,
                                                       self._token_indexers,
                                                       passage_text,
                                                       valid_passage_spans,
                                                       # this `answer_texts` will not be used for evaluation
                                                       answer_texts,
                                                       additional_metadata={
                                                               "original_passage": passage_text,
                                                               "original_question": question_text,
                                                               "passage_id": passage_id,
                                                               "question_id": question_id,
                                                               "valid_passage_spans": valid_passage_spans,
                                                               "answer_annotations": answer_annotations})
        elif self.instance_format == "bert":
            question_concat_passage_tokens = question_tokens + [Token("[SEP]")] + passage_tokens
            valid_passage_spans = []
            for span in self.find_valid_spans(passage_tokens, tokenized_answer_texts):
                # This span is for `question + [SEP] + passage`.
                valid_passage_spans.append((span[0] + len(question_tokens) + 1,
                                            span[1] + len(question_tokens) + 1))
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append((len(question_concat_passage_tokens) - 1,
                                                len(question_concat_passage_tokens) - 1))
            answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                           "answer_passage_spans": valid_passage_spans}
            return self.make_bert_drop_instance(question_tokens,
                                                passage_tokens,
                                                question_concat_passage_tokens,
                                                self._token_indexers,
                                                passage_text,
                                                answer_info,
                                                additional_metadata={
                                                        "original_passage": passage_text,
                                                        "original_question": question_text,
                                                        "passage_id": passage_id,
                                                        "question_id": question_id,
                                                        "answer_annotations": answer_annotations})
        elif self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)
            # hack to guarantee minimal length of padded number
            numbers_in_passage.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage]

            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            valid_question_spans = \
                self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            if answer_type in ["number", "date"]:
                valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(numbers_in_passage,
                                                                                          target_numbers)
            if answer_type in ["number"]:
                # Currently we only support count number 0 ~ 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

            type_to_answer_map = {"passage_span": valid_passage_spans,
                                  "question_span": valid_question_spans,
                                  "addition_subtraction": valid_signs_for_add_sub_expressions,
                                  "counting": valid_counts}

            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
                return None

            answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                           "answer_passage_spans": valid_passage_spans,
                           "answer_question_spans": valid_question_spans,
                           "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                           "counts": valid_counts}

            return self.make_marginal_drop_instance(question_tokens,
                                                    passage_tokens,
                                                    numbers_as_tokens,
                                                    number_indices,
                                                    self._token_indexers,
                                                    passage_text,
                                                    answer_info,
                                                    additional_metadata={
                                                            "original_passage": passage_text,
                                                            "original_question": question_text,
                                                            "original_numbers": numbers_in_passage,
                                                            "passage_id": passage_id,
                                                            "question_id": question_id,
                                                            "answer_info": answer_info,
                                                            "answer_annotations": answer_annotations})
        else:
            raise ValueError(f"Expect the instance format to be \"drop\", \"squad\" or \"bert\", "
                             f"but got {self.instance_format}")
Example no. 8
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         question_id: str = None,
                         passage_id: str = None,
                         answer_annotations: List[Dict] = None,
                         passage_tokens: List[Token] = None) -> Union[Instance, None]:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            # [w1,w2,w3,...,wn]
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)    
            passage_tokens = self.word_tokenizer(passage_tokens)

        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        question_tokens = self.word_tokenizer(question_tokens)
                

        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[: self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[: self.question_length_limit]
        
        passage_question_tokens = [Token("[CLS]")] + passage_tokens + [Token("[SEP]")] + question_tokens + [Token("[SEP]")] + self.implicit_tokens

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # Currently we only use the first annotated answer here, but actually this doesn't affect
            # the training, because we only have one annotation for the train set.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            #answer_tokens = [Token(token) for token in answer_tokens]
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            answer_tokens = self.word_tokenizer(answer_tokens)
            tokenized_answer_texts.append(' '.join([token.text for token in answer_tokens]))

        
        if self.instance_format == "force_add":
            numbers_in_passage_question = []
            number_indices = []
            for token_index, token in enumerate(passage_question_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage_question.append(number)
                    number_indices.append(token_index)
            
            # hack to guarantee minimal length of padded number
            numbers_in_passage_question.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage_question]
                    
            valid_passage_question_spans = \
                self.find_valid_spans(passage_question_tokens,tokenized_answer_texts) if tokenized_answer_texts else []

            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            
            #arithmetic answer
            if answer_type in ["number", "date"]:
                target_numbers = []
                for answer_text in answer_texts:
                    number = self.convert_word_to_number(answer_text)
                    if number is not None:
                        target_numbers.append(number)

                
                valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(numbers_in_passage_question,
                                                                                          target_numbers)
                #if not valid_signs_for_add_sub_expressions:            
                #    valid_signs_for_add_sub_expressions = self.find_valid_self_add_sub_expressions(numbers_in_passage_question,target_numbers,self.implicit_number)


            #count answer
            if answer_type in ["number"] :
                # Currently we only support count number 0 ~ 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)
            
            type_to_answer_map = {"spans": valid_passage_question_spans,
                                  "addition_subtraction": valid_signs_for_add_sub_expressions,
                                  "counting": valid_counts}
            
           
            
            
            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
           
                return None
            
            
            answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                           "answer_spans": valid_passage_question_spans,
                           "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                           "counts": valid_counts}


            return self.make_marginal_bert_drop_instance(passage_question_tokens,
                                                    #passage_tokens,
                                                    self.implicit_tokens,
                                                    numbers_as_tokens,
                                                    number_indices,
                                                    self._token_indexers,
                                                    passage_text,
                                                    answer_info,
                                                    additional_metadata={
                                                            "original_passage": passage_text,
                                                            "original_question": question_text,
                                                            "original_numbers": numbers_in_passage_question,
                                                            "passage_id": passage_id,
                                                            "question_id": question_id,
                                                            "answer_info": answer_info,
                                                            "answer_annotations": answer_annotations})
            

        else:
            raise ValueError(f"Expect the instance format to be \"bert_drop\", "
                             f"but got {self.instance_format}")
Example no. 9
    def dataset_iterator(self, single_file_path_cached, dataset):
        jsonl = open(single_file_path_cached, 'r')
        for passage_info in jsonl:
            passage_info = json.loads(passage_info)
            passage_text = passage_info["passage"].strip()

            if self.wordpiece_numbers:
                word_tokens = split_tokens_by_hyphen(
                    self.number_tokenizer.tokenize(passage_text))
            else:
                word_tokens = self.tokenizer.tokenize(passage_text)
            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            curr_index = 0
            # Get all passage numbers
            for token in word_tokens:
                number = self.word_to_num(token.text, True)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                curr_index += num_wordpieces

            # Process questions from this passage
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                if question_answer["answer"] != "impossible":
                    spans = question_answer["answer"]["spans"]
                    spans = [span.strip() for span in spans if span.strip()]
                    question_answer["answer"]["spans"] = spans
                    #dataset = question_answer.get("dataset", currentDataset)

                    answer_annotations = []
                    if "answer" in question_answer:
                        answer_annotations.append(question_answer["answer"])
                    if self.use_validated and "validated_answers" in question_answer:
                        answer_annotations += question_answer[
                            "validated_answers"]
                else:
                    answer_annotations = None
                instance = self.text_to_instance(question_text, passage_text,
                                                 passage_tokens,
                                                 numbers_in_passage,
                                                 number_words, number_indices,
                                                 number_len, question_id,
                                                 answer_annotations, dataset)
                if instance is not None:
                    self.numInstances += 1
                    self.dataset_numbers[dataset] = self.dataset_numbers.get(
                        dataset, 0) + 1
                    yield instance

        print('Dataset Numbers after finishing %s: ' % dataset,
              self.dataset_numbers)
Example no. 10
    def process_numbers(self,  # type: ignore
                        question_text: str,
                        passage_text: str,
                        question_id: str = None,
                        passage_id: str = None,
                        answer_annotations: List[Dict] = None,
                        passage_tokens: List[Token] = None) -> Union[Instance, None]:
        '''extract all numbers from passage and append to self.all_numbers; note this is done multiple times
        for each paragraph at the moment'''

        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            #logger.info("passage_length = %s", len(passage_tokens))
            passage_tokens = passage_tokens[: self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[: self.question_length_limit]
            #logger.info("question_length = %s", len(question_tokens))

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # Currently we only use the first annotated answer here, but actually this doesn't affect
            # the training, because we only have one annotation for the train set.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

        if self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)

            # hack to guarantee minimal length of padded number
            number_indices.append(-1)
            numbers_in_passage.append(0)

            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            valid_question_spans = \
                self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            # if answer_type in ["number", "date"]:
            if answer_type in ["number"]:
                valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(numbers_in_passage,
                                                                                          target_numbers)
            if answer_type in ["number"]:
                # Currently we only support count number 0 ~ 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

            type_to_answer_map = {"passage_span": valid_passage_spans,
                                  "question_span": valid_question_spans,
                                  "addition_subtraction": valid_signs_for_add_sub_expressions,
                                  "counting": valid_counts}

            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
                return None
            else:
                self.all_numbers = np.concatenate((self.all_numbers, np.array(numbers_in_passage)))
                num_numbers_in_passage = len(numbers_in_passage)
                if num_numbers_in_passage in self.num_numbers:
                    self.num_numbers[num_numbers_in_passage] += 1
                else:
                    self.num_numbers[num_numbers_in_passage] = 1
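The remaining helper these readers share for counting answers is find_valid_counts. A short sketch, assuming it simply returns the indices of the candidate counts (0 through 9 above) that equal one of the target numbers:

from typing import List


def find_valid_counts_sketch(count_numbers: List[int], targets: List[float]) -> List[int]:
    # Hypothetical helper: indices of candidate counts that match a target number.
    return [index for index, count in enumerate(count_numbers) if count in targets]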
Example no. 11
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading file at %s", file_path)

        if "train" in file_path:
            with open(file_path) as dataset_file:
                dataset = json.load(dataset_file)
            logger.info("Finding mean and sd")
            for passage_id, passage_info in dataset.items():
                passage_text = passage_info["passage"]
                passage_tokens = self._tokenizer.tokenize(passage_text)
                passage_tokens = split_tokens_by_hyphen(passage_tokens)

                for question_answer in passage_info["qa_pairs"]:
                    question_id = question_answer["query_id"]
                    question_text = question_answer["question"].strip()
                    answer_annotations = []
                    if "answer" in question_answer:
                        answer_annotations.append(question_answer["answer"])
                    if "validated_answers" in question_answer:
                        answer_annotations += question_answer["validated_answers"]

            # UNCOMMENT TO FIND NUMBER DISTRIBUTION

                    # self.process_numbers(question_text,
                    #                      passage_text,
                    #                      question_id,
                    #                      passage_id,
                    #                      answer_annotations,
                    #                      passage_tokens)

            # if self.all_numbers:
            #     self.mean = self.all_numbers.mean()
            # else:
            #     self.mean = 0
            # if len(self.all_numbers) > 1:
            #     self.sd = self.all_numbers.std()
            # else:
            #     self.sd = 1

            # logger.info("Mean of all numbers = %f \n", self.mean)
            # logger.info("Sd of all numbers = %f \n", self.sd)
            # logger.info("Number distribution %s", self.num_numbers)

        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)
        logger.info("Reading the dataset")
        kept_count, skip_count = 0, 0
        for passage_id, passage_info in dataset.items():
            passage_text = passage_info["passage"]
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                answer_annotations = []
                if "answer" in question_answer:
                    answer_annotations.append(question_answer["answer"])
                if "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]
                instance = self.text_to_instance(question_text,
                                                 passage_text,
                                                 question_id,
                                                 passage_id,
                                                 answer_annotations,
                                                 passage_tokens)
                if instance is not None:
                    kept_count += 1
                    yield instance
                else:
                    skip_count += 1
        logger.info(f"Skipped {skip_count} questions, kept {kept_count} questions.")
Example no. 12
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)
            
        passage_count = 0
        count_questions_prefixes = ["how many field goal", "how many touchdown", "how many pass", "how many times",
                                    "how many interception", "how many win", "how many of the"]
        for passage_id, passage_info in tqdm(dataset.items()):

            # passage_count += 1
            # if passage_count > 30:
            #     break

            passage_text = passage_info["passage"].strip()
            passages_sentences = self.sentence_tokenizer.split_sentences(passage_text)
            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            passage_sentence_tokens = []
            sentence_indices = []
            curr_index = 0

            for sentence_idx, sentence in enumerate(passages_sentences):
                if self.wordpiece_numbers:
                    word_tokens = split_tokens_by_hyphen(self.number_tokenizer.tokenize(sentence))
                else:
                    word_tokens = self.tokenizer.tokenize(sentence)

                sentence_tokens = []

                # Get all passage numbers
                for token in word_tokens:
                    number = self.word_to_num(token.text)
                    wordpieces = self.tokenizer.tokenize(token.text)
                    num_wordpieces = len(wordpieces)
                    if number is not None:
                        numbers_in_passage.append(number)
                        number_indices.append(curr_index)
                        number_words.append(token.text)
                        number_len.append(num_wordpieces)
                    passage_tokens += wordpieces
                    sentence_tokens += wordpieces
                    sentence_indices += [sentence_idx] * num_wordpieces
                    curr_index += num_wordpieces

                passage_sentence_tokens.append(sentence_tokens)
            
            # Process questions from this passage
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                if not any(question_text.lower().startswith(prefix) for prefix in count_questions_prefixes):
                    continue
                answer_annotations = []
                if "answer" in question_answer:
                    if self.answer_type is not None and get_answer_type(question_answer['answer']) not in self.answer_type:
                        continue
                    answer_annotations.append(question_answer["answer"])
                if self.use_validated and "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]
                instance = self.text_to_instance(question_text,
                                                 passage_text,
                                                 passage_tokens,
                                                 passage_sentence_tokens,
                                                 numbers_in_passage,
                                                 number_words,
                                                 number_indices,
                                                 number_len,
                                                 sentence_indices,
                                                 question_id,
                                                 passage_id,
                                                 answer_annotations)
                if instance is not None:
                    yield instance