    def process(self, question_answer_evidence):
        metrics.Metrics.counter(METRICS_NAMESPACE, 'num_questions').inc()

        if self.generate_answers:
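            # Expand the gold answers with approximate matches mined from the
            # evidence text, so near-verbatim answer spans can be annotated.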
            oracle_answers = []
            for answer in question_answer_evidence.answer.values:
                oracle_answers.extend(
                    self.extractive_oracle.find_approximate_answers(
                        question_answer_evidence.evidence.text,
                        answer,
                        remove_all_stopwords_answers=True))
            metrics.Metrics.distribution(METRICS_NAMESPACE,
                                         'oracle_answers_per_question').update(
                                             len(oracle_answers))
            answer_set = question_answer_evidence.answer.make_answer_set(
                oracle_answers)
            normalized_answer_set = {
                normalize_answer(answer)
                for answer in answer_set
            }

        sentences = []
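        # Sentence-split the evidence; each sentence is annotated with entity
        # mentions (annotation type 1) and, when generating answers, with
        # answer spans (type 0).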
        for sentence in self._split_into_sentences(
                question_answer_evidence.evidence):
            sentence_obj = self._annotate_entities(sentence)
            metrics.Metrics.counter(METRICS_NAMESPACE, 'nltk_entities').inc(
                sentence_obj.num_annotations(1))
            if self.generate_answers:
                annotations = find_answer_annotations(sentence_obj.text,
                                                      answer_set)
                sentence_obj.annotations.extend(annotations)

            sentences.append(sentence_obj)

        big_document = data_utils.BertDocument(
            sentences=sentences,
            document_id=question_answer_evidence.question.id)
        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'num_sentences_per_question').update(
                                         len(sentences))
        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'doc_length_per_question').update(
                                         big_document.num_characters())

        if self.generate_answers:
            num_annotations = big_document.num_annotations(0)
            metrics.Metrics.distribution(
                METRICS_NAMESPACE,
                'num_annotations_per_question').update(num_annotations)
            if num_annotations == 0:
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_span_not_found').inc()
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER,
                    question_answer_evidence.to_json())
                return

        tokenized_big_document = data_utils.tokenize_document_for_bert(
            big_document, self.tokenizer)

        metrics.Metrics.distribution(
            METRICS_NAMESPACE, 'tokenized_doc_length_per_question').update(
                tokenized_big_document.num_tokens())

        tokenized_question = self._tokenize_text(
            question_answer_evidence.question.value)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'question_length').update(
                                         len(tokenized_question))

        filtered_annotations = []
        if self.generate_answers:
            for i, sentence in enumerate(tokenized_big_document.sentences):
                (should_update, annotations,
                 current_filtered_annotations) = self._verify_annotations(
                     sentence.annotations, normalized_answer_set)
                if should_update:
                    tokenized_big_document.sentences[
                        i].annotations = annotations

                    # pylint: disable=g-complex-comprehension
                    filtered_annotations.extend([
                        FilteredAnnotation(
                            question=question_answer_evidence.question,
                            answer=question_answer_evidence.answer,
                            annotation=annotation,
                            sentence=''.join(sentence.tokens))
                        for annotation in current_filtered_annotations
                    ])
                    metrics.Metrics.counter(
                        METRICS_NAMESPACE, 'num_filtered_annotations').inc(
                            len(current_filtered_annotations))

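        # Split sentences that exceed the per-document token budget;
        # min_tokens_for_graceful_split is set to half of that budget.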
        tokenized_big_document = data_utils.split_tokenized_sentences(
            tokenized_big_document,
            max_tokens=self._get_max_tokens_per_raw_doc(
                len(tokenized_question)),
            min_tokens_for_graceful_split=math.ceil(
                self._get_max_tokens_per_raw_doc(len(tokenized_question)) *
                0.5))

        if self.generate_answers:
            num_annotations = tokenized_big_document.num_annotations(0)
            metrics.Metrics.distribution(
                METRICS_NAMESPACE,
                'num_annotations_tokenized_per_question').update(
                    num_annotations)
            if num_annotations == 0:
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_not_found_tokenized').inc()
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER_TOKENIZED,
                    question_answer_evidence.to_json())
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS,
                    filtered_annotations)
                return
            else:
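                # Each strided block carries the question as a prefix and
                # overlaps its predecessor, so the fresh capacity per block
                # is block_length - block_overlap_length - question length.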
                approx_num_blocks = (
                    tokenized_big_document.num_tokens() /
                    (self.block_length - self.block_overlap_length -
                     len(tokenized_question)))
                if (num_annotations > self.max_num_annotations_per_block *
                        approx_num_blocks):
                    metrics.Metrics.counter(
                        METRICS_NAMESPACE,
                        'num_questions_with_too_many_answers').inc()
                    yield beam.pvalue.TaggedOutput(
                        MakeExampleOutput.TOO_MANY_ANSWERS,
                        question_answer_evidence.to_json())

                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS,
                    filtered_annotations)

        tokenized_documents = data_utils.split_tokenized_documents(
            tokenized_big_document,
            max_tokens=self._get_max_tokens_per_raw_doc(
                len(tokenized_question)),
            max_sentences=None)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'num_examples_per_question').update(
                                         len(tokenized_documents))
        if len(tokenized_documents) > 1:
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_too_large_evidence').inc()

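        # Pad the tokenized summary up to one full block so that it can be
        # tiled once per block in the emitted examples below.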
        if self.generate_summaries:
            tokenized_summary = self._tokenize_text(
                question_answer_evidence.evidence.summary)
            if len(tokenized_summary) < self.block_length:
                tokenized_summary.extend(
                    [self.padding_token_id] *
                    (self.block_length - len(tokenized_summary)))

        for tokenized_document in tokenized_documents:
            if (self.generate_answers
                    and tokenized_document.num_annotations(0) == 0):
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_not_found_splitted').inc()
                continue
            metrics.Metrics.counter(METRICS_NAMESPACE, 'num_examples').inc()
            tf_example = tokenized_document.to_tf_strided_large_example(
                overlap_length=self.block_overlap_length,
                block_length=self.block_length,
                padding_token_id=self.padding_token_id,
                prefix_token_ids=tokenized_question,
                max_num_annotations=self.max_num_annotations_per_block)
            if self.generate_summaries:
                num_blocks = len(
                    tf_example.features.feature['block_ids'].int64_list.value)
                tf_example.features.feature[
                    'summary_token_ids'].int64_list.value.extend(
                        tokenized_summary * num_blocks)
            yield tf_example

        metrics.Metrics.counter(METRICS_NAMESPACE,
                                'make_example_status.success').inc()
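A minimal sketch of how a DoFn like the one above could be wired into a Beam
pipeline; make_examples_fn and the PCollection names are assumptions for
illustration, not part of the original code:

import apache_beam as beam

# make_examples_fn is an already-constructed instance of the DoFn above.
outputs = (
    question_answer_evidence_pcoll
    | 'MakeExamples' >> beam.ParDo(make_examples_fn).with_outputs(
        MakeExampleOutput.NO_ANSWER,
        MakeExampleOutput.NO_ANSWER_TOKENIZED,
        MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS,
        MakeExampleOutput.TOO_MANY_ANSWERS,
        MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS,
        main='tf_examples'))
tf_examples = outputs['tf_examples']
no_answer_json = outputs[MakeExampleOutput.NO_ANSWER]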
    def process(self, element):
        yield data_utils.tokenize_document_for_bert(element, self.tokenizer)
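This minimal variant simply re-tokenizes already-constructed documents; it
emits no metrics or tagged outputs beyond the main stream.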
Example #3
    def process(self, question_answer_evidence):
        metrics.Metrics.counter(METRICS_NAMESPACE, 'num_questions').inc()

        if self.generate_answers:
            oracle_answers = []
            answer_set = question_answer_evidence.answer.make_answer_set(
                oracle_answers)
            normalized_answer_set = {
                evaluation.normalize_answer(answer)
                for answer in answer_set
            }

        tokenized_question = self._tokenize_text(
            question_answer_evidence.question.value)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'question_length').update(
                                         len(tokenized_question))

        filtered_annotations = []
        tf_examples = []
        num_answer_annotations = 0
        num_answer_annotations_tokenized = 0
        num_entity_annotations = 0
        num_entity_annotations_tokenized = 0

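        # Classify the question type first: yes/no answers are encoded via
        # the answer_type feature below, while span answers go through the
        # usual annotation search.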
        no_answer, yes_answer, yes_no_answer = False, False, False
        if question_answer_evidence.answer.values[0] == 'yes':
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_answer_type.yes').inc()
            yes_no_answer = True
            yes_answer = True
        if question_answer_evidence.answer.values[0] == 'no':
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_answer_type.no').inc()
            yes_no_answer = True
            no_answer = True
        if yes_no_answer:
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_answer_type.yes_no').inc()
        else:
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_answer_type.span').inc()

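        # Each evidence paragraph becomes its own single-sentence document
        # (see the assert below) and ultimately its own tf.Example.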
        for evidence in question_answer_evidence.evidence:
            sentence = self._split_into_sentences(evidence)
            sentence_obj = self._annotate_entities(sentence)
            metrics.Metrics.counter(METRICS_NAMESPACE, 'nltk_entities').inc(
                sentence_obj.num_annotations(1))

            if self.generate_answers and not yes_no_answer:
                annotations = find_answer_annotations(sentence_obj.text,
                                                      answer_set)
                sentence_obj.annotations.extend(annotations)

            document = data_utils.BertDocument(
                sentences=[sentence_obj],
                document_id=question_answer_evidence.question.id)

            num_entity_annotations += document.num_annotations(1)
            num_answer_annotations += document.num_annotations(0)

            tokenized_document = data_utils.tokenize_document_for_bert(
                document, self.tokenizer)

            metrics.Metrics.distribution(
                METRICS_NAMESPACE,
                'tokenized_doc_length_per_paragraph').update(
                    tokenized_document.num_tokens())

            if self.generate_answers and not yes_no_answer:
                assert len(tokenized_document.sentences) == 1
                (should_update, annotations,
                 current_filtered_annotations) = self._verify_annotations(
                     tokenized_document.sentences[0].annotations,
                     normalized_answer_set)
                if should_update:
                    tokenized_document.sentences[0].annotations = annotations
                    # pylint: disable=g-complex-comprehension
                    filtered_annotations.extend([
                        FilteredAnnotation(
                            question=question_answer_evidence.question,
                            answer=question_answer_evidence.answer,
                            annotation=annotation,
                            sentence=''.join(
                                tokenized_document.sentences[0].tokens))
                        for annotation in current_filtered_annotations
                    ])
                    metrics.Metrics.counter(
                        METRICS_NAMESPACE, 'num_filtered_annotations').inc(
                            len(current_filtered_annotations))

            num_entity_annotations_tokenized += (
                tokenized_document.num_annotations(1))
            num_answer_annotations_tokenized += (
                tokenized_document.num_annotations(0))

            tf_example = tokenized_document.to_tf_strided_large_example(
                overlap_length=self.block_overlap_length,
                block_length=self.block_length,
                padding_token_id=self.padding_token_id,
                prefix_token_ids=tokenized_question,
                max_num_annotations=self.max_num_annotations_per_block)

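            # Encode the answer type: 1 = "yes", 2 = "no", 0 = span answer.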
            if yes_answer:
                assert yes_no_answer
                assert not no_answer
                tf_example.features.feature[
                    'answer_type'].int64_list.value[:] = [1]
            elif no_answer:
                assert yes_no_answer
                assert not yes_answer
                tf_example.features.feature[
                    'answer_type'].int64_list.value[:] = [2]
            else:
                assert not yes_no_answer
                tf_example.features.feature[
                    'answer_type'].int64_list.value[:] = [0]

            if evidence.is_supporting_fact:
                tf_example.features.feature[
                    'is_supporting_fact'].int64_list.value[:] = [1]
            else:
                tf_example.features.feature[
                    'is_supporting_fact'].int64_list.value[:] = [0]

            tf_examples.append(tf_example)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'num_paragraphs_per_question').update(
                                         len(tf_examples))
        metrics.Metrics.distribution(
            METRICS_NAMESPACE, 'num_answer_annotations_per_question').update(
                num_answer_annotations)
        metrics.Metrics.distribution(
            METRICS_NAMESPACE, 'num_entity_annotations_per_question').update(
                num_entity_annotations)

        if (self.generate_answers and not yes_no_answer
                and num_answer_annotations == 0):
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'make_example_status.no_answer').inc()
            yield beam.pvalue.TaggedOutput(MakeExampleOutput.NO_ANSWER,
                                           question_answer_evidence.to_json())
            return

        metrics.Metrics.distribution(
            METRICS_NAMESPACE,
            'num_answer_tokenize_annotations_per_question').update(
                num_answer_annotations_tokenized)
        metrics.Metrics.distribution(
            METRICS_NAMESPACE,
            'num_entity_tokenize_annotations_per_question').update(
                num_entity_annotations_tokenized)
        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'num_filtered_annotations').update(
                                         len(filtered_annotations))

        if (self.generate_answers and not yes_no_answer
                and num_answer_annotations_tokenized == 0):
            metrics.Metrics.counter(
                METRICS_NAMESPACE,
                'make_example_status.no_answer_tokenized_annotations').inc()
            yield beam.pvalue.TaggedOutput(
                MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS,
                filtered_annotations)
            return

        yield beam.pvalue.TaggedOutput(
            MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS,
            filtered_annotations)

        if len(tf_examples) != 10:
            metrics.Metrics.counter(
                METRICS_NAMESPACE, 'num_not_10_paragraphs_per_question').inc()

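        # Merge the per-paragraph examples into one tf.train.Example by
        # concatenating every repeated feature across paragraphs.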
        tf_example = tf_examples[0]
        for i in range(1, len(tf_examples)):
            for name in tf_example.features.feature:
                repeated_values = get_repeated_values(name, tf_example)
                extension_values = list(
                    get_repeated_values(name, tf_examples[i]))
                repeated_values.extend(extension_values)
        metrics.Metrics.counter(METRICS_NAMESPACE,
                                'make_example_status.success').inc()
        yield tf_example
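The merge loop above relies on a get_repeated_values helper. A plausible
sketch of that helper (an assumption, not the actual project code) would
dispatch on whichever value list the tf.train.Feature proto has populated:

def get_repeated_values(name, tf_example):
    # Return the mutable repeated field backing the named feature.
    feature = tf_example.features.feature[name]
    kind = feature.WhichOneof('kind')  # 'int64_list', 'float_list', or 'bytes_list'
    if kind == 'int64_list':
        return feature.int64_list.value
    if kind == 'float_list':
        return feature.float_list.value
    return feature.bytes_list.value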
Example #4
    def process(self, question_answer_evidence):
        metrics.Metrics.counter(METRICS_NAMESPACE, 'num_questions').inc()

        if self.generate_answers:
            answer_set = question_answer_evidence.answer.make_answer_set()

        sentences = []
        for sentence in self._split_into_sentences(
                question_answer_evidence.evidence):
            sentence_obj = self._annotate_entities(sentence)
            metrics.Metrics.counter(METRICS_NAMESPACE, 'nltk_entities').inc(
                sentence_obj.num_annotations(1))
            if self.generate_answers:
                annotations = find_answer_annotations(sentence_obj.text,
                                                      answer_set)
                sentence_obj.annotations.extend(annotations)

            sentences.append(sentence_obj)

        big_document = data_utils.BertDocument(
            sentences=sentences,
            document_id=question_answer_evidence.question.id)
        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'doc_length_per_question').update(
                                         big_document.num_characters())

        if self.generate_answers:
            num_annotations = big_document.num_annotations(0)
            metrics.Metrics.distribution(
                METRICS_NAMESPACE,
                'num_annotations_per_question').update(num_annotations)
            if num_annotations == 0:
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_span_not_found').inc()
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER,
                    question_answer_evidence.to_json())
                return

        tokenized_big_document = data_utils.tokenize_document_for_bert(
            big_document, self.tokenizer)

        metrics.Metrics.distribution(
            METRICS_NAMESPACE, 'tokenized_doc_length_per_question').update(
                tokenized_big_document.num_tokens())

        tokenized_question = self._tokenize_question(
            question_answer_evidence.question.value)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'question_length').update(
                                         len(tokenized_question))

        filtered_annotations = []
        if self.generate_answers:
            for i, sentence in enumerate(tokenized_big_document.sentences):
                (should_update, annotations,
                 current_filtered_annotations) = self._verify_annotations(
                     sentence.annotations, answer_set)
                if should_update:
                    tokenized_big_document.sentences[
                        i].annotations = annotations

                    # pylint: disable=g-complex-comprehension
                    filtered_annotations.extend([
                        FilteredAnnotation(
                            question=question_answer_evidence.question,
                            answer=question_answer_evidence.answer,
                            annotation=annotation,
                            sentence=''.join(sentence.tokens))
                        for annotation in current_filtered_annotations
                    ])
                    metrics.Metrics.counter(
                        METRICS_NAMESPACE, 'num_filtered_annotations').inc(
                            len(current_filtered_annotations))

            num_annotations = tokenized_big_document.num_annotations(0)
            metrics.Metrics.distribution(
                METRICS_NAMESPACE,
                'num_annotations_tokenized_per_question').update(
                    num_annotations)
            if num_annotations == 0:
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_not_found_tokenized').inc()
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER_TOKENIZED,
                    question_answer_evidence.to_json())
                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.NO_ANSWER_TOKENIZED_FILTERED_ANNOTATIONS,
                    filtered_annotations)
                return
            else:
                approx_num_blocks = (
                    tokenized_big_document.num_tokens() /
                    (self.block_length - self.block_overlap_length -
                     len(tokenized_question)))
                if (num_annotations > self.max_num_annotations_per_block *
                        approx_num_blocks):
                    metrics.Metrics.counter(
                        METRICS_NAMESPACE,
                        'num_questions_with_too_many_answers').inc()
                    yield beam.pvalue.TaggedOutput(
                        MakeExampleOutput.TOO_MANY_ANSWERS,
                        question_answer_evidence.to_json())

                yield beam.pvalue.TaggedOutput(
                    MakeExampleOutput.SUCCESS_FILTERED_ANNOTATIONS,
                    filtered_annotations)

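        # Split documents that still exceed the token budget into several
        # smaller documents, each emitted as its own example below.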
        tokenized_documents = data_utils.split_tokenized_documents(
            tokenized_big_document,
            max_tokens=self._get_max_tokens_per_raw_doc(
                len(tokenized_question)),
            max_sentences=None)

        metrics.Metrics.distribution(METRICS_NAMESPACE,
                                     'num_examples_per_question').update(
                                         len(tokenized_documents))
        if len(tokenized_documents) > 1:
            metrics.Metrics.counter(METRICS_NAMESPACE,
                                    'num_too_large_evidence').inc()

        for tokenized_document in tokenized_documents:
            if (self.generate_answers
                    and tokenized_document.num_annotations(0) == 0):
                metrics.Metrics.counter(
                    METRICS_NAMESPACE,
                    'make_example_status.answer_not_found_splitted').inc()
                continue
            metrics.Metrics.counter(METRICS_NAMESPACE, 'num_examples').inc()
            yield tokenized_document.to_tf_strided_large_example(
                overlap_length=self.block_overlap_length,
                block_length=self.block_length,
                padding_token_id=self.padding_token_id,
                prefix_token_ids=tokenized_question,
                max_num_annotations=self.max_num_annotations_per_block)

        metrics.Metrics.counter(METRICS_NAMESPACE,
                                'make_example_status.success').inc()
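For reference, a worked instance of the block-count estimate used throughout
these examples (all numbers assumed for illustration):

# Each strided block spends block_overlap_length tokens repeating the
# previous block and len(question) tokens on the question prefix, leaving
# 512 - 128 - 32 = 352 fresh tokens per block. A 4096-token document
# therefore spans roughly 4096 / 352 ~= 11.6 blocks.
block_length = 512
block_overlap_length = 128
question_length = 32
document_tokens = 4096
approx_num_blocks = document_tokens / (
    block_length - block_overlap_length - question_length)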