Example #1
 def _update_characterwise_target_labels(self, tokenizer, labeled_example,
                                         characterwise_target_labels):
     """Updates target_labels and characters."""
     prefix_without_whitespace = remove_whitespace_and_parse(
         labeled_example.prefix, tokenizer)
     labeled_text_without_whitespace = remove_whitespace_and_parse(
         labeled_example.selection, tokenizer)
     if len(labeled_text_without_whitespace) > 0:
         start = len(prefix_without_whitespace)
         end = start + len(labeled_text_without_whitespace) - 1
         characterwise_target_labels[start] = "B-%s" % labeled_example.label
         characterwise_target_labels[start + 1:end + 1] = (
             ["I-%s" % labeled_example.label] * (end - start))
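
As a quick illustration of the B-/I- slice assignment used in Example #1, the following self-contained sketch applies the same pattern to a toy sentence. The sentence length, span offset, and label name are invented for illustration and are not taken from the original code.

LABEL_OUTSIDE = "O"

# Hypothetical setup: a 10-character sentence whose labeled span covers the
# three characters starting at whitespace-free offset 4.
characterwise_target_labels = [LABEL_OUTSIDE] * 10
label = "TELEPHONE"
start, length = 4, 3
end = start + length - 1

characterwise_target_labels[start] = "B-%s" % label
characterwise_target_labels[start + 1:end + 1] = ["I-%s" % label] * (end - start)
print(characterwise_target_labels)
# ['O', 'O', 'O', 'O', 'B-TELEPHONE', 'I-TELEPHONE', 'I-TELEPHONE', 'O', 'O', 'O']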
Example #2
    def _get_unique_documents(self, tokenizer):
        """Provides an iterator over all documents, skipping duplicated text."""
        document = proto_document.Document()
        characters_per_sentence = set()
        with GFile(self.path, "rb") as src_file:
            msg_buf = src_file.read(_MAX_BINPROTO_PREFIX_LENGTH)
            while msg_buf:
                # Get the message length.
                msg_len, new_pos = _DecodeVarint32(msg_buf, 1)
                msg_buf = msg_buf[new_pos:]
                # Read the rest of the message.
                msg_buf += src_file.read(msg_len - len(msg_buf))
                document.ParseFromString(msg_buf)
                msg_buf = msg_buf[msg_len:]
                # Read the length prefix for the next message.
                msg_buf += src_file.read(_MAX_BINPROTO_PREFIX_LENGTH)

                characters = remove_whitespace_and_parse(
                    document.text, tokenizer)
                if (characters in characters_per_sentence
                        or len(characters) == 0):
                    continue
                characters_per_sentence.add(characters)

                yield self._convert_token_boundaries_to_codeunits(document)
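
The helpers used in Example #2 are defined elsewhere in the project. A plausible set of imports, assumed from the identifiers alone and not taken from the original module (proto_document and remove_whitespace_and_parse are project-internal and not sketched), would be:

# Assumed imports for the snippet above; not part of the original file.
from google.protobuf.internal.decoder import _DecodeVarint32  # returns (value, new_pos)
import tensorflow as tf

GFile = tf.io.gfile.GFile  # the snippet uses the bare name GFile

# Assumed value: enough bytes for a field tag plus a varint32 length prefix.
_MAX_BINPROTO_PREFIX_LENGTH = 10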
Example #3
 def _get_unique_text_and_characters(self, tokenizer):
     """Provides an iterator over unique texts and their characters."""
     characters_per_sentence = set()
     with GFile(self.path, "r") as file:
         for text in file:
             characters = remove_whitespace_and_parse(text, tokenizer)
             if (characters in characters_per_sentence
                     or len(characters) == 0):
                 continue
             characters_per_sentence.add(characters)
             yield text, characters
Example #4
 def get_words(self, tokenizer):
     """Returns all words as defined by the tokenizer."""
     words_per_sentence = []
     characters_per_sentence = set()
     for labeled_example in self.get_labeled_text(tokenizer):
         characters = remove_whitespace_and_parse(
             labeled_example.complete_text, tokenizer)
         if characters in characters_per_sentence:
             continue
         characters_per_sentence.add(characters)
         words = split_into_words(labeled_example.complete_text, tokenizer)
         words_per_sentence.append(words)
     return words_per_sentence
Example #5
    def get_characterwise_target_labels(self, tokenizer):
        """Returns a label for each character."""
        characterwise_target_labels_per_sentence = []
        characters_per_sentence = []
        for document in self._get_unique_documents(tokenizer):
            characters = remove_whitespace_and_parse(document.text, tokenizer)
            characterwise_target_labels = [LABEL_OUTSIDE] * len(characters)
            total_prefix = ""
            for labeled_example in self._get_labeled_text(
                    document, only_main_labels=True):
                assert labeled_example.suffix == ""
                total_prefix += labeled_example.prefix
                labeled_example = labeled_example._replace(prefix=total_prefix)
                self._update_characterwise_target_labels(
                    tokenizer, labeled_example, characterwise_target_labels)
                total_prefix += labeled_example.selection

            characterwise_target_labels_per_sentence.append(
                characterwise_target_labels)
            characters_per_sentence.append(characters)
        return (characterwise_target_labels_per_sentence,
                characters_per_sentence)
Example #6
    def get_labeled_text(self, tokenizer):
        """Provides an iterator over all labeled texts in the linkfragments.

        This cannot skip entries with duplicated text like similar methods in
        the other readers, because text may be duplicated if there are multiple
        labels. This is handled by the caller.
        """
        with GFile(self.path, "r") as file:
            for linkfragment in file:
                text, label_description = linkfragment.split("\t")
                prefix, remaining_text = text.split("{{{")
                labeled_text, suffix = remaining_text.split("}}}")

                prefix = prefix.strip()
                labeled_text = labeled_text.strip()
                label_description = label_description.strip()
                suffix = suffix.strip()

                if label_description == LF_ADDRESS_LABEL:
                    label = MAIN_LABEL_ADDRESS
                elif label_description == LF_TELEPHONE_LABEL:
                    label = MAIN_LABEL_TELEPHONE
                else:
                    label = LABEL_OUTSIDE

                text_without_braces = text.replace("{{{",
                                                   "").replace("}}}", "")
                text_without_braces = text_without_braces.strip()

                characters = remove_whitespace_and_parse(
                    text_without_braces, tokenizer)
                if len(characters) == 0:
                    continue

                yield LabeledExample(prefix=prefix,
                                     selection=labeled_text,
                                     suffix=suffix,
                                     complete_text=text_without_braces,
                                     label=label)
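
To make the parsing in Example #6 concrete, here is a small sketch of a single linkfragment line and the pieces the splits produce. The sentence and the "/address" label description are invented for illustration; only the "{{{...}}}" markers and the tab separator come from the code above.

# Hypothetical linkfragment line (content and label description made up).
linkfragment = "Visit us at {{{Main Street 5}}} tomorrow.\t/address\n"

text, label_description = linkfragment.split("\t")
prefix, remaining_text = text.split("{{{")
labeled_text, suffix = remaining_text.split("}}}")

print(prefix.strip())             # Visit us at
print(labeled_text.strip())       # Main Street 5
print(suffix.strip())             # tomorrow.
print(label_description.strip())  # /address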
Example #7
    def get_characterwise_target_labels(self, tokenizer):
        """Returns a label for each character."""
        characterwise_target_labels_per_sentence = []
        # The list is used to preserve the insertion order, the set to test for
        # containment.
        characters_per_sentence_list = []
        characters_per_sentence_set = set()
        characterwise_target_labels = []
        characters = ""
        prev_text = ""
        for labeled_example in self.get_labeled_text(tokenizer):
            if prev_text == labeled_example.complete_text:
                # The last entry will be updated and appended again below.
                del characterwise_target_labels_per_sentence[-1]
                del characters_per_sentence_list[-1]
            else:
                characters = remove_whitespace_and_parse(
                    labeled_example.complete_text, tokenizer)
                characterwise_target_labels = [LABEL_OUTSIDE] * len(characters)
                # The sentence could be a duplicate (not repeated because it has
                # multiple labels, but a completely separate entry). Those
                # should be ignored.
                if characters in characters_per_sentence_set:
                    prev_text = ""
                    continue

            self._update_characterwise_target_labels(
                tokenizer, labeled_example, characterwise_target_labels)

            characterwise_target_labels_per_sentence.append(
                characterwise_target_labels)
            characters_per_sentence_set.add(characters)
            characters_per_sentence_list.append(characters)
            prev_text = labeled_example.complete_text

        return (characterwise_target_labels_per_sentence,
                characters_per_sentence_list)
Example #8
def _get_predictions_from_lf_directory(lf_directory, raw_path, tokenizer):
    """Gets the characterwise labels from all .lftxt files in the directory.

    Args:
        lf_directory: Path to the directory. All contained .lftxt files are
            parsed.
        raw_path: Path to the file containing all sentences as they are used as
            the input for inference. Necessary to get the correct sentence
            order for the evaluation.
        tokenizer: Tokenizer. Necessary to split the text into words and to
            remove whitespace characters.

    Returns:
        List of characterwise target labels per sentence.
    """
    # Map sentences (as whitespace-free characters) to their labels.

    _, characters_per_sentence = get_file_reader(
        raw_path).get_characterwise_target_labels(tokenizer)
    characters_without_whitespace_per_sentence = [
        remove_whitespace_and_parse(characters, tokenizer)
        for characters in characters_per_sentence
    ]
    labeled_sentences = OrderedDict(
        (characters_without_whitespace,
         [LABEL_OUTSIDE] * len(characters_without_whitespace))
        for characters_without_whitespace in
        characters_without_whitespace_per_sentence)

    for file_name in os.listdir(lf_directory):
        if not file_name.endswith(".lftxt"):
            continue

        for labeled_example in get_file_reader(
                os.path.join(lf_directory,
                             file_name)).get_labeled_text(tokenizer):
            if labeled_example.label == LABEL_OUTSIDE:
                continue
            labeled_example = _unescape_backslashes(labeled_example)
            prefix_length = len(
                remove_whitespace_and_parse(labeled_example.prefix, tokenizer))
            label_length = len(
                remove_whitespace_and_parse(labeled_example.selection,
                                            tokenizer))
            assert label_length > 0

            characters = remove_whitespace_and_parse(
                labeled_example.complete_text, tokenizer)
            # If the .lftxt file was generated as the output of another model's
            # prediction, the tokenizer will have lowercased the [UNK] token.
            characters = characters.replace("[unk]", "[UNK]")
            characterwise_labels = labeled_sentences[characters]
            first_label_previously_unset = characterwise_labels[
                prefix_length] in [
                    LABEL_OUTSIDE,
                    "B-%s" % labeled_example.label.upper()
                ]
            other_labels_previously_unset = all(
                label in
                [LABEL_OUTSIDE,
                 "I-%s" % labeled_example.label.upper()]
                for label in characterwise_labels[prefix_length +
                                                  1:prefix_length +
                                                  label_length])
            if (not first_label_previously_unset) or (
                    not other_labels_previously_unset):
                # Because whitespace is removed, we cannot handle cases where
                # the only difference between two sentences is the whitespace.
                # This could be fixed by not removing whitespace. However,
                # removing whitespace significantly simplifies the alignment of
                # individual words to text (see the documentation of
                # remove_whitespace_and_parse).
                print(
                    "[WARNING] There are conflicting labels for the sentence "
                    "'%s'. This could be due to multiple versions of the same "
                    "sentence with different whitespace. Only the first label "
                    "will be used; the conflicting assignment is ignored." %
                    labeled_example.complete_text)
                continue
            characterwise_labels[
                prefix_length] = "B-%s" % labeled_example.label.upper()
            characterwise_labels[prefix_length + 1:prefix_length +
                                 label_length] = [
                                     "I-%s" % labeled_example.label.upper()
                                 ] * (label_length - 1)

    # The order is important, because it controls which label sequences are
    # compared in the evaluation. The usage of OrderedDict allows this.
    return list(labeled_sentences.values())
Example #9
    def get_examples(self, tokenizer, use_additional_labels,
                     use_gold_tokenization_and_include_target_labels):
        """Reads one file and returns a list of `InputExample` instances."""
        examples = []
        sentence_id = 0
        example = tagging_data_lib.InputExample(sentence_id=0)
        prev_text = ""
        characters_per_sentence = set()
        for labeled_example in self.get_labeled_text(tokenizer):
            if use_gold_tokenization_and_include_target_labels:
                if prev_text == labeled_example.complete_text:
                    # Recover the previous example object.
                    sentence_id -= 1
                    example = examples[-1]
                    del examples[-1]
                    prefix_word_length = len(
                        split_into_words(labeled_example.prefix, tokenizer))
                    if any(label_id != LABEL_ID_MAP[LABEL_OUTSIDE] for label_id
                           in example.label_ids[prefix_word_length:]):
                        raise NotImplementedError(
                            "If the .lftxt file contains the same sentence"
                            " multiple times, the entries are assumed to be"
                            " sorted in the order of the labeled sequences.")
                    del example.label_ids[prefix_word_length:]
                    del example.words[prefix_word_length:]
                else:
                    # The sentence could be a duplicate (not repeated because it
                    # has multiple labels, but a completely separate entry).
                    # Those should be ignored.
                    characters = remove_whitespace_and_parse(
                        labeled_example.complete_text, tokenizer)
                    if characters in characters_per_sentence:
                        continue
                    characters_per_sentence.add(characters)
                    add_tfrecord_label(labeled_example.prefix, LABEL_OUTSIDE,
                                       tokenizer, example,
                                       use_additional_labels)
                add_tfrecord_label(labeled_example.selection,
                                   labeled_example.label, tokenizer, example,
                                   use_additional_labels)
                add_tfrecord_label(labeled_example.suffix, LABEL_OUTSIDE,
                                   tokenizer, example, use_additional_labels)
            else:
                characters = remove_whitespace_and_parse(
                    labeled_example.complete_text, tokenizer)
                if characters in characters_per_sentence:
                    continue
                characters_per_sentence.add(characters)
                add_tfrecord_label(labeled_example.complete_text,
                                   LABEL_OUTSIDE,
                                   tokenizer,
                                   example,
                                   use_additional_labels=False)
            prev_text = labeled_example.complete_text

            if example.words:
                examples.append(example)
                sentence_id += 1
                example = tagging_data_lib.InputExample(
                    sentence_id=sentence_id)
        return examples