Example #1
    def _process_parser(self, sentences, input_pack: DataPack):
        """Parse the sentence. Default behaviour is to segment sentence, POSTag
        and Lemmatize.

        Args:
            sentences: Generator object which yields sentences in document
            input_pack: input pack which needs to be modified

        Returns:

        """
        for sentence in sentences:
            Sentence(input_pack, sentence.start_char, sentence.end_char)

            if "tokenize" in self.processors:
                # Iterating through spaCy token objects
                for word in sentence:
                    begin_pos_word = word.idx
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word, end_pos_word)

                    if "pos" in self.processors:
                        token.pos = word.tag_

                    if "lemma" in self.processors:
                        token.lemma = word.lemma_
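
A standalone sketch of the spaCy offsets this processor relies on, assuming spaCy 3.x is installed: `word.idx` gives a token's character start, and `sent.start_char`/`sent.end_char` delimit a sentence. A blank pipeline plus the rule-based sentencizer is enough to reproduce the boundaries; without a trained model, `tag_` and `lemma_` simply stay empty.

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

doc = nlp("Forte wraps NLP toolkits. Offsets are character based.")
for sent in doc.sents:
    print(sent.start_char, sent.end_char)   # sentence span, as used above
    for word in sent:
        begin = word.idx                    # character offset where the token starts
        end = begin + len(word.text)        # exclusive end offset
        print(begin, end, word.text)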
Example #2
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                token = Token(pack, word_begin, word_end)
                token.pos = pos
                token.chunk = chunk_id
                token.ner = ner_tag

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # add sentence
                Sentence(pack, sentence_begin, offset - 1)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence if it exists.
            Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path
        doc.close()

        yield pack
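
The offset bookkeeping in this reader can be checked in isolation. The sketch below uses plain Python and made-up CoNLL-style rows (no Forte types): the text is rebuilt word by word with single spaces, token spans are `(offset, offset + len(word))`, and a sentence span ends at `offset - 1` so that the trailing space is excluded.

# Made-up sample in the "index word pos chunk ner" column layout read above.
sample = """\
1 John NNP B-NP B-PER
2 lives VBZ B-VP O

1 Fine JJ B-ADJP O
"""

text, offset = "", 0
tokens, sentences = [], []
sentence_begin, has_rows = 0, False

for line in sample.splitlines():
    line = line.strip()
    if line and not line.startswith("#"):
        word = line.split()[1]
        tokens.append((offset, offset + len(word)))
        text += word + " "
        offset += len(word) + 1
        has_rows = True
    elif has_rows:
        sentences.append((sentence_begin, offset - 1))  # drop the trailing space
        sentence_begin, has_rows = offset, False

if has_rows:
    sentences.append((sentence_begin, offset - 1))

print(text)       # "John lives Fine "
print(tokens)     # [(0, 4), (5, 10), (11, 15)]
print(sentences)  # [(0, 10), (11, 15)]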
Example #3
def _space_token(pack: DataPack):
    begin = 0
    for i, c in enumerate(pack.text):
        if c == ' ':
            pack.add_entry(Token(pack, begin, i))
            begin = i + 1

    if begin < len(pack.text):
        pack.add_entry(Token(pack, begin, len(pack.text)))
Example #4
    def _process(self, input_pack: DataPack):
        doc = input_pack.text

        if len(doc) == 0:
            logging.warning("Found empty text in the document.")

        # sentence parsing
        sentences = self.nlp(doc).sentences

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            Sentence(
                input_pack,
                sentence.tokens[0].start_char,
                sentence.tokens[-1].end_char,
            )

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    misc = word.misc.split("|")

                    t_start = -1
                    t_end = -1
                    for m in misc:
                        k, v = m.split("=")
                        if k == "start_char":
                            t_start = int(v)
                        elif k == "end_char":
                            t_end = int(v)

                    if t_start < 0 or t_end < 0:
                        raise ValueError(
                            "Cannot determine word start or end for "
                            "stanfordnlp."
                        )

                    token = Token(input_pack, t_start, t_end)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.head - 1]  # Head token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.deprel
Example #5
    def _process(self, input_pack: DataPack):
        pattern = r"\s+"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            input_pack.add_entry(Token(input_pack, start, m.start()))
            start = m.end()

        if start < len(input_pack.text):
            input_pack.add_entry(Token(input_pack, start, len(input_pack.text)))
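
The same splitting idea as a standalone function, for illustration only: every maximal non-whitespace run becomes a `(begin, end)` span, including a final token that is not followed by whitespace. A small guard for leading or repeated whitespace is added here; the processor above does not have it.

import re

def space_token_spans(text):
    spans, start = [], 0
    for m in re.finditer(r"\s+", text):
        if m.start() > start:   # guard: skip empty spans from leading/repeated whitespace
            spans.append((start, m.start()))
        start = m.end()
    if start < len(text):       # trailing token without following whitespace
        spans.append((start, len(text)))
    return spans

print(space_token_spans("a quick  test"))   # [(0, 1), (2, 7), (9, 13)]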
Example #6
    def _create_tokens(self, input_pack, sentence, result):
        words, pos = result['words'], result['pos']
        tokens = []
        offset = sentence.span.begin
        word_end = 0
        for i, word in enumerate(words):
            word_begin = sentence.text.find(word, word_end)
            word_end = word_begin + len(word)
            token = Token(input_pack, offset + word_begin, offset + word_end)
            if "pos" in self.configs.processors:
                token.pos = pos[i]
            tokens.append(token)

        return tokens
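
The `str.find` cursor trick used by `_create_tokens`, in isolation (the function name here is made up): searching from the end of the previous match recovers character spans even when words are separated by irregular whitespace, and adding the sentence's begin offset turns sentence-local spans into document-level ones.

def align_words(sentence_text, words, sentence_begin=0):
    spans, cursor = [], 0
    for word in words:
        begin = sentence_text.find(word, cursor)  # search after the last match
        end = begin + len(word)
        spans.append((sentence_begin + begin, sentence_begin + end))
        cursor = end
    return spans

print(align_words("It  works fine.", ["It", "works", "fine", "."]))
# [(0, 2), (4, 9), (10, 14), (14, 15)]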
Example #7
    def _parse_pack(self, data: dict) -> Iterator[DataPack]:
        r"""Extracts information from input `data` of one document output from
        Prodigy Annotator including the text, tokens and its annotations into a
        DataPack.

        Args:
            data: a dict that contains information for one document.

        Returns: DataPack containing information extracted from `data`.
        """
        pack = DataPack()
        text = data['text']
        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        tokens = data['tokens']
        spans = data['spans']
        for token in tokens:
            begin = token['start']
            end = token['end']
            Token(pack, begin, end)

        for span_items in spans:
            begin = span_items['start']
            end = span_items['end']
            annotation_entry = EntityMention(pack, begin, end)
            annotation_entry.ner_type = span_items['label']

        pack.meta.doc_id = data['meta']['id']

        yield pack
Example #8
    def _parse_pack(self, data: dict) -> Iterator[DataPack]:
        r"""Extracts information from input `data` of one document output from
        Prodigy Annotator including the text, tokens and its annotations into a
        DataPack.

        Args:
            data: a dict that contains information for one document.

        Returns: DataPack containing information extracted from `data`.
        """
        pack = DataPack()
        text = data["text"]
        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        tokens = data["tokens"]
        spans = data["spans"]
        for token in tokens:
            begin = token["start"]
            end = token["end"]
            Token(pack, begin, end)

        for span_items in spans:
            begin = span_items["start"]
            end = span_items["end"]
            annotation_entry = EntityMention(pack, begin, end)
            annotation_entry.ner_type = span_items["label"]

        pack.pack_name = data["meta"]["id"]

        yield pack
Example #9
    def test_multi_pack_copy_link_or_group(self):
        processor = ReplacementDataAugmentProcessor()
        m_pack = MultiPack()
        src_pack = m_pack.add_pack("src")
        tgt_pack = m_pack.add_pack("tgt")

        src_pack.set_text("input")
        tgt_pack.set_text("output")
        src_token = src_pack.add_entry(Token(src_pack, 0, len(src_pack.text)))
        tgt_token = tgt_pack.add_entry(Token(tgt_pack, 0, len(tgt_pack.text)))

        mpl = m_pack.add_entry(MultiPackLink(m_pack, src_token, tgt_token))
        # The MultiPackLink should not be copied, because its children are not copied.
        self.assertEqual(processor._copy_multi_pack_link_or_group(mpl, m_pack), False)
        new_src_pack = processor._auto_align_annotations(src_pack, [])
        self.assertEqual(len(list(new_src_pack.get(Token))), 1)
Example #10
    def test_replace(self):
        data_pack = DataPack()
        data_pack.set_text("auxiliary colleague apple")
        token_1 = Token(data_pack, 0, 9)
        token_2 = Token(data_pack, 10, 19)
        token_3 = Token(data_pack, 20, 25)
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)
        data_pack.add_entry(token_3)

        self.assertIn(
            self.tyre.replace(token_1)[1],
            ["auxilliary", "auxilary", "auxillary"],
        )
        self.assertIn(self.tyre.replace(token_2)[1], ["collegue", "colleaque"])
        self.assertIn(self.tyre.replace(token_3)[1], ["apple"])
Example #11
    def test_replace(self):
        data_pack = DataPack()
        data_pack.set_text("google")
        token_1 = Token(data_pack, 0, 6)
        data_pack.add_entry(token_1)
        is_replace, replaced_token = self.esa.replace(token_1)
        self.assertTrue(is_replace)
        self.assertIn(replaced_token,
                      ["yahoo", "aol", "microsoft", "web", "internet"])
Example #12
    def setUp(self):
        data_pack = DataPack()
        self.word = "eat"
        data_pack.set_text(self.word)
        self.token = Token(data_pack, 0, 3)
        data_pack.add_all_remaining_entries()

        self.word_list = ["apple", "banana", "orange"]
        self.sampler = UniformSampler(self.word_list)
Example #13
    def _process(self, input_pack: DataPack):
        doc = input_pack.text
        end_pos = 0

        # sentence parsing
        sentences = self.nlp(doc).sentences  # type: ignore

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            begin_pos = doc.find(sentence.words[0].text, end_pos)
            end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
                sentence.words[-1].text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                offset = sentence_entry.span.begin
                end_pos_word = 0

                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    begin_pos_word = sentence_entry.text.find(
                        word.text, end_pos_word)
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word + offset,
                                  end_pos_word + offset)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.governor - 1]  # Head token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.dependency_relation
Example #14
    def test_replace(self):
        random.seed(42)
        data_pack = DataPack()
        test_string = "The lazy fox jumped over the fence"
        test_result = "T/-/3 lazy f0>< jumpe|) oveI2 th3 fe^ce"
        data_pack.set_text(test_string)
        token_1 = Token(data_pack, 0, len(test_string))
        data_pack.add_entry(token_1)

        self.assertIn(self.test.replace(token_1)[1], test_result)
Example #15
    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(entry_type=Sentence,
                                       component=self.sentence_component):
            offset = sentence.span.begin
            end_pos = 0
            for word in word_tokenize(sentence.text):
                begin_pos = sentence.text.find(word, end_pos)
                end_pos = begin_pos + len(word)
                token = Token(input_pack, begin_pos + offset, end_pos + offset)
                input_pack.add_or_get_entry(token)
Example #16
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0

        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    oie_component: List[str] = line.split("\t")
                    sentence: str = oie_component[0]

                    # Add sentence.
                    Sentence(pack, offset, offset + len(sentence))
                    offset += len(sentence) + 1
                    text += sentence + " "

                    head_predicate: str = oie_component[1]
                    full_predicate: str = oie_component[2]

                    # Add head predicate.
                    token: Token = Token(pack,
                                         offset,
                                         offset + len(head_predicate))
                    offset += len(head_predicate) + 1
                    text += head_predicate + " "

                    # Add full predicate.
                    predicate_mention: PredicateMention = PredicateMention(
                        pack, offset, offset + len(full_predicate))
                    predicate_mention.headword = token
                    offset += len(full_predicate) + 1
                    text += full_predicate + " "

                    for arg in oie_component[3:]:
                        # Add predicate argument.
                        predicate_arg: PredicateArgument = \
                            PredicateArgument(pack,
                                              offset,
                                              offset + len(arg))
                        offset += len(arg) + 1
                        text += arg + " "

                        # Add predicate link.
                        PredicateLink(pack, predicate_mention, predicate_arg)

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path

        yield pack
Example #17
    def setUp(self):
        data_pack = DataPack()
        self.word = "eat"
        data_pack.set_text(self.word)
        self.token = Token(data_pack, 0, 3)
        data_pack.add_all_remaining_entries()

        self.word_list = ["apple", "banana", "orange"]
        self.word_dict = {
            "apple": 1,
            "banana": 2,
            "mango": 3,
        }
Example #18
    def test_segmenter(self):
        data_pack = DataPack()
        data_pack.set_text("eat phone")
        token_1 = Token(data_pack, 0, 3)
        token_2 = Token(data_pack, 4, 9)
        token_1.pos = "VB"
        token_2.pos = None
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)

        self.assertIn(
            self.dra.replace(token_1)[1],
            [
                "eat",
                "feed",
                "eat on",
                "consume",
                "eat up",
                "use up",
                "deplete",
                "exhaust",
                "run through",
                "wipe out",
                "corrode",
                "rust",
            ],
        )
        self.assertIn(
            self.dra.replace(token_2)[1],
            [
                "telephone",
                "phone",
                "telephone set",
                "speech sound",
                "sound",
                "earphone",
                "earpiece",
                "headphone",
                "call",
                "telephone",
                "call up",
                "ring",
            ],
        )
Example #19
    def _process(self, input_pack: DataPack):
        for begin, end in self.tokenizer.span_tokenize(input_pack.text):
            Token(input_pack, begin, end)
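
`span_tokenize` is expected to yield `(begin, end)` character offsets directly. A quick check with NLTK's `WhitespaceTokenizer`, which needs no model download; the processor above may of course be configured with a different NLTK tokenizer.

from nltk.tokenize import WhitespaceTokenizer

text = "Tokens become character spans."
for begin, end in WhitespaceTokenizer().span_tokenize(text):
    print(begin, end, text[begin:end])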
Example #20
    def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
        # pylint: disable=no-self-use
        token_comp_fields = [
            "id", "form", "lemma", "pos", "ud_xpos", "features", "head",
            "label", "enhanced_dependency_relations", "ud_misc"
        ]

        token_multi_fields = [
            "features", "ud_misc", "enhanced_dependency_relations"
        ]

        token_feature_fields = ["features", "ud_misc"]

        token_entry_fields = ["lemma", "pos", "ud_xpos", "features", "ud_misc"]

        data_pack: DataPack = DataPack()
        doc_sent_begin: int = 0
        doc_num_sent: int = 0
        doc_text: str = ''
        doc_offset: int = 0
        doc_id: str

        sent_text: str
        sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

        for line in doc_lines:
            line = line.strip()
            line_comps = line.split()

            if line.startswith("# newdoc"):
                doc_id = line.split("=")[1].strip()

            elif line.startswith("# sent"):
                sent_text = ''

            elif len(line_comps) > 0 and \
                    line_comps[0].strip().isdigit():
                # token
                token_comps: Dict[str, Any] = {}

                for index, key in enumerate(token_comp_fields):
                    token_comps[key] = str(line_comps[index])

                    if key in token_multi_fields:
                        values = str(token_comps[key]).split("|") \
                            if token_comps[key] != '_' else []
                        if key not in token_feature_fields:
                            token_comps[key] = values
                        else:
                            feature_lst = [
                                elem.split('=', 1) for elem in values
                            ]
                            feature_dict = {
                                elem[0]: elem[1]
                                for elem in feature_lst
                            }
                            token_comps[key] = feature_dict

                word: str = token_comps["form"]
                word_begin = doc_offset
                word_end = doc_offset + len(word)

                token: Token = Token(data_pack, word_begin, word_end)
                kwargs = {key: token_comps[key] for key in token_entry_fields}

                # add token
                token.set_fields(**kwargs)
                data_pack.add_or_get_entry(token)

                sent_tokens[str(token_comps["id"])] = (token_comps, token)

                sent_text += word + " "
                doc_offset = word_end + 1

            elif line == "":
                # sentence ends
                sent_text = sent_text.strip()
                doc_text += ' ' + sent_text

                # add dependencies for a sentence when all the tokens have been
                # added
                for token_id in sent_tokens:
                    token_comps, token = sent_tokens[token_id]

                    def add_dependency(dep_parent, dep_child, dep_label,
                                       dep_type, data_pack_):
                        """Adds dependency to a data_pack
                        Args:
                            dep_parent: dependency parent token
                            dep_child: dependency child token
                            dep_label: dependency label
                            dep_type: "primary" or "enhanced" dependency
                            data_pack_: data_pack to which the
                            dependency is to be added
                        """
                        dependency = Dependency(data_pack, dep_parent,
                                                dep_child)
                        dependency.dep_label = dep_label
                        dependency.type = dep_type
                        data_pack_.add_or_get_entry(dependency)

                    # add primary dependency
                    label = token_comps["label"]
                    if label == "root":
                        token.is_root = True
                    else:
                        token.is_root = False
                        head = sent_tokens[token_comps["head"]][1]
                        add_dependency(head, token, label, "primary",
                                       data_pack)

                    # add enhanced dependencies
                    for dep in token_comps["enhanced_dependency_relations"]:
                        head_id, label = dep.split(":", 1)
                        if label != "root":
                            head = sent_tokens[head_id][1]
                            add_dependency(head, token, label, "enhanced",
                                           data_pack)

                # add sentence
                sent = Sentence(data_pack, doc_sent_begin, doc_offset - 1)
                data_pack.add_or_get_entry(sent)

                doc_sent_begin = doc_offset
                doc_num_sent += 1

        # add doc to data_pack
        document = Document(data_pack, 0, len(doc_text))
        data_pack.add_or_get_entry(document)
        data_pack.meta.doc_id = doc_id
        data_pack.set_text(doc_text.strip())

        yield data_pack
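
The feature-column handling above, shown in isolation: a CoNLL-U key=value list such as "Case=Nom|Number=Sing" is split on "|" and "=" into a dict, and the placeholder "_" becomes an empty mapping.

def parse_features(field):
    values = field.split("|") if field != "_" else []
    return dict(elem.split("=", 1) for elem in values)

print(parse_features("Case=Nom|Number=Sing"))  # {'Case': 'Nom', 'Number': 'Sing'}
print(parse_features("_"))                     # {}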
Example #21
    def _parse_pack(self, collection: str) -> Iterator[DataPack]:
        with open(collection, "r", encoding="utf8") as doc:
            pack_id: int = 0

            pack: DataPack = DataPack()
            text: str = ""
            offset: int = 0
            has_rows: bool = False

            sentence_begin: int = 0
            sentence_cnt: int = 0

            # The NER tag is either "O" or in the format "X-Y",
            # where X is one of B or I,
            # and Y is a tag such as ORG or PER.
            prev_y = None
            prev_x = None
            start_index = -1

            for line in doc:
                line = line.strip()

                if line.find("DOCSTART") != -1:
                    # Skip the first DOCSTART.
                    if offset == 0:
                        continue
                    # Add remaining sentence.
                    if has_rows:
                        # Add the last sentence if it exists.
                        Sentence(pack, sentence_begin, offset - 1)
                        sentence_cnt += 1

                    pack.set_text(text,
                                  replace_func=self.text_replace_operation)
                    Document(pack, 0, len(text))
                    pack.pack_name = collection + "_%d" % pack_id
                    pack_id += 1
                    yield pack

                    # Create a new datapack.
                    pack = DataPack()
                    text = ""
                    offset = 0
                    has_rows = False

                    sentence_begin = 0
                    sentence_cnt = 0

                    prev_y = None
                    prev_x = None
                    start_index = -1

                elif line != "" and not line.startswith("#"):
                    conll_components = line.split()

                    word = conll_components[0]
                    pos = conll_components[1]
                    chunk_id = conll_components[2]

                    ner_tag = conll_components[3]

                    # A new ner tag occurs.
                    if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                        # Add previous ner tag to sentence if it exists.
                        if prev_y is not None:
                            entity_mention = EntityMention(
                                pack, start_index, offset - 1)
                            entity_mention.ner_type = prev_y

                        # Start processing the current NER tag.
                        if ner_tag == "O":
                            # Current ner tag is O, reset information.
                            prev_x = None
                            prev_y = None
                            start_index = -1
                        else:
                            # Current ner tag is B.
                            prev_x = "B"
                            prev_y = ner_tag.split("-")[1]
                            start_index = offset
                    # This NER tag is connected to the previous one.
                    else:
                        x, y = ner_tag.split("-")
                        assert x == "I", "Unseen tag %s in the file." % x
                        assert y == prev_y, "Error in %s." % ner_tag
                        assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                        prev_x = "I"

                    word_begin = offset
                    word_end = offset + len(word)

                    # Add tokens.
                    token = Token(pack, word_begin, word_end)
                    token.pos = pos
                    token.chunk = chunk_id

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True
                else:
                    if not has_rows:
                        # Skip consecutive empty lines.
                        continue
                    # Add sentence
                    Sentence(pack, sentence_begin, offset - 1)

                    # Handle the last NER tag if it exists.
                    if prev_x is not None:
                        entity_mention = EntityMention(pack, start_index,
                                                       offset - 1)
                        entity_mention.ner_type = prev_y

                    # Reset information.
                    sentence_cnt += 1
                    has_rows = False
                    prev_y = None
                    prev_x = None
                    sentence_begin = offset

            if has_rows:
                # Add the last sentence if it exists.
                Sentence(pack, sentence_begin, offset - 1)
                sentence_cnt += 1

            pack.set_text(text, replace_func=self.text_replace_operation)
            Document(pack, 0, len(text))
            pack.pack_name = os.path.basename(collection)

            yield pack
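
The reader above decodes BIO tags on the fly while also tracking character offsets. The same decoding over token indices, as a plain-Python sketch with made-up tags, may make the state machine easier to follow: the running entity is closed on "O" or "B-", and "I-" tags extend it.

def bio_to_entities(tags):
    """Fold token-level BIO tags into (start, end, type) spans over indices."""
    entities = []
    start, ent_type = None, None
    for i, tag in enumerate(tags):
        if tag == "O" or tag.startswith("B-"):
            if ent_type is not None:        # close the running entity
                entities.append((start, i, ent_type))
            if tag == "O":
                start, ent_type = None, None
            else:                           # a new "B-" entity starts here
                start, ent_type = i, tag.split("-", 1)[1]
        # "I-" tags extend the running entity; nothing to do
    if ent_type is not None:                # close the last entity
        entities.append((start, len(tags), ent_type))
    return entities

print(bio_to_entities(["B-PER", "I-PER", "O", "B-ORG"]))
# [(0, 2, 'PER'), (3, 4, 'ORG')]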
Example #22
    def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
        token_comp_fields = ["id", "form", "lemma", "pos",
                             "ud_xpos", "ud_features", "head", "label",
                             "enhanced_dependency_relations", "ud_misc"]

        token_multi_fields = ["ud_features", "ud_misc",
                              "enhanced_dependency_relations"]

        token_feature_fields = ["ud_features", "ud_misc"]

        data_pack: DataPack = DataPack()
        doc_sent_begin: int = 0
        doc_num_sent: int = 0
        doc_text: str = ''
        doc_offset: int = 0
        doc_id: str

        sent_text: str
        sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

        for line in doc_lines:
            line = line.strip()
            line_comps = line.split()

            if line.startswith("# newdoc"):
                doc_id = line.split("=")[1].strip()

            elif line.startswith("# sent"):
                sent_text = ''

            elif len(line_comps) > 0 and \
                    line_comps[0].strip().isdigit():
                # token
                token_comps: Dict[str, Any] = {}

                for index, key in enumerate(token_comp_fields):
                    token_comps[key] = str(line_comps[index])

                    if key in token_multi_fields:
                        values = str(token_comps[key]).split("|") \
                            if token_comps[key] != '_' else []
                        if key not in token_feature_fields:
                            token_comps[key] = values
                        else:
                            feature_lst = [elem.split('=', 1)
                                           for elem in values]
                            feature_dict = {elem[0]: elem[1]
                                            for elem in feature_lst}
                            token_comps[key] = feature_dict

                word: str = token_comps["form"]
                word_begin = doc_offset
                word_end = doc_offset + len(word)

                # add token
                token: Token = Token(data_pack, word_begin, word_end)

                token.lemma = token_comps['lemma']
                token.pos = token_comps['pos']
                token.ud_xpos = token_comps['ud_xpos']
                token.ud_features = token_comps['ud_features']
                token.ud_misc = token_comps['ud_misc']

                sent_tokens[str(token_comps["id"])] = (token_comps, token)

                sent_text += word + " "
                doc_offset = word_end + 1

            elif line == "":
                # sentence ends
                sent_text = sent_text.strip()
                doc_text += ' ' + sent_text

                # add dependencies for a sentence when all the tokens have been
                # added
                for token_id in sent_tokens:
                    token_comps, token = sent_tokens[token_id]

                    # add primary dependency
                    label = token_comps["label"]
                    if label == "root":
                        token.is_root = True
                    else:
                        token.is_root = False
                        head = sent_tokens[token_comps["head"]][1]
                        dependency = Dependency(data_pack, head, token)
                        dependency.dep_label = label

                    # add enhanced dependencies
                    for dep in token_comps["enhanced_dependency_relations"]:
                        head_id, label = dep.split(":", 1)
                        if label != "root":
                            head = sent_tokens[head_id][1]
                            enhanced_dependency = \
                                EnhancedDependency(data_pack, head, token)
                            enhanced_dependency.dep_label = label

                # add sentence
                Sentence(data_pack, doc_sent_begin, doc_offset - 1)

                doc_sent_begin = doc_offset
                doc_num_sent += 1

        doc_text = doc_text.strip()
        data_pack.set_text(doc_text)

        # add doc to data_pack
        Document(data_pack, 0, len(doc_text))
        data_pack.pack_name = doc_id

        yield data_pack
Example #23
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()

        with open(file_path, encoding="utf8") as doc:
            text = ""
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    conll_components = line.split()
                    document_id = conll_components[0]
                    part_id = int(conll_components[1])
                    word = conll_components[3]
                    pos_tag = conll_components[4]
                    lemmatised_word = conll_components[6]
                    framenet_id = conll_components[7]
                    word_sense = conll_components[8]
                    speaker = conll_components[9]
                    entity_label = conll_components[10]
                    pred_labels = conll_components[11:-1]

                    word_begin = offset
                    word_end = offset + len(word)

                    # add tokens
                    kwargs_i: Dict[str, Any] = {"pos": pos_tag,
                                                "sense": word_sense}
                    token = Token(pack, word_begin, word_end)
                    token.set_fields(**kwargs_i)
                    pack.add_or_get_entry(token)

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack, entity_label, word_begin, word_end,
                        current_entity_mention
                    )

                    # add predicate mentions
                    if lemmatised_word != "-":
                        word_is_verbal_predicate = any(
                            "(V" in x for x in pred_labels
                        )
                        kwargs_i = {
                            "framenet_id": framenet_id,
                            "pred_lemma": lemmatised_word,
                            "pred_type": "verb" if word_is_verbal_predicate
                            else "other"
                        }
                        pred_mention = PredicateMention(
                                pack, word_begin, word_end)
                        pred_mention.set_fields(**kwargs_i)
                        pred_mention = pack.add_or_get_entry(
                            pred_mention
                        )

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None for _ in pred_labels]
                        verbal_pred_args = [[] for _ in pred_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        conll_components[11:-1],
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        conll_components[-1],
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            kwargs_i = {
                                "arg_type": arg[1],
                            }
                            link = PredicateLink(pack, predicate, arg[0])
                            link.set_fields(**kwargs_i)
                            pack.add_or_get_entry(link)

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    kwargs_i = {"speaker": speaker, "part_id": part_id}
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    sent.set_fields(**kwargs_i)
                    pack.add_or_get_entry(sent)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                # kwargs_i = {"coref_type": group_id}
                group = CoreferenceGroup(pack)
                # group.set_fields(**kwargs_i)
                group.add_members(mention_list)
                pack.add_or_get_entry(group)

            document = Document(pack, 0, len(text))
            pack.add_or_get_entry(document)

            kwargs_i = {"doc_id": document_id}
            pack.set_meta(**kwargs_i)
            pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
Example #24
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)

                    if fields.pos_tag is not None:
                        token.pos = fields.pos_tag
                    if fields.word_sense is not None:
                        token.sense = fields.word_sense

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)

                        pred_mention.predicate_lemma = fields.lemmatised_word
                        pred_mention.is_verb = word_is_verbal_predicate

                        if fields.framenet_id is not None:
                            pred_mention.framenet_id = fields.framenet_id

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            link = PredicateLink(pack, predicate, arg[0])
                            link.arg_type = arg[1]

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.speaker = speaker
                    if part_id is not None:
                        sent.part_id = int(part_id)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                group = CoreferenceGroup(pack)
                group.add_members(mention_list)

            text = " ".join(words)
            pack.set_text(text, replace_func=self.text_replace_operation)

            _ = Document(pack, 0, len(text))
            if document_id is not None:
                pack.pack_name = document_id
        yield pack
Example #25
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag}
                token = Token(pack, word_begin, word_end)

                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence if it exists.
            sent = Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1
            pack.add_or_get_entry(sent)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        pack.set_text(text, replace_func=self.text_replace_operation)
        pack.meta.doc_id = file_path
        doc.close()

        yield pack