Example #1
    def _process_parser(self, sentences, input_pack: DataPack):
        """Parse the sentence. Default behaviour is to segment sentence, POSTag
        and Lemmatize.

        Args:
            sentences: Generator object which yields sentences in document
            input_pack: input pack which needs to be modified

        Returns:

        """
        for sentence in sentences:
            # Create a Sentence entry covering the spaCy sentence span.
            Sentence(input_pack, sentence.start_char, sentence.end_char)

            if "tokenize" in self.processors:
                # Iterating through spaCy token objects
                for word in sentence:
                    begin_pos_word = word.idx
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word, end_pos_word)

                    if "pos" in self.processors:
                        token.pos = word.tag_

                    if "lemma" in self.processors:
                        token.lemma = word.lemma_
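
For reference, a minimal standalone sketch of the spaCy side that produces the
objects consumed above (assuming the "en_core_web_sm" model is installed):
doc.sents yields spans whose start_char/end_char, and whose tokens' idx, tag_,
and lemma_ attributes, are exactly what _process_parser reads.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps. It lands softly.")

for sentence in doc.sents:
    # Character span of the sentence, as used by the Sentence entry above.
    print(sentence.start_char, sentence.end_char)
    for word in sentence:
        begin_pos_word = word.idx                       # token start offset
        end_pos_word = begin_pos_word + len(word.text)  # exclusive end offset
        print(word.text, begin_pos_word, end_pos_word, word.tag_, word.lemma_)
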
Example #2
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                token = Token(pack, word_begin, word_end)
                token.pos = pos
                token.chunk = chunk_id
                token.ner = ner_tag

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # Add the sentence; offset - 1 excludes the trailing space.
                Sentence(pack, sentence_begin, offset - 1)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence, if any.
            Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1

        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path
        doc.close()

        yield pack
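
The reader rebuilds the document text itself, so the span arithmetic is fixed
by construction: each word is appended with a single trailing space, a token
spans [offset, offset + len(word)), and a sentence ends at offset - 1. A
standalone sketch of that arithmetic:

words = ["John", "loves", "Mary", "."]
text = ""
offset = 0
spans = []
for word in words:
    spans.append((offset, offset + len(word)))  # token span before the space
    text += word + " "
    offset += len(word) + 1                     # skip the word and its space

print(spans)                          # [(0, 4), (5, 10), (11, 15), (16, 17)]
print(text[spans[1][0]:spans[1][1]])  # "loves"
print(text[:offset - 1])              # sentence text without trailing space
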
Example #3
    def _process(self, input_pack: DataPack):
        doc = input_pack.text

        if len(doc) == 0:
            logging.warning("Found empty text in doc.")

        # sentence parsing
        sentences = self.nlp(doc).sentences

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            Sentence(
                input_pack,
                sentence.tokens[0].start_char,
                sentence.tokens[-1].end_char,
            )

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    misc = word.misc.split("|")

                    t_start = -1
                    t_end = -1
                    for m in misc:
                        k, v = m.split("=")
                        if k == "start_char":
                            t_start = int(v)
                        elif k == "end_char":
                            t_end = int(v)

                    if t_start < 0 or t_end < 0:
                        raise ValueError(
                            "Cannot determine word start or end for "
                            "stanfordnlp."
                        )

                    token = Token(input_pack, t_start, t_end)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.head - 1]  # Head token
                    # Note: a head index of 0 denotes the root, which this
                    # indexing maps to the last token of the sentence.
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.deprel
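
For reference, stanza attaches character offsets to each word as key=value
pairs in its misc field, e.g. "start_char=0|end_char=4". A standalone sketch
of the parsing loop above:

def parse_misc(misc: str):
    t_start, t_end = -1, -1
    for m in misc.split("|"):
        k, v = m.split("=")
        if k == "start_char":
            t_start = int(v)
        elif k == "end_char":
            t_end = int(v)
    if t_start < 0 or t_end < 0:
        raise ValueError("Cannot determine word start or end.")
    return t_start, t_end

print(parse_misc("start_char=0|end_char=4"))  # (0, 4)
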
Example #4
    def test_segmenter(self):
        data_pack = DataPack()
        data_pack.set_text("eat phone")
        token_1 = Token(data_pack, 0, 3)
        token_2 = Token(data_pack, 4, 9)
        token_1.pos = "VB"
        token_2.pos = None
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)

        self.assertIn(
            self.dra.replace(token_1)[1],
            [
                "eat",
                "feed",
                "eat on",
                "consume",
                "eat up",
                "use up",
                "deplete",
                "exhaust",
                "run through",
                "wipe out",
                "corrode",
                "rust",
            ],
        )
        self.assertIn(
            self.dra.replace(token_2)[1],
            [
                "telephone",
                "phone",
                "telephone set",
                "speech sound",
                "sound",
                "earphone",
                "earpiece",
                "headphone",
                "call",
                "telephone",
                "call up",
                "ring",
            ],
        )
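
The accepted candidates look like WordNet lemmas for "eat" and "phone". A
sketch of how such a list could be generated with NLTK; this is an assumption,
since the op under test (self.dra) is defined elsewhere in the fixture:

from nltk.corpus import wordnet  # requires nltk.download("wordnet") once

candidates = {
    lemma.name().replace("_", " ")
    for synset in wordnet.synsets("eat")
    for lemma in synset.lemmas()
}
print(sorted(candidates))  # includes "consume", "eat on", "feed", ...
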
Example #5
    def _create_tokens(self, input_pack, sentence, result):
        words, pos = result["words"], result["pos"]
        tokens = []
        offset = sentence.span.begin
        word_end = 0
        for i, word in enumerate(words):
            # Search from the previous word's end so repeated words in the
            # sentence do not match the same position twice.
            word_begin = sentence.text.find(word, word_end)
            word_end = word_begin + len(word)
            token = Token(input_pack, offset + word_begin, offset + word_end)
            if "pos" in self.configs.processors:
                token.pos = pos[i]
            tokens.append(token)

        return tokens
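
Because str.find is given the previous word's end as its search start,
repeated words in a sentence cannot match the same position twice. A
standalone sketch:

sentence_text = "the cat saw the dog"
words = ["the", "cat", "saw", "the", "dog"]

word_end = 0
spans = []
for word in words:
    word_begin = sentence_text.find(word, word_end)  # search past last match
    word_end = word_begin + len(word)
    spans.append((word_begin, word_end))

print(spans)  # [(0, 3), (4, 7), (8, 11), (12, 15), (16, 19)]
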
Example #6
    def _process(self, input_pack: DataPack):
        doc = input_pack.text
        end_pos = 0

        # sentence parsing
        sentences = self.nlp(doc).sentences  # type: ignore

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            begin_pos = doc.find(sentence.words[0].text, end_pos)
            end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
                sentence.words[-1].text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                offset = sentence_entry.span.begin
                end_pos_word = 0

                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    begin_pos_word = sentence_entry.text.find(
                        word.text, end_pos_word)
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word + offset,
                                  end_pos_word + offset)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.governor - 1]  # head (governor) token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.dependency_relation
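
Note that this variant targets the older stanfordnlp package, where the head
index is exposed as word.governor and the relation label as
word.dependency_relation; Example #3 above uses the renamed stanza attributes
word.head and word.deprel. As in Example #3, a governor of 0 denotes the root,
which tokens[word.governor - 1] maps to the last token of the sentence.
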
Example #7
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = self.new_pack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)

                    if fields.pos_tag is not None:
                        token.pos = fields.pos_tag
                    if fields.word_sense is not None:
                        token.sense = fields.word_sense

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)

                        pred_mention.predicate_lemma = fields.lemmatised_word
                        pred_mention.is_verb = word_is_verbal_predicate

                        if fields.framenet_id is not None:
                            pred_mention.framenet_id = fields.framenet_id

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            link = PredicateLink(pack, predicate, arg[0])
                            link.arg_type = arg[1]

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # Add the sentence; offset - 1 excludes the trailing space.
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.speaker = speaker
                    if part_id is not None:
                        sent.part_id = int(part_id)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                group = CoreferenceGroup(pack)
                group.add_members(mention_list)

            text = " ".join(words)
            pack.set_text(text, replace_func=self.text_replace_operation)

            _ = Document(pack, 0, len(text))
            if document_id is not None:
                pack.pack_name = document_id
        yield pack
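
The coreference column follows the CoNLL-2012 bracket convention: "(12" opens
mention 12, "12)" closes it, and "(12)" marks a single-token mention. A
hypothetical sketch of how the coref_stacks and groups structures above could
cooperate (process_coref is illustrative, not the reader's actual
_process_coref_annotations):

from collections import defaultdict
from typing import DefaultDict, List, Tuple

def process_coref(coref: str, begin: int, end: int,
                  stacks: DefaultDict[int, List[int]],
                  groups: DefaultDict[int, List[Tuple[int, int]]]) -> None:
    if coref in ("-", ""):
        return
    for seg in coref.split("|"):
        cluster = int(seg.strip("()"))
        if seg.startswith("(") and seg.endswith(")"):
            groups[cluster].append((begin, end))  # single-token mention
        elif seg.startswith("("):
            stacks[cluster].append(begin)         # mention opens here
        elif seg.endswith(")"):
            start = stacks[cluster].pop()         # mention closes here
            groups[cluster].append((start, end))

stacks: DefaultDict[int, List[int]] = defaultdict(list)
groups: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
process_coref("(3", 0, 4, stacks, groups)
process_coref("3)", 5, 10, stacks, groups)
print(dict(groups))  # {3: [(0, 10)]}
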
Example #8
    def _parse_pack(self, collection: str) -> Iterator[DataPack]:
        with open(collection, "r", encoding="utf8") as doc:
            pack_id: int = 0

            pack: DataPack = DataPack()
            text: str = ""
            offset: int = 0
            has_rows: bool = False

            sentence_begin: int = 0
            sentence_cnt: int = 0

            # A NER tag is either "O" or of the form "X-Y", where X is one
            # of B or I, and Y is an entity type such as ORG or PER.
            prev_y = None
            prev_x = None
            start_index = -1

            for line in doc:
                line = line.strip()

                if line.find("DOCSTART") != -1:
                    # Skip the first DOCSTART.
                    if offset == 0:
                        continue
                    # Add remaining sentence.
                    if has_rows:
                        # Add the last sentence, if any.
                        Sentence(pack, sentence_begin, offset - 1)
                        sentence_cnt += 1

                    pack.set_text(text,
                                  replace_func=self.text_replace_operation)
                    Document(pack, 0, len(text))
                    pack.pack_name = collection + "_%d" % pack_id
                    pack_id += 1
                    yield pack

                    # Create a new datapack.
                    pack = DataPack()
                    text = ""
                    offset = 0
                    has_rows = False

                    sentence_begin = 0
                    sentence_cnt = 0

                    prev_y = None
                    prev_x = None
                    start_index = -1

                elif line != "" and not line.startswith("#"):
                    conll_components = line.split()

                    word = conll_components[0]
                    pos = conll_components[1]
                    chunk_id = conll_components[2]

                    ner_tag = conll_components[3]

                    # A new ner tag occurs.
                    if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                        # Add previous ner tag to sentence if it exists.
                        if prev_y is not None:
                            entity_mention = EntityMention(
                                pack, start_index, offset - 1)
                            entity_mention.ner_type = prev_y

                        # Start processing the current ner tag.
                        if ner_tag == "O":
                            # Current ner tag is O, reset information.
                            prev_x = None
                            prev_y = None
                            start_index = -1
                        else:
                            # Current ner tag is B.
                            prev_x = "B"
                            prev_y = ner_tag.split("-")[1]
                            start_index = offset
                    # This ner tag is connected to previous one.
                    else:
                        x, y = ner_tag.split("-")
                        assert x == "I", "Unseen tag %s in the file." % x
                        assert y == prev_y, "Error in %s." % ner_tag
                        assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                        prev_x = "I"

                    word_begin = offset
                    word_end = offset + len(word)

                    # Add tokens.
                    token = Token(pack, word_begin, word_end)
                    token.pos = pos
                    token.chunk = chunk_id

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True
                else:
                    if not has_rows:
                        # Skip consecutive empty lines.
                        continue
                    # Add sentence
                    Sentence(pack, sentence_begin, offset - 1)

                    # Close the entity mention that is still open, if any.
                    if prev_x is not None:
                        entity_mention = EntityMention(pack, start_index,
                                                       offset - 1)
                        entity_mention.ner_type = prev_y

                    # Reset information.
                    sentence_cnt += 1
                    has_rows = False
                    prev_y = None
                    prev_x = None
                    sentence_begin = offset

            if has_rows:
                # Add the last sentence, if any.
                Sentence(pack, sentence_begin, offset - 1)
                sentence_cnt += 1

            pack.set_text(text, replace_func=self.text_replace_operation)
            Document(pack, 0, len(text))
            pack.pack_name = os.path.basename(collection)

            yield pack
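
The prev_x/prev_y/start_index bookkeeping above is a small BIO state machine:
"O" or a "B-Y" tag closes any open mention, "B-Y" opens a new one, and "I-Y"
extends it. A standalone sketch over tag indices rather than character
offsets:

def bio_spans(tags):
    spans = []
    prev_y, start_index = None, -1
    for i, tag in enumerate(tags):
        if tag == "O" or tag.startswith("B-"):
            if prev_y is not None:
                spans.append((start_index, i, prev_y))  # close open mention
            if tag == "O":
                prev_y, start_index = None, -1
            else:
                prev_y, start_index = tag.split("-")[1], i  # open new mention
        else:
            # An I-Y tag must continue an open mention of the same type.
            assert tag.split("-")[1] == prev_y, "Unseen tag %s." % tag
    if prev_y is not None:
        spans.append((start_index, len(tags), prev_y))
    return spans

print(bio_spans(["B-PER", "I-PER", "O", "B-ORG"]))
# [(0, 2, 'PER'), (3, 4, 'ORG')]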