Code Example #1
File: spacy_processors.py Project: williamwhe/forte
    def _process_parser(self, sentences, input_pack):
        """Parse the sentence. Default behaviour is to segment sentence, POSTag
        and Lemmatize.

        Args:
            sentences: Generator object which yields sentences in document
            input_pack: input pack which needs to be modified

        Returns:

        """
        for sentence in sentences:
            sentence_entry = Sentence(input_pack,
                                      sentence.start_char,
                                      sentence.end_char)
            input_pack.add_or_get_entry(sentence_entry)

            if "tokenize" in self.processors:
                # Iterating through spaCy token objects
                for word in sentence:
                    begin_pos_word = word.idx
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word,
                                  end_pos_word)

                    if "pos" in self.processors:
                        token.set_fields(pos=word.tag_)

                    if "lemma" in self.processors:
                        token.set_fields(lemma=word.lemma_)

                    input_pack.add_or_get_entry(token)
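
The processor above leans on spaCy exposing character offsets directly. Below is a minimal standalone sketch (independent of Forte) of the spaCy attributes it consumes; the model name "en_core_web_sm" is an assumption, and any installed pipeline with a parser works.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat sat. The dog barked.")

for sent in doc.sents:
    # Character offsets used to build the Sentence entry above.
    print(sent.start_char, sent.end_char, repr(sent.text))
    for word in sent:
        begin_pos_word = word.idx                       # offset into the document text
        end_pos_word = begin_pos_word + len(word.text)  # exclusive end offset
        print(begin_pos_word, end_pos_word, word.tag_, word.lemma_)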
Code Example #2
    def _create_tokens(self, input_pack, sentence, result):
        words, pos = result['words'], result['pos']
        tokens = []
        offset = sentence.span.begin
        word_end = 0
        for i, word in enumerate(words):
            word_begin = sentence.text.find(word, word_end)
            word_end = word_begin + len(word)
            token = Token(input_pack, offset + word_begin, offset + word_end)
            if "pos" in self.processors:
                token.set_fields(pos=pos[i])
            tokens.append(token)
            input_pack.add_entry(token)

        return tokens
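
The find-with-cursor idiom above is what aligns tokenizer output (which may repeat words) back to the source text: searching from word_end maps each repeated word to its next occurrence rather than always the first. A self-contained sketch with illustrative names:

def align_tokens(text, words):
    spans = []
    word_end = 0
    for word in words:
        word_begin = text.find(word, word_end)  # search only past the last match
        word_end = word_begin + len(word)
        spans.append((word_begin, word_end))
    return spans

print(align_tokens("the cat saw the dog", ["the", "cat", "saw", "the", "dog"]))
# [(0, 3), (4, 7), (8, 11), (12, 15), (16, 19)]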
Code Example #3
    def _process(self, input_pack: DataPack):
        doc = input_pack.text
        end_pos = 0

        # sentence parsing
        sentences = self.nlp(doc).sentences  # type: ignore

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            begin_pos = doc.find(sentence.words[0].text, end_pos)
            end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
                sentence.words[-1].text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)
            input_pack.add_or_get_entry(sentence_entry)

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                offset = sentence_entry.span.begin
                end_pos_word = 0

                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    begin_pos_word = sentence_entry.text.find(
                        word.text, end_pos_word)
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack,
                                  begin_pos_word + offset,
                                  end_pos_word + offset
                                  )

                    if "pos" in self.processors:
                        token.set_fields(pos=word.pos)
                        token.set_fields(upos=word.upos)
                        token.set_fields(xpos=word.xpos)

                    if "lemma" in self.processors:
                        token.set_fields(lemma=word.lemma)

                    tokens.append(token)
                    input_pack.add_or_get_entry(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    # word.governor is the 1-based index of the head word;
                    # 0 denotes the root, which this indexing maps to tokens[-1]
                    parent = tokens[word.governor - 1]
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.set_fields(
                        rel_type=word.dependency_relation)

                    input_pack.add_or_get_entry(relation_entry)
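
In the old stanfordnlp API, word.governor is the 1-based index of the head word within the sentence, with 0 marking the root; as noted in the comment above, the loop therefore maps the root to tokens[-1]. A hedged variant (not the snippet's behaviour) that makes the root case explicit:

def resolve_heads(governors):
    """governors: 1-based head indices per token, where 0 means root."""
    heads = []
    for i, governor in enumerate(governors):
        if governor == 0:
            heads.append((i, None))          # the root has no parent token
        else:
            heads.append((i, governor - 1))  # 0-based index of the head token
    return heads

# "dogs" <- "barked" (root), "loudly" <- "barked"
print(resolve_heads([2, 0, 2]))  # [(0, 1), (1, None), (2, 1)]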
Code Example #4
File: conll03_reader.py Project: williamwhe/forte
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()
        doc = codecs.open(file_path, "r", encoding="utf8")

        text = ""
        offset = 0
        has_rows = False

        sentence_begin = 0
        sentence_cnt = 0

        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag}
                token = Token(pack, word_begin, word_end)

                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

        if has_rows:
            # Add the last sentence if exists.
            sent = Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1
            pack.add_or_get_entry(sent)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        pack.set_text(text, replace_func=self.text_replace_operation)
        pack.meta.doc_id = file_path
        doc.close()

        yield pack
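
Given the column indices used (1 through 4), each data row in the file appears to carry a leading token index before the word, POS tag, chunk tag and NER tag. The sample row below is an assumption consistent with that indexing, not taken from the reader's test data:

line = "1 EU NNP B-NP B-ORG"
conll_components = line.split()

word = conll_components[1]      # "EU"
pos = conll_components[2]       # "NNP"
chunk_id = conll_components[3]  # "B-NP"
ner_tag = conll_components[4]   # "B-ORG"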
Code Example #5
File: ontonotes_reader.py Project: huzecong/forte
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()

        with open(file_path, encoding="utf8") as doc:
            text = ""
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    conll_components = line.split()
                    document_id = conll_components[0]
                    part_id = int(conll_components[1])
                    word = conll_components[3]
                    pos_tag = conll_components[4]
                    lemmatised_word = conll_components[6]
                    framenet_id = conll_components[7]
                    word_sense = conll_components[8]
                    speaker = conll_components[9]
                    entity_label = conll_components[10]
                    pred_labels = conll_components[11:-1]

                    word_begin = offset
                    word_end = offset + len(word)

                    # add tokens
                    kwargs_i: Dict[str, Any] = {"pos": pos_tag,
                                                "sense": word_sense}
                    token = Token(pack, word_begin, word_end)
                    token.set_fields(**kwargs_i)
                    pack.add_or_get_entry(token)

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack, entity_label, word_begin, word_end,
                        current_entity_mention
                    )

                    # add predicate mentions
                    if lemmatised_word != "-":
                        word_is_verbal_predicate = any(
                            "(V" in x for x in pred_labels)
                        kwargs_i = {
                            "framenet_id": framenet_id,
                            "pred_lemma": lemmatised_word,
                            "pred_type": "verb" if word_is_verbal_predicate
                            else "other"
                        }
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)
                        pred_mention.set_fields(**kwargs_i)
                        pred_mention = pack.add_or_get_entry(pred_mention)

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None for _ in pred_labels]
                        verbal_pred_args = [[] for _ in pred_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        pred_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        conll_components[-1],
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    text += word + " "
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            kwargs_i = {
                                "arg_type": arg[1],
                            }
                            link = PredicateLink(pack, predicate, arg[0])
                            link.set_fields(**kwargs_i)
                            pack.add_or_get_entry(link)

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    kwargs_i = {"speaker": speaker, "part_id": part_id}
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    sent.set_fields(**kwargs_i)
                    pack.add_or_get_entry(sent)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                # kwargs_i = {"coref_type": group_id}
                group = CoreferenceGroup(pack)
                # group.set_fields(**kwargs_i)
                group.add_members(mention_list)
                pack.add_or_get_entry(group)

            document = Document(pack, 0, len(text))
            pack.add_or_get_entry(document)

            kwargs_i = {"doc_id": document_id}
            pack.set_meta(**kwargs_i)
            pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
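
The offset bookkeeping shared by these readers is worth spelling out: each word is appended with a single trailing space, so every entry's span indexes directly into the final text, and the sentence end (offset - 1) trims that trailing space. A small sketch:

text = ""
offset = 0
spans = []
for word in ["John", "saw", "Mary"]:
    spans.append((offset, offset + len(word)))
    text += word + " "
    offset += len(word) + 1

print([text[begin:end] for begin, end in spans])  # ['John', 'saw', 'Mary']
print(repr(text[0:offset - 1]))                   # 'John saw Mary' (sentence span)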
Code Example #6
File: ontonotes_reader.py Project: williamwhe/forte
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)
                    if fields.pos_tag is not None:
                        token.set_fields(pos=fields.pos_tag)
                    if fields.word_sense is not None:
                        token.set_fields(sense=fields.word_sense)
                    pack.add_entry(token)

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        kwargs_i = {
                            "pred_lemma": fields.lemmatised_word,
                            "pred_type": ("verb" if word_is_verbal_predicate
                                          else "other"),
                        }
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)
                        pred_mention.set_fields(**kwargs_i)
                        if fields.framenet_id is not None:
                            pred_mention.set_fields(
                                framenet_id=fields.framenet_id)
                        pack.add_entry(pred_mention)

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            kwargs_i = {
                                "arg_type": arg[1],
                            }
                            link = PredicateLink(pack, predicate, arg[0])
                            link.set_fields(**kwargs_i)
                            pack.add_entry(link)

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.set_fields(speaker=speaker)
                    if part_id is not None:
                        sent.set_fields(part_id=int(part_id))
                    pack.add_entry(sent)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                # kwargs_i = {"coref_type": group_id}
                group = CoreferenceGroup(pack)
                # group.set_fields(**kwargs_i)
                group.add_members(mention_list)
                pack.add_entry(group)

            text = " ".join(words)
            document = Document(pack, 0, len(text))
            pack.add_entry(document)

            if document_id is not None:
                pack.set_meta(doc_id=document_id)
            pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
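
A notable change from the previous version of this reader: the document text is built once with " ".join(words) instead of repeated string concatenation, which avoids quadratic copying on long documents. The spans stay valid because join inserts exactly the single separator the offset arithmetic assumed; only the final trailing space disappears. A quick check:

words = ["John", "saw", "Mary"]

concatenated = "".join(word + " " for word in words)  # old approach
joined = " ".join(words)                              # new approach

assert concatenated == joined + " "
assert joined[0:4] == "John" and joined[9:13] == "Mary"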