Code example #1
    def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
        """
        Takes a raw string and converts into a MultiPack

        Args:
            data_source: str that contains text of a document

        Returns: MultiPack containing a datapack for the current query

        """

        multi_pack = MultiPack()

        # use context to build the query
        if self.resource.get("user_utterance"):
            user_pack = self.resource.get("user_utterance")[-1]
            multi_pack.update_pack({"user_utterance": user_pack})

        if self.resource.get("bot_utterance"):
            bot_pack = self.resource.get("bot_utterance")[-1]
            multi_pack.update_pack({"bot_utterance": bot_pack})

        pack = DataPack()
        utterance = Utterance(pack, 0, len(data_source))
        pack.add_entry(utterance)

        pack.set_text(data_source, replace_func=self.text_replace_operation)
        multi_pack.update_pack({self.config.pack_name: pack})

        yield multi_pack
Code example #2
File: ontonotes_reader.py Project: williamwhe/forte
    def _process_entity_annotations(
        self,
        pack: DataPack,
        label: Optional[str],
        word_begin: int,
        word_end: int,
        current_entity_mention: Optional[Tuple[int, str]],
    ) -> Optional[Tuple[int, str]]:
        if label is None:
            return None

        ner_type = label.strip("()*")

        if "(" in label:
            # Entering a span for a particular NER type.
            current_entity_mention = (word_begin, ner_type)
        if ")" in label:
            if current_entity_mention is None:
                raise ValueError(
                    "current_entity_mention is None when meet right blanket.")
            # Exiting a span, add and then reset the current span.
            kwargs_i = {"ner_type": current_entity_mention[1]}
            entity = EntityMention(pack, current_entity_mention[0], word_end)
            entity.set_fields(**kwargs_i)
            pack.add_entry(entity)

            current_entity_mention = None

        return current_entity_mention
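
To see the bracket-matching state machine above in isolation, here is a minimal driver sketch. The `reader` instance, the sample text, the label column, and the word offsets are all invented for illustration; only `_process_entity_annotations` itself comes from the example, so treat this as a sketch rather than real OntoNotes input handling.

pack = DataPack()
pack.set_text("Barack Obama spoke")

# CoNLL-style NER column: "(PERSON*" opens a span, "*)" closes it, "*" is outside.
column = [
    ("(PERSON*", (0, 6)),   # span opens at "Barack"
    ("*)", (7, 12)),        # span closes at "Obama": an EntityMention (0, 12) is added
    ("*", (13, 18)),        # "spoke" is outside any span
]

current = None  # the `current_entity_mention` threaded through the calls
for label, (begin, end) in column:
    current = reader._process_entity_annotations(pack, label, begin, end, current)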
Code example #3
 def add_wiki_info(self, pack: DataPack, statements: List[state_type]):
     for _, _, o in statements:
         resource_name = get_resource_name(o)
         if resource_name is not None:
             wc = WikiCategory(pack)
             wc.values.append(resource_name)
             pack.add_entry(wc)
Code example #4
    def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:

        m_pack: MultiPack = MultiPack()

        input_pack_name = self.config.input_pack_name
        output_pack_name = self.config.output_pack_name

        text = ""
        offset = 0
        with open(file_path, "r", encoding="utf8") as doc:

            input_pack = DataPack(doc_id=file_path)

            for line in doc:
                line = line.strip()

                if len(line) == 0:
                    continue

                # add sentence
                sent = Sentence(input_pack, offset, offset + len(line))
                input_pack.add_entry(sent)
                text += line + '\n'
                offset = offset + len(line) + 1

            input_pack.set_text(text, replace_func=self.text_replace_operation)

            output_pack = DataPack()

            m_pack.update_pack({
                input_pack_name: input_pack,
                output_pack_name: output_pack
            })

            yield m_pack
Code example #5
 def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
     entries = list(data_pack.get_entries_by_type(NewType))
     if len(entries) == 0:
         entry = NewType(pack=data_pack, value="[BATCH]")
         data_pack.add_entry(entry)
     else:
         entry = entries[0]  # type: ignore
         entry.value += "[BATCH]"
Code example #6
 def _process(self, input_pack: DataPack):
     entries = list(input_pack.get_entries_by_type(NewType))
     if len(entries) == 0:
         entry = NewType(pack=input_pack, value="[PACK]")
         input_pack.add_entry(entry)
     else:
         entry = entries[0]  # type: ignore
         entry.value += "[PACK]"
Code example #7
 def test_replace(self):
     data_pack = DataPack()
     data_pack.set_text("google")
     token_1 = Token(data_pack, 0, 6)
     data_pack.add_entry(token_1)
     is_replace, replaced_token = self.esa.replace(token_1)
     self.assertTrue(is_replace)
     self.assertIn(replaced_token,
                   ["yahoo", "aol", "microsoft", "web", "internet"])
Code example #8
File: reader.py Project: williamwhe/forte
 def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
     fields = data_source.split("\t")
     data_pack = DataPack(doc_id=fields[0])
     multi_pack = MultiPack()
     document = Document(pack=data_pack, begin=0, end=len(fields[1]))
     data_pack.add_entry(document)
     data_pack.set_text(fields[1])
     multi_pack.update_pack({self.config.pack_name: data_pack})
     yield multi_pack
Code example #9
def _space_token(pack: DataPack):
    begin = 0
    for i, c in enumerate(pack.text):
        if c == ' ':
            pack.add_entry(Token(pack, begin, i))
            begin = i + 1

    if begin < len(pack.text):
        pack.add_entry(Token(pack, begin, len(pack.text)))
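
A minimal usage sketch for the whitespace tokenizer above, assuming it runs in the same module, where `DataPack` and `Token` are already imported (as in the other examples here); the sample text is invented.

pack = DataPack()
pack.set_text("a quick test")
_space_token(pack)  # one Token per whitespace-separated word

# The pack now holds Tokens covering "a", "quick", and "test".
for token in pack.get_entries_by_type(Token):
    print(token.text)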
Code example #10
    def test_back_translation(self):
        random.seed(0)
        data_pack = DataPack()
        text = "Natural Language Processing has never been made this simple!"
        data_pack.set_text(text)
        sent = Sentence(data_pack, 0, len(text))
        data_pack.add_entry(sent)

        translated_text = "The treatment of natural language has never been easier!"
        self.assertEqual(translated_text, self.bta.replace(sent)[1])
Code example #11
    def test_replace(self):
        random.seed(42)
        data_pack = DataPack()
        test_string = "The lazy fox jumped over the fence"
        test_result = "T/-/3 lazy f0>< jumpe|) oveI2 th3 fe^ce"
        data_pack.set_text(test_string)
        token_1 = Token(data_pack, 0, len(test_string))
        data_pack.add_entry(token_1)

        self.assertEqual(self.test.replace(token_1)[1], test_result)
Code example #12
 def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
     with open(file_path, "r", encoding="utf8") as doc:
         for line in doc:
             pack = DataPack(doc_id=file_path)
             line = line.strip()
             if len(line) == 0:
                 continue
             sent = Sentence(pack, 0, len(line))
             pack.add_entry(sent)
             pack.set_text(line)
             self.count += 1
             yield pack
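
A reader like this is normally not called directly; it is attached to a pipeline, which invokes `_parse_pack` once per file. A hedged sketch follows, with `SentenceLineReader` as a hypothetical name for a reader class wrapping the method above; `set_reader`, `initialize`, and `process_dataset` are standard forte `Pipeline` calls, though their exact signatures differ across versions, and `Sentence` is assumed to be imported as in the examples.

from forte.pipeline import Pipeline

pipeline = Pipeline()
pipeline.set_reader(SentenceLineReader())  # hypothetical reader class
pipeline.initialize()

# Each non-empty line of input.txt becomes one DataPack with one Sentence.
for pack in pipeline.process_dataset("input.txt"):
    for sent in pack.get_entries_by_type(Sentence):
        print(sent.text)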
Code example #13
 def _parse_pack(self,
                 file_path: str) -> Iterator[DataPack]:  # type: ignore
     with open(file_path, "r", encoding="utf8") as doc:
         for line in doc:
             m_pack = MultiPack()
             pack = DataPack(doc_id=file_path)
             line = line.strip()
             if len(line) == 0:
                 continue
             sent = Sentence(pack, 0, len(line))
             pack.add_entry(sent)
             pack.set_text(line)
             self.count += 1
             m_pack.update_pack({"pack": pack})
             yield m_pack  # type: ignore
Code example #14
File: ontonotes_reader.py Project: williamwhe/forte
    def _process_coref_annotations(
        self,
        pack: DataPack,
        label: Optional[str],
        word_begin: int,
        word_end: int,
        coref_stacks: DefaultDict[int, List[int]],
        groups: DefaultDict[int, List[EntityMention]],
    ) -> None:

        if label is None or label == "-":
            return
        for segment in label.split("|"):
            # The CoNLL representation of coref spans allows spans to overlap.
            if segment[0] == "(":
                if segment[-1] == ")":
                    # The span begins and ends at this word (single word span).
                    group_id = int(segment[1:-1])

                    coref_mention = EntityMention(pack, word_begin, word_end)
                    coref_mention = pack.add_entry(coref_mention)

                    groups[group_id].append(coref_mention)
                else:
                    # The span is starting, so we record the index of the word.
                    group_id = int(segment[1:])
                    coref_stacks[group_id].append(word_begin)
            else:
                # The span for this id is ending, but did not start at this word.
                group_id = int(segment[:-1])
                start = coref_stacks[group_id].pop()
                coref_mention = EntityMention(pack, start, word_end)
                coref_mention = pack.add_or_get_entry(coref_mention)

                groups[group_id].append(coref_mention)
Code example #15
    def test_replace(self):
        data_pack = DataPack()
        data_pack.set_text("auxiliary colleague apple")
        token_1 = Token(data_pack, 0, 9)
        token_2 = Token(data_pack, 10, 19)
        token_3 = Token(data_pack, 20, 25)
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)
        data_pack.add_entry(token_3)

        self.assertIn(
            self.tyre.replace(token_1)[1],
            ["auxilliary", "auxilary", "auxillary"],
        )
        self.assertIn(self.tyre.replace(token_2)[1], ["collegue", "colleaque"])
        self.assertIn(self.tyre.replace(token_3)[1], ["apple"])
Code example #16
File: ontonotes_reader.py Project: williamwhe/forte
    def _process_pred_annotations(
        self,
        pack: DataPack,
        labels: List[str],
        word_begin: int,
        word_end: int,
        current_pred_arg: List[Optional[Tuple[int, str]]],
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]],
    ) -> None:

        for label_index, label in enumerate(labels):

            if "(" in label:
                # Entering a span.
                arg_type = label.strip("()*")
                current_pred_arg[label_index] = (word_begin, arg_type)

            if ")" in label:
                # Exiting a span
                if current_pred_arg[label_index] is None:
                    raise ValueError(
                        "current_pred_arg is None when meet right blanket.")

                arg_begin = current_pred_arg[label_index][0]  # type: ignore
                arg_type = current_pred_arg[label_index][1]  # type: ignore

                if arg_type != "V":
                    pred_arg = PredicateArgument(pack, arg_begin, word_end)
                    pred_arg = pack.add_entry(pred_arg)

                    verbal_pred_args[label_index].append((pred_arg, arg_type))
                current_pred_arg[label_index] = None
Code example #17
    def test_segmenter(self):
        data_pack = DataPack()
        data_pack.set_text("eat phone")
        token_1 = Token(data_pack, 0, 3)
        token_2 = Token(data_pack, 4, 9)
        token_1.pos = "VB"
        token_2.pos = None
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)

        self.assertIn(
            self.dra.replace(token_1)[1],
            [
                "eat",
                "feed",
                "eat on",
                "consume",
                "eat up",
                "use up",
                "deplete",
                "exhaust",
                "run through",
                "wipe out",
                "corrode",
                "rust",
            ],
        )
        self.assertIn(
            self.dra.replace(token_2)[1],
            [
                "telephone",
                "phone",
                "telephone set",
                "speech sound",
                "sound",
                "earphone",
                "earpiece",
                "headphone",
                "call",
                "telephone",
                "call up",
                "ring",
            ],
        )
Code example #18
        def _insert_new_span(
            entry_class: str,
            insert_ind: int,
            inserted_annos: List[Tuple[int, int]],
            new_pack: DataPack,
            spans: List[Span],
            new_spans: List[Span],
        ):
            """
            An internal helper function for insertion.

            Args:
                entry_class: The new annotation type to be created.
                insert_ind: The index at which to insert.
                inserted_annos: The annotation span information to be inserted.
                new_pack: The new data pack to insert the annotation into.
                spans: The original spans before replacement, should be
                  a sorted ascending list.
                new_spans: The new spans after replacement, should be
                  a sorted ascending list.
            """
            pos: int
            length: int
            pos, length = inserted_annos[insert_ind]
            insert_end: int = modify_index(
                pos,
                spans,
                new_spans,
                is_begin=False,
                # Include the inserted span itself.
                is_inclusive=True,
            )
            insert_begin: int = insert_end - length
            new_anno = create_class_with_kwargs(
                entry_class,
                {"pack": new_pack, "begin": insert_begin, "end": insert_end},
            )
            new_pack.add_entry(new_anno)
Code example #19
    def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:
        m_pack: MultiPack = MultiPack()

        input_pack_name = "input_src"
        output_pack_name = "output_tgt"

        with open(file_path, "r", encoding="utf8") as doc:
            text = ""
            offset = 0

            sentence_cnt = 0

            input_pack = DataPack(doc_id=file_path)

            for line in doc:
                line = line.strip()
                if len(line) == 0:
                    # skip empty lines
                    continue
                # add sentence
                sent = Sentence(input_pack, offset, offset + len(line))
                input_pack.add_entry(sent)
                text += line + '\n'
                offset = offset + len(line) + 1

                sentence_cnt += 1

                if sentence_cnt >= 20:
                    break

            input_pack.set_text(text, replace_func=self.text_replace_operation)

        output_pack = DataPack()

        m_pack.update_pack({
            input_pack_name: input_pack,
            output_pack_name: output_pack
        })

        yield m_pack
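
Downstream processors would pull the two packs back out by name. A short sketch, assuming `MultiPack.get_pack` as the name-based accessor (its exact form has shifted across forte versions):

input_pack = m_pack.get_pack("input_src")    # the pack with up to 20 sentences
output_pack = m_pack.get_pack("output_tgt")  # the empty pack for generated text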
Code example #20
 def _insert_new_span(insert_ind: int,
                      inserted_annos: List[Tuple[int, int]],
                      new_pack: DataPack, spans: List[Span],
                      new_spans: List[Span]):
     r"""
     An internal helper function for insertion.
     """
     pos: int
     length: int
     pos, length = inserted_annos[insert_ind]
     insert_end: int = modify_index(
         pos,
         spans,
         new_spans,
         is_begin=False,
         # Include the inserted span itself.
         is_inclusive=True)
     insert_begin: int = insert_end - length
     # Note: `entry` (the annotation type to create) comes from the enclosing scope.
     new_anno = create_class_with_kwargs(entry, {
         "pack": new_pack,
         "begin": insert_begin,
         "end": insert_end
     })
     new_pack.add_entry(new_anno)
Code example #21
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        with open(file_path, "r", encoding="utf8", errors='ignore') as file:
            dataset = json.load(file)

            pack = DataPack()
            text: str = dataset['article']
            article_end = len(text)
            article = Article(pack, 0, article_end)
            pack.add_entry(article)
            offset = article_end + 1

            for qid, ques_text in enumerate(dataset['questions']):
                text += '\n' + ques_text
                ques_end = offset + len(ques_text)
                question = Question(pack, offset, ques_end)
                offset = ques_end + 1

                options: List[Option] = []
                options_text = dataset['options'][qid]
                for option_text in options_text:
                    text += '\n' + option_text
                    option_end = offset + len(option_text)
                    option = Option(pack, offset, option_end)
                    options.append(option)
                    pack.add_entry(option)
                    offset = option_end + 1
                question.set_options(options)

                answers = dataset['answers'][qid]
                if not isinstance(answers, list):
                    answers = [answers]
                answers = [self._convert_to_int(ans) for ans in answers]
                question.set_answers(answers)
                pack.add_entry(question)

            pack.set_text(text, replace_func=self.text_replace_operation)

            passage_id: str = dataset['id']
            passage = Passage(pack, 0, len(pack.text))
            passage.set_passage_id(passage_id)
            pack.add_entry(passage)

            pack.meta.doc_id = passage_id
            yield pack
Code example #22
    def pack(self, data_pack: DataPack,
             inputs: Dict[str, List[Prediction]]) -> None:
        batch_predictions = inputs["predictions"]
        for predictions in batch_predictions:
            for pred_span, arg_result in predictions:

                pred = data_pack.add_entry(
                    PredicateMention(data_pack, pred_span.begin,
                                     pred_span.end))

                for arg_span, label in arg_result:
                    arg = data_pack.add_or_get_entry(
                        PredicateArgument(data_pack, arg_span.begin,
                                          arg_span.end))
                    link = PredicateLink(data_pack, pred, arg)
                    link.set_fields(arg_type=label)
                    data_pack.add_or_get_entry(link)
Code example #23
    def _copy_link_or_group(
        self,
        entry: Union[Link, Group],
        entry_map: Dict[int, int],
        new_pack: DataPack,
    ) -> bool:
        r"""
        This function copies a Link/Group in the data pack.
        If a child Link/Group has not been copied yet, it will recursively
        copy that child first. If a child Annotation has not been copied,
        it will abort and return False.

        Args:
            entry: The Link/Group in the original data pack to copy.
            entry_map: The dictionary mapping original entry to copied entry.
            new_pack: The new data pack, which is the destination of copy.
        Returns:
            A bool value indicating whether the copy happens.
        """

        # If the entry has been copied, return True.
        if entry.tid in entry_map:
            return True

        # The entry should be either Link or Group.
        is_link: bool = isinstance(entry, Link)

        # Get the children entries.
        children: List[Entry]
        if is_link:
            children = [entry.get_parent(), entry.get_child()]
        else:
            children = entry.get_members()

        # Copy the children entries.
        new_children: List[Entry] = []
        for child_entry in children:
            if isinstance(child_entry, (Link, Group)):
                # Recursively copy the children Links/Groups.
                if not self._copy_link_or_group(child_entry, entry_map,
                                                new_pack):
                    return False
            else:
                # Children Annotation must have been copied.
                if child_entry.tid not in entry_map:
                    return False
            new_child: Entry = new_pack.get_entry(entry_map[child_entry.tid])
            new_children.append(new_child)

        # Create the new entry and add to the new pack.
        new_entry: Entry
        if is_link:
            entry = cast(Link, entry)
            new_link_parent: Entry = new_children[0]
            new_link_child: Entry = new_children[1]
            new_entry = type(entry)(new_pack, new_link_parent,
                                    new_link_child)  # type: ignore
        else:
            entry = cast(Group, entry)
            new_entry = type(entry)(new_pack, new_children)  # type: ignore
        new_pack.add_entry(new_entry)
        entry_map[entry.tid] = new_entry.tid
        return True
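
The `entry_map` contract above implies a copy order: annotations must be copied first, registering each old `tid` against its new `tid`, before links and groups can be copied. A rough sketch of that order inside the same copying class, with `old_pack` and `new_pack` as invented names and the annotation copy reduced to the span-only case:

entry_map: Dict[int, int] = {}

# 1) Copy annotations and record the tid mapping.
for anno in old_pack.get_entries_by_type(EntityMention):
    copied = EntityMention(new_pack, anno.span.begin, anno.span.end)
    new_pack.add_entry(copied)
    entry_map[anno.tid] = copied.tid

# 2) Links/groups can now resolve their children through entry_map.
for link in old_pack.get_entries_by_type(PredicateLink):
    self._copy_link_or_group(link, entry_map, new_pack)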
Code example #24
File: ontonotes_reader.py Project: williamwhe/forte
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        pack = DataPack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)
                    if fields.pos_tag is not None:
                        token.set_fields(pos=fields.pos_tag)
                    if fields.word_sense is not None:
                        token.set_fields(sense=fields.word_sense)
                    pack.add_entry(token)

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        kwargs_i = {
                            "pred_lemma": fields.lemmatised_word,
                            "pred_type": ("verb" if word_is_verbal_predicate
                                          else "other"),
                        }
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)
                        pred_mention.set_fields(**kwargs_i)
                        if fields.framenet_id is not None:
                            pred_mention.set_fields(
                                framenet_id=fields.framenet_id)
                        pack.add_entry(pred_mention)

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    offset = word_end + 1
                    has_rows = True

                else:
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            kwargs_i = {
                                "arg_type": arg[1],
                            }
                            link = PredicateLink(pack, predicate, arg[0])
                            link.set_fields(**kwargs_i)
                            pack.add_entry(link)

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.set_fields(speaker=speaker)
                    if part_id is not None:
                        sent.set_fields(part_id=int(part_id))
                    pack.add_entry(sent)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                # kwargs_i = {"coref_type": group_id}
                group = CoreferenceGroup(pack)
                # group.set_fields(**kwargs_i)
                group.add_members(mention_list)
                pack.add_entry(group)

            text = " ".join(words)
            document = Document(pack, 0, len(text))
            pack.add_entry(document)

            if document_id is not None:
                pack.set_meta(doc_id=document_id)
            pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack