Ejemplo n.º 1
0
 def _create_srl(input_pack: DataPack, tokens: List[Token],
                 result: Dict[str, List[str]]) -> None:
     """Write semantic-role-labeling annotations into ``input_pack``.

     Args:
         input_pack: The data pack that receives the new annotations.
         tokens: Tokens of the current sentence; spans returned by the
             tag parser are token indices into this list and are mapped
             here to character offsets via each token's begin/end.
         result: Model output; ``result['srl_tags']`` holds one BIO tag
             sequence per detected predicate.
     """
     # The original looped with `for _, tag in enumerate(...)` and threw
     # the index away — plain iteration is the idiomatic form.
     for tag in result['srl_tags']:
         pred_span, arguments = parse_allennlp_srl_tags(tag)
         if not pred_span:
             # Tag sequence contains no predicate; nothing to annotate.
             continue
         pred = PredicateMention(input_pack, tokens[pred_span.begin].begin,
                                 tokens[pred_span.end].end)
         for arg_span, label in arguments:
             arg = PredicateArgument(input_pack,
                                     tokens[arg_span.begin].begin,
                                     tokens[arg_span.end].end)
             link = PredicateLink(input_pack, pred, arg)
             link.arg_type = label
Ejemplo n.º 2
0
    def pack(self, data_pack: DataPack,
             inputs: Dict[str, List[Prediction]]) -> None:
        """Convert batched SRL predictions into annotations on ``data_pack``.

        Each prediction is a sequence of ``(predicate_span, argument_list)``
        pairs; spans are assumed to already be character offsets into the
        pack text — TODO confirm against the predictor.
        """
        for prediction in inputs["predictions"]:
            for predicate_span, argument_results in prediction:
                mention = PredicateMention(
                    data_pack, predicate_span.begin, predicate_span.end)

                for argument_span, argument_label in argument_results:
                    argument = PredicateArgument(
                        data_pack, argument_span.begin, argument_span.end)
                    srl_link = PredicateLink(data_pack, mention, argument)
                    srl_link.arg_type = argument_label
Ejemplo n.º 3
0
    def pack(
        self,
        pack: DataPack,
        predict_results: Dict[str, List[Prediction]],
        _: Optional[Annotation] = None,
    ):
        """Store batched SRL predict results as annotations on ``pack``.

        ``predict_results["predictions"]`` holds one prediction per batch
        item, each a sequence of ``(predicate_span, argument_list)`` pairs.
        The third positional parameter (context) is accepted but unused.
        """
        for prediction in predict_results["predictions"]:
            for predicate_span, argument_results in prediction:
                mention = PredicateMention(
                    pack, predicate_span.begin, predicate_span.end)

                for argument_span, argument_label in argument_results:
                    argument = PredicateArgument(
                        pack, argument_span.begin, argument_span.end)
                    srl_link = PredicateLink(pack, mention, argument)
                    srl_link.arg_type = argument_label
Ejemplo n.º 4
0
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        """Read one tab-separated Open IE file into a single DataPack.

        Each non-empty line holds: sentence, head predicate, full
        predicate, then zero or more arguments.  Every field is appended
        to the pack text followed by a single separator space, and
        annotated with its character span.
        """
        pack = DataPack()
        segments: List[str] = []
        cursor = 0

        def claim(piece: str):
            # Reserve the next span of the pack text for `piece` (plus
            # one trailing separator space) and return its (begin, end).
            nonlocal cursor
            begin = cursor
            end = begin + len(piece)
            segments.append(piece)
            cursor = end + 1
            return begin, end

        with open(file_path, "r", encoding="utf8") as f:
            for raw_line in f:
                stripped = raw_line.strip()
                if stripped == "":
                    continue
                columns = stripped.split("\t")

                # Column 0: the sentence itself.
                Sentence(pack, *claim(columns[0]))

                # Column 1: the head predicate (a single token).
                head_token = Token(pack, *claim(columns[1]))

                # Column 2: the full predicate, linked to its head word.
                full_predicate = PredicateMention(pack, *claim(columns[2]))
                full_predicate.headword = head_token

                # Columns 3+: predicate arguments, each linked back to
                # the full predicate.
                for argument_text in columns[3:]:
                    predicate_arg = PredicateArgument(
                        pack, *claim(argument_text))
                    PredicateLink(pack, full_predicate, predicate_arg)

        # Every segment contributed "<segment> " to the text, matching
        # the offsets handed out by `claim`.
        text = "".join(piece + " " for piece in segments)
        pack.set_text(text, replace_func=self.text_replace_operation)

        Document(pack, 0, len(text))

        pack.pack_name = file_path

        yield pack
Ejemplo n.º 5
0
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        """Parse one OntoNotes/CoNLL-style file into a single DataPack.

        Walks the file line by line: data rows add tokens, entity
        mentions, predicate mentions/arguments and coreference mentions;
        blank separator rows close out the current sentence.  The pack
        text is the whitespace-joined word sequence, so all character
        offsets are computed incrementally as words are consumed.

        Args:
            file_path: Path of the annotation file to read.

        Yields:
            The single populated DataPack for this file.
        """
        pack = self.new_pack()

        with open(file_path, encoding="utf8") as doc:
            words = []
            offset = 0
            # True once the current sentence has at least one data row;
            # guards against emitting empty sentences on repeated blanks.
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            # Open entity mention carried across rows, if any.
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            # One slot per predicate column: the currently-open argument.
            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            # One list per predicate column: completed (argument, label).
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            # Coref cluster id -> mentions collected for that cluster.
            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            # Coref cluster id -> begin offsets of still-open mentions.
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    # Data row: one word plus its annotation columns.
                    fields = self._parse_line(line)
                    speaker = fields.speaker
                    if fields.part_number is not None:
                        part_id = int(fields.part_number)
                    document_id = fields.document_id

                    assert fields.word is not None
                    word_begin = offset
                    word_end = offset + len(fields.word)

                    # add tokens
                    token = Token(pack, word_begin, word_end)

                    if fields.pos_tag is not None:
                        token.pos = fields.pos_tag
                    if fields.word_sense is not None:
                        token.sense = fields.word_sense

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack,
                        fields.entity_label,
                        word_begin,
                        word_end,
                        current_entity_mention,
                    )

                    # add predicate mentions
                    if (fields.lemmatised_word is not None
                            and fields.lemmatised_word != "-"):
                        # "(V" in any predicate column marks the verbal
                        # predicate of that frame.
                        word_is_verbal_predicate = any(
                            "(V" in x for x in fields.predicate_labels)
                        pred_mention = PredicateMention(
                            pack, word_begin, word_end)

                        pred_mention.predicate_lemma = fields.lemmatised_word
                        pred_mention.is_verb = word_is_verbal_predicate

                        if fields.framenet_id is not None:
                            pred_mention.framenet_id = fields.framenet_id

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    # Lazily size the per-predicate accumulators from the
                    # first data row of the sentence.
                    if not verbal_pred_args:
                        current_pred_arg = [None] * len(
                            fields.predicate_labels)
                        verbal_pred_args = [[]
                                            for _ in fields.predicate_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        fields.predicate_labels,
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        fields.coreference,
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    words.append(fields.word)
                    # +1 accounts for the separator space inserted when
                    # the text is joined at the end.
                    offset = word_end + 1
                    has_rows = True

                else:
                    # Blank/comment row: sentence boundary.
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            link = PredicateLink(pack, predicate, arg[0])
                            link.arg_type = arg[1]

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    # offset - 1 excludes the trailing separator space.
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    if speaker is not None:
                        sent.speaker = speaker
                    if part_id is not None:
                        sent.part_id = int(part_id)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                group = CoreferenceGroup(pack)
                group.add_members(mention_list)

            text = " ".join(words)
            pack.set_text(text, replace_func=self.text_replace_operation)

            _ = Document(pack, 0, len(text))
            if document_id is not None:
                pack.pack_name = document_id
        yield pack
Ejemplo n.º 6
0
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        """Parse one OntoNotes/CoNLL-style file into a single DataPack.

        Older variant that splits raw columns itself and uses the legacy
        ``set_fields`` / ``add_or_get_entry`` entry API.  Data rows add
        tokens, entity mentions, predicate mentions/arguments and
        coreference mentions; blank rows close the current sentence.
        The pack text accumulates ``word + " "`` per row, so offsets are
        computed incrementally.

        Args:
            file_path: Path of the annotation file to read.

        Yields:
            The single populated DataPack for this file.
        """
        pack = DataPack()

        with open(file_path, encoding="utf8") as doc:
            text = ""
            offset = 0
            # True once the current sentence has at least one data row.
            has_rows = False

            speaker = part_id = document_id = None
            sentence_begin = 0

            # auxiliary structures
            # Open entity mention carried across rows, if any.
            current_entity_mention: Optional[Tuple[int, str]] = None
            verbal_predicates: List[PredicateMention] = []

            # One slot per predicate column: the currently-open argument.
            current_pred_arg: List[Optional[Tuple[int, str]]] = []
            # One list per predicate column: completed (argument, label).
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

            # Coref cluster id -> mentions collected for that cluster.
            groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
            # Coref cluster id -> begin offsets of still-open mentions.
            coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

            for line in doc:
                line = line.strip()

                if line.startswith("#end document"):
                    break

                if line != "" and not line.startswith("#"):
                    # Data row: fixed CoNLL column layout — the predicate
                    # label columns run from index 11 up to the final
                    # (coreference) column.
                    conll_components = line.split()
                    document_id = conll_components[0]
                    part_id = int(conll_components[1])
                    word = conll_components[3]
                    pos_tag = conll_components[4]
                    lemmatised_word = conll_components[6]
                    framenet_id = conll_components[7]
                    word_sense = conll_components[8]
                    speaker = conll_components[9]
                    entity_label = conll_components[10]
                    pred_labels = conll_components[11:-1]

                    word_begin = offset
                    word_end = offset + len(word)

                    # add tokens
                    kwargs_i: Dict[str, Any] = {"pos": pos_tag,
                                                "sense": word_sense}
                    token = Token(pack, word_begin, word_end)
                    token.set_fields(**kwargs_i)
                    pack.add_or_get_entry(token)

                    # add entity mentions
                    current_entity_mention = self._process_entity_annotations(
                        pack, entity_label, word_begin, word_end,
                        current_entity_mention
                    )

                    # add predicate mentions
                    if lemmatised_word != "-":
                        # "(V" in any predicate column marks the verbal
                        # predicate of that frame.
                        word_is_verbal_predicate = any(
                            ["(V" in x for x in pred_labels]
                        )
                        kwargs_i = {
                            "framenet_id": framenet_id,
                            "pred_lemma": lemmatised_word,
                            "pred_type": "verb" if word_is_verbal_predicate
                            else "other"
                        }
                        pred_mention = PredicateMention(
                                pack, word_begin, word_end)
                        pred_mention.set_fields(**kwargs_i)
                        pred_mention = pack.add_or_get_entry(
                            pred_mention
                        )

                        if word_is_verbal_predicate:
                            verbal_predicates.append(pred_mention)

                    # Lazily size the per-predicate accumulators from the
                    # first data row of the sentence.
                    if not verbal_pred_args:
                        current_pred_arg = [None for _ in pred_labels]
                        verbal_pred_args = [[] for _ in pred_labels]

                    # add predicate arguments
                    self._process_pred_annotations(
                        pack,
                        conll_components[11:-1],
                        word_begin,
                        word_end,
                        current_pred_arg,
                        verbal_pred_args,
                    )

                    # add coreference mentions
                    self._process_coref_annotations(
                        pack,
                        conll_components[-1],
                        word_begin,
                        word_end,
                        coref_stacks,
                        groups,
                    )

                    text += word + " "
                    # +1 accounts for the separator space appended above.
                    offset = word_end + 1
                    has_rows = True

                else:
                    # Blank/comment row: sentence boundary.
                    if not has_rows:
                        continue

                    # add predicate links in the sentence
                    for predicate, pred_arg in zip(verbal_predicates,
                                                   verbal_pred_args):
                        for arg in pred_arg:
                            kwargs_i = {
                                "arg_type": arg[1],
                            }
                            link = PredicateLink(pack, predicate, arg[0])
                            link.set_fields(**kwargs_i)
                            pack.add_or_get_entry(link)

                    verbal_predicates = []
                    current_pred_arg = []
                    verbal_pred_args = []

                    # add sentence

                    # offset - 1 excludes the trailing separator space.
                    kwargs_i = {"speaker": speaker, "part_id": part_id}
                    sent = Sentence(pack, sentence_begin, offset - 1)
                    sent.set_fields(**kwargs_i)
                    pack.add_or_get_entry(sent)

                    sentence_begin = offset

                    has_rows = False

            # group the coreference mentions in the whole document
            for _, mention_list in groups.items():
                # kwargs_i = {"coref_type": group_id}
                group = CoreferenceGroup(pack)
                # group.set_fields(**kwargs_i)
                group.add_members(mention_list)
                pack.add_or_get_entry(group)

            document = Document(pack, 0, len(text))
            pack.add_or_get_entry(document)

            kwargs_i = {"doc_id": document_id}
            pack.set_meta(**kwargs_i)
            pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack