Example #1
    def annotate_text(self, text, textname, sentence_filter):

        sentences = []
        sent_id = 1

        doc = self.annotator(text)
        for sent in doc.sents:
            # sentence_filter returns True for sentences that should be skipped
            if sentence_filter([token.text for token in sent]):
                continue

            sentence = []
            for tok_id, token in enumerate(sent):
                # skip whitespace-only tokens; head indices are sentence-relative
                # and 1-based, with the syntactic root pointing to 0
                if not token.is_space:
                    sentence.append(
                        collections.OrderedDict([
                            ('id', tok_id + 1), ('form', token.text),
                            ('lemma', token.lemma_), ('upos', token.pos_),
                            ('xpos', token.tag_), ('feats', None),
                            ('head', 0 if token.dep_ == "ROOT"
                             else token.head.i - sent[0].i + 1),
                            ('deprel', token.dep_), ('deps', None),
                            ('misc', None)
                        ]))

            sentences.append(
                conllu.TokenList(sentence,
                                 metadata=collections.OrderedDict([
                                     ('sent_id', textname + '.' + str(sent_id))
                                 ])))
            sent_id += 1

        return sentences
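
The snippet above reads like a method of a converter class whose self.annotator is a spaCy pipeline (doc.sents, token.lemma_ and token.dep_ are spaCy attributes). Below is a self-contained sketch of the same spaCy-to-conllu conversion, assuming spaCy and its en_core_web_sm model are installed; the input text and the sent_id prefix are purely illustrative.

import collections

import conllu
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat sat on the mat. It purred.")

token_lists = []
for sent_id, sent in enumerate(doc.sents, start=1):
    tokens = [
        collections.OrderedDict([
            ("id", i + 1), ("form", tok.text), ("lemma", tok.lemma_),
            ("upos", tok.pos_), ("xpos", tok.tag_), ("feats", None),
            # head is sentence-relative and 1-based; the root points to 0
            ("head", 0 if tok.dep_ == "ROOT" else tok.head.i - sent[0].i + 1),
            ("deprel", tok.dep_), ("deps", None), ("misc", None),
        ])
        for i, tok in enumerate(sent) if not tok.is_space
    ]
    token_lists.append(conllu.TokenList(
        tokens,
        metadata=collections.OrderedDict([("sent_id", f"example.{sent_id}")]),
    ))

print(token_lists[0].serialize())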
Example #2
    def _tacred_example_to_token_list(
            self, example: Dict[str, Any]) -> conllu.TokenList:
        id_ = example["id"]
        tokens = example["token"]
        ner = example["stanford_ner"]

        subj_start = example["subj_start"]
        subj_end = example["subj_end"]
        obj_start = example["obj_start"]
        obj_end = example["obj_end"]

        subj_tag = example["subj_type"]
        obj_tag = example["obj_type"]

        label = example["relation"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": ";".join([
                str(subj_start + 1),
                str(subj_end + 1),
                str(obj_start + 1),
                str(obj_end + 1),
                label,
            ]),
        }

        prev_tag = None
        token_dicts = []
        for idx, (token, tag) in enumerate(zip(tokens, ner)):
            if subj_start <= idx <= subj_end:
                tag = subj_tag

            if obj_start <= idx <= obj_end:
                tag = obj_tag

            prefix = ""
            if tag != "O":
                if tag != prev_tag:
                    prefix = "B-"
                else:
                    prefix = "I-"

            prev_tag = tag

            token_dicts.append(
                Token({
                    "id": str(idx + 1),
                    "form": convert_ptb_token(token),
                    "ner": prefix + tag,
                }))

        return conllu.TokenList(tokens=token_dicts,
                                metadata=Metadata(metadata))
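
An illustrative (invented) TACRED-style input for the method above, to make the index conventions concrete: subj_start/subj_end and obj_start/obj_end are inclusive 0-based token offsets, and the relations metadata rewrites them as 1-based inclusive spans.

# Hypothetical example dict; field values are invented for illustration.
example = {
    "id": "sample-1",
    "token": ["Bill", "Gates", "founded", "Microsoft", "."],
    "stanford_ner": ["PERSON", "PERSON", "O", "ORGANIZATION", "O"],
    "subj_start": 3, "subj_end": 3,   # "Microsoft" (inclusive, 0-based)
    "obj_start": 0, "obj_end": 1,     # "Bill Gates"
    "subj_type": "ORGANIZATION",
    "obj_type": "PERSON",
    "relation": "org:founded_by",
}
# _tacred_example_to_token_list(example) yields a TokenList whose metadata
# contains relations = "4;4;1;2;org:founded_by" and whose ner column reads
# B-PERSON, I-PERSON, O, B-ORGANIZATION, O.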
Example #3
def tokens2conllu(tokens):
    # Build one CoNLL-U token per input word: only id and form are filled in,
    # the remaining DEFAULT_FIELDS are left as "_".
    tokens = [
        OrderedDict(
            zip(
                conllu.parser.DEFAULT_FIELDS,
                [i + 1, unescape(token)] +
                ["_"] * (len(conllu.parser.DEFAULT_FIELDS) - 2),
            )) for i, token in enumerate(tokens)
    ]
    return conllu.TokenList(tokens)
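
A quick usage sketch for tokens2conllu; the imports the function itself relies on are spelled out here, with the origin of unescape being an assumption (html.unescape) rather than something stated in the snippet.

from collections import OrderedDict
from html import unescape  # assumption: reverses HTML entity escaping

import conllu

tl = tokens2conllu(["Fig.", "2", "shows", "the", "results", "."])
print(tl.serialize())
# Roughly (tab-separated, shown here with spaces for readability):
# 1  Fig.  _  _  _  _  _  _  _  _
# 2  2     _  _  _  _  _  _  _  _
# ... only id and form are filled; the remaining columns stay "_".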
Example #4
    def _src_token_list_to_token_list(self, src_token_list):
        tokens = []
        token_dicts = []
        ner_tags = []
        for index, token in enumerate(src_token_list, start=1):
            text = token["form"]
            ner_tag = token["ner"]
            tokens.append(text)
            ner_tags.append(ner_tag)

            token_dicts.append({
                "id": str(index),
                "form": text,
                "ner": ner_tag,
            })

        span_end_to_span = {
            end: (start, end)
            for start, end in self._bio_tags_to_spans(ner_tags)
        }

        relations = []
        for index, token in enumerate(src_token_list):
            for relation, head in zip(token["relations"],
                                      token["relation_heads"]):
                if relation == "N":
                    continue

                subj_start, subj_end = span_end_to_span[index]
                obj_start, obj_end = span_end_to_span[head]
                relations.append(
                    (subj_start, subj_end, obj_start, obj_end, relation))

        doc_id = src_token_list.metadata["doc"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": doc_id,
            "relations": "|".join([
                ";".join([
                    str(subj_start + 1),
                    str(subj_end + 1),
                    str(obj_start + 1),
                    str(obj_end + 1),
                    relation,
                ]) for subj_start, subj_end, obj_start, obj_end, relation
                in relations
            ]),
        }

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)
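
The method assumes a _bio_tags_to_spans helper that is not shown. The following is a minimal sketch of what it would have to return for the lookups above to work, i.e. inclusive (start, end) token-index spans keyed later by their last token; this is an assumption, not the project's actual implementation.

    @staticmethod
    def _bio_tags_to_spans(tags):
        # Hypothetical sketch: collapse a BIO tag sequence into inclusive
        # (start, end) token-index spans, one per entity.
        spans = []
        start = None
        for i, tag in enumerate(tags):
            if tag.startswith("B-"):
                if start is not None:
                    spans.append((start, i - 1))
                start = i
            elif tag.startswith("I-") and start is not None:
                continue
            else:
                if start is not None:
                    spans.append((start, i - 1))
                start = None
        if start is not None:
            spans.append((start, len(tags) - 1))
        return spans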
Example #5
    def finalize_sentence(sentence):
        # Closes over `sentences`, `meta`, and `encode_entities` from the
        # enclosing scope; appends the sentence as a serialized CoNLL-U block.
        encode_entities(sentence)
        sent_meta = {
            "sent_id": meta['document_cts_urn'][18:] + "-" + str(len(sentences) + 1),
            "doc_id": meta['document_cts_urn'],
            "segmentation": meta['segmentation'],
            "tagging": meta['tagging'],
            "parsing": meta.get('parsing', 'none'),
            "entities": meta.get('entities', 'none'),
        }
        tl = conllu.TokenList(sentence, sent_meta)
        sentences.append(tl.serialize())
Example #6
    def drugprot_document_to_tokenlists(
        self,
        pmid: str,
        title_sentences: List[Sentence],
        abstract_sentences: List[Sentence],
        abstract_offset: int,
        entities: Dict[str, Tuple[str, int, int, str]],
        relations: Set[Tuple[str, str, str]],
    ) -> List[conllu.TokenList]:
        tokenlists: List[conllu.TokenList] = []
        sentence_id = 1
        for offset, sents in [
            (0, title_sentences),
            (abstract_offset, abstract_sentences),
        ]:
            for sent in sents:
                assert sent.start_pos is not None
                assert sent.end_pos is not None
                sent_char_start = sent.start_pos + offset
                sent_char_end = sent.end_pos + offset

                entities_in_sent = set()
                for entity_id, (_, char_start, char_end,
                                _) in entities.items():
                    if sent_char_start <= char_start and char_end <= sent_char_end:
                        entities_in_sent.add(entity_id)

                entity_char_spans = [(entities[entity_id][1],
                                      entities[entity_id][2])
                                     for entity_id in entities_in_sent]

                token_offsets = [(
                    sent.start_pos + (token.start_pos or 0) + offset,
                    sent.start_pos + (token.end_pos or 0) + offset,
                ) for token in sent.tokens]
                entity_token_spans = self.char_spans_to_token_spans(
                    entity_char_spans, token_offsets)

                tags_1 = ["O"] * len(sent)
                tags_2 = ["O"] * len(sent)
                entity_id_to_token_idx = {}

                ordered_entities = sorted(
                    zip(entities_in_sent, entity_token_spans),
                    key=lambda x: x[1][1] - x[1][0],
                    reverse=True,
                )

                for entity_id, entity_span in ordered_entities:

                    entity_id_to_token_idx[entity_id] = entity_span

                    # check if first tag row is already occupied
                    token_start, token_end = entity_span
                    tag_1_occupied = False
                    for i in range(token_start, token_end):
                        if tags_1[i] != "O":
                            tag_1_occupied = True

                    # if first tag row is occupied, use second tag row
                    tags = tags_2 if tag_1_occupied else tags_1

                    tag = entities[entity_id][0]
                    token_start, token_end = entity_span
                    for i in range(token_start, token_end):
                        if i == token_start:
                            prefix = "B-"
                        else:
                            prefix = "I-"

                        tags[i] = prefix + tag

                token_dicts = []
                for i, (token, tag_1,
                        tag_2) in enumerate(zip(sent, tags_1, tags_2)):
                    # hardcoded mapping TODO: perhaps find nicer solution
                    tag_1 = tag_1.replace("GENE-N", "GENE")
                    tag_1 = tag_1.replace("GENE-Y", "GENE")
                    tag_2 = tag_2.replace("GENE-N", "GENE")
                    tag_2 = tag_2.replace("GENE-Y", "GENE")

                    token_dicts.append(
                        Token({
                            "id": str(i + 1),
                            "form": token.text,
                            "ner": tag_1,
                            "ner-2": tag_2,
                        }))

                relations_in_sent = []
                for relation, ent1, ent2 in [
                        r for r in relations
                        if {r[1], r[2]} <= entities_in_sent
                ]:
                    subj_start = entity_id_to_token_idx[ent1][0]
                    subj_end = entity_id_to_token_idx[ent1][1]
                    obj_start = entity_id_to_token_idx[ent2][0]
                    obj_end = entity_id_to_token_idx[ent2][1]
                    relations_in_sent.append(
                        (subj_start, subj_end, obj_start, obj_end, relation))

                metadata = {
                    "text": sent.to_original_text(),
                    "doc_id": pmid,
                    "sentence_id": str(sentence_id),
                    "relations": "|".join([
                        ";".join([
                            str(subj_start + 1),
                            str(subj_end),
                            str(obj_start + 1),
                            str(obj_end),
                            relation,
                        ]) for subj_start, subj_end, obj_start, obj_end,
                        relation in relations_in_sent
                    ]),
                }

                tokenlists.append(
                    conllu.TokenList(tokens=token_dicts,
                                     metadata=Metadata(metadata)))

                sentence_id += 1

        return tokenlists
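
Both DrugProt conversions on this page (this one and Example #10) rely on a char_spans_to_token_spans helper that is not included in the snippets. Below is a plausible sketch under the assumption that character and token spans are end-exclusive; it is not the original implementation.

    @staticmethod
    def char_spans_to_token_spans(char_spans, token_offsets):
        # Hypothetical sketch: map each character span to a (start, end)
        # token-index span with an exclusive end, by collecting the tokens
        # whose character offsets overlap the entity span.
        token_spans = []
        for char_start, char_end in char_spans:
            covered = [
                i for i, (tok_start, tok_end) in enumerate(token_offsets)
                if tok_start < char_end and char_start < tok_end
            ]
            token_spans.append(
                (min(covered), max(covered) + 1) if covered else (0, 0))
        return token_spans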
Example #7
    def _semeval_lines_to_token_list(self, raw_lines, augment_relations):
        raw_id, raw_text = raw_lines[0].split("\t")
        label = raw_lines[1]
        id_ = int(raw_id)
        raw_text = raw_text.strip('"')

        # Some special cases (e.g., missing spaces before entity marker)
        if id_ in [213, 4612, 6373, 8411, 9867]:
            raw_text = raw_text.replace("<e2>", " <e2>")
        if id_ in [2740, 4219, 4784]:
            raw_text = raw_text.replace("<e1>", " <e1>")
        if id_ == 9256:
            raw_text = raw_text.replace("log- jam", "log-jam")

        # necessary if text should be whitespace tokenizeable
        if id_ in [2609, 7589]:
            raw_text = raw_text.replace("1 1/2", "1-1/2")
        if id_ == 10591:
            raw_text = raw_text.replace("1 1/4", "1-1/4")
        if id_ == 10665:
            raw_text = raw_text.replace("6 1/2", "6-1/2")

        raw_text = re.sub(r"([.,!?()])$", r" \1", raw_text)
        raw_text = re.sub(r"(e[12]>)([',;:\"\(\)])", r"\1 \2", raw_text)
        raw_text = re.sub(r"([',;:\"\(\)])(</?e[12])", r"\1 \2", raw_text)
        raw_text = raw_text.replace("<e1>", "<e1> ")
        raw_text = raw_text.replace("<e2>", "<e2> ")
        raw_text = raw_text.replace("</e1>", " </e1>")
        raw_text = raw_text.replace("</e2>", " </e2>")

        tokens = raw_text.split(" ")

        # Handle case where tail may occur before the head
        subj_start = tokens.index("<e1>")
        obj_start = tokens.index("<e2>")
        if subj_start < obj_start:
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)
            obj_start = tokens.index("<e2>")
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
        else:
            tokens.pop(obj_start)
            obj_end = tokens.index("</e2>")
            tokens.pop(obj_end)
            subj_start = tokens.index("<e1>")
            tokens.pop(subj_start)
            subj_end = tokens.index("</e1>")
            tokens.pop(subj_end)

        relation = ";".join([
            str(subj_start + 1),
            str(subj_end),
            str(obj_start + 1),
            str(obj_end),
            label,
        ])

        if augment_relations:
            label_inverted = label.replace("e1", "e3")
            label_inverted = label_inverted.replace("e2", "e1")
            label_inverted = label_inverted.replace("e3", "e2")
            relation_inverted = ";".join([
                str(obj_start + 1),
                str(obj_end),
                str(subj_start + 1),
                str(subj_end),
                label_inverted,
            ])

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": (relation + "|" + relation_inverted
                          if augment_relations else relation),
        }

        token_dicts = []
        for idx, token in enumerate(tokens):
            tag = "O"
            prefix = ""

            if subj_start <= idx < subj_end:
                prefix = "B-" if idx == subj_start else "I-"
                tag = "E1"
            elif obj_start <= idx < obj_end:
                prefix = "B-" if idx == obj_start else "I-"
                tag = "E2"

            token_dicts.append({
                "id": str(idx + 1),
                "form": token,
                "ner": prefix + tag,
            })

        return conllu.TokenList(tokens=token_dicts, metadata=metadata)
Example #8
    def to_conllu(self):
        return conllu.TokenList([token.serialize() for token in self[1:]],
                                metadata=self.meta)
Example #9
    def predict(self,
                input: str,
                output: Optional[str] = None,
                pos: bool = True,
                ner: bool = False,
                srl: bool = False,
                dep: bool = True,
                sdp: bool = False):
        """
        预测文本并输出为 conllu 格式
        :param input: 要预测的文件,每行一句话
        :param output: 输出的结果文件,默认是输入文件添加 .conll 后缀
        :param pos: 是否输出 词性标注 结果 ['True','False']
        :param ner: 是否输出 命名实体识别 结果 ['True','False'], 占用 conllu feats 列
        :param srl: 是否输出 语义角色标注 结果 ['True','False'], 占用 conllu misc 列
        :param dep: 是否输出 依存句法分析 结果 ['True','False']
        :param sdp: 是否输出 语义依存分析 结果 ['True','False']
        """
        if output is None:
            output = f"{input}.conllu"

        with open(output, mode='w', encoding='utf-8') as f:
            sentences = sum([sent for idx, sent in iter_lines(input)], [])
            results = self._predict(sentences, pos, ner, srl, dep, sdp)

            for text, (seg_s, pos_s, ner_s, dep_s, sdp_s,
                       srl_s) in zip(sentences, results):
                tokens = conllu.TokenList([
                    conllu.models.Token(id=idx + 1,
                                        form=token,
                                        lemma=token,
                                        upos=pos if pos else '_',
                                        xpos=pos if pos else '_',
                                        feats='O' if ner else '_',
                                        head=idx,
                                        deprel='_',
                                        deps='' if sdp else '_',
                                        misc='SpaceAfter=No')
                    for idx, (token,
                              pos) in enumerate(zip_longest(seg_s, pos_s))
                ], conllu.models.Metadata(text=text))

                if ner:
                    for tag, start, end in ner_s:
                        # B- on the first token of the entity, I- on the rest
                        tokens[start]['feats'] = f'B-{tag}'
                        for i in range(start + 1, end):
                            tokens[i]['feats'] = f'I-{tag}'
                if dep:
                    for id, head, tag in dep_s:
                        tokens[id - 1]['head'] = head
                        tokens[id - 1]['deprel'] = tag
                if sdp:
                    for id, head, tag in sdp_s:
                        if tokens[id - 1]['deps']:
                            tokens[id - 1]['deps'] = tokens[
                                id - 1]['deps'] + f"|{head}:{tag}"
                        else:
                            tokens[id - 1]['deps'] = f"{head}:{tag}"

                if srl:
                    srl_predicate, srl_roles = list(zip(*srl_s))
                    srl_predicate_num = len(srl_predicate)
                    if srl_predicate_num > 0:
                        srl_misc = [[
                            f'Predicate={"Y" if i in srl_predicate else "_"}',
                            ['O'] * srl_predicate_num
                        ] for i in range(len(tokens))]
                        for idx, srl_role in enumerate(srl_roles):
                            for tag, start, end in srl_role:
                                # B- on the first token of the role span,
                                # I- on the remaining tokens
                                srl_misc[start][-1][idx] = f'B-{tag}'
                                for i in range(start + 1, end):
                                    srl_misc[i][-1][idx] = f'I-{tag}'
                        srl_misc = [
                            "|".join([s[0], "Role=" + ",".join(s[-1])])
                            for s in srl_misc
                        ]

                        for token, misc in zip(tokens, srl_misc):
                            token['misc'] = f"{token['misc']}|{misc}"

                f.write(tokens.serialize())
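
The method reads its input through an iter_lines helper that is not shown; judging from the docstring ("one sentence per line") it presumably yields a line index together with the sentences found on that line, which sum(..., []) then flattens. A rough sketch under that assumption:

def iter_lines(path):
    # Hypothetical sketch: yield (line_number, [sentence]) for every
    # non-empty line, treating each line as a single sentence.
    with open(path, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            line = line.strip()
            if line:
                yield idx, [line]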
Example #10
    def drugprot_document_to_tokenlists(
            self, pmid: str, title_sentences: List[Sentence],
            abstract_sentences: List[Sentence], abstract_offset: int,
            entities: Dict[str, Tuple[str, int, int, str]],
            relations: Set[Tuple[str, str, str]]) -> List[conllu.TokenList]:
        tokenlists: List[conllu.TokenList] = []
        sentence_id = 1
        for offset, sents in [(0, title_sentences),
                              (abstract_offset, abstract_sentences)]:
            for sent in sents:
                sent_char_start = sent.start_pos + offset
                sent_char_end = sent.end_pos + offset

                entities_in_sent = set()
                for entity_id, (_, char_start, char_end,
                                _) in entities.items():
                    if sent_char_start <= char_start and char_end <= sent_char_end:
                        entities_in_sent.add(entity_id)

                entity_char_spans = [(entities[entity_id][1],
                                      entities[entity_id][2])
                                     for entity_id in entities_in_sent]

                token_offsets = [(sent.start_pos + token.start_pos + offset,
                                  sent.start_pos + token.end_pos + offset)
                                 for token in sent.tokens]
                entity_token_spans = self.char_spans_to_token_spans(
                    entity_char_spans, token_offsets)

                tags_1 = ["O"] * len(sent)
                tags_2 = ["O"] * len(sent)
                entity_id_to_token_idx = {}
                prev_entity_span = None
                for entity_id, entity_span in sorted(zip(
                        entities_in_sent, entity_token_spans),
                                                     key=lambda x: x[1][0]):
                    entity_id_to_token_idx[entity_id] = entity_span

                    overlap = self.has_overlap(prev_entity_span, entity_span)

                    tags = tags_2 if overlap else tags_1

                    tag = entities[entity_id][0]
                    token_start, token_end = entity_span
                    for i in range(token_start, token_end):
                        if i == token_start:
                            prefix = "B-"
                        else:
                            prefix = "I-"

                        tags[i] = prefix + tag

                    prev_entity_span = entity_span

                token_dicts = []
                for i, (token, tag_1,
                        tag_2) in enumerate(zip(sent, tags_1, tags_2)):

                    # hardcoded mapping TODO: perhaps find nicer solution
                    tag_1 = tag_1.replace("GENE-N", "GENE")
                    tag_1 = tag_1.replace("GENE-Y", "GENE")
                    tag_2 = tag_2.replace("GENE-N", "GENE")
                    tag_2 = tag_2.replace("GENE-Y", "GENE")

                    token_dicts.append({
                        "id": str(i + 1),
                        "form": token.text,
                        "ner": tag_1,
                        "ner-2": tag_2
                    })

                relations_in_sent = []
                for relation, ent1, ent2 in [
                        r for r in relations
                        if {r[1], r[2]} <= entities_in_sent
                ]:
                    subj_start = entity_id_to_token_idx[ent1][0]
                    subj_end = entity_id_to_token_idx[ent1][1]
                    obj_start = entity_id_to_token_idx[ent2][0]
                    obj_end = entity_id_to_token_idx[ent2][1]
                    relations_in_sent.append(
                        (subj_start, subj_end, obj_start, obj_end, relation))

                metadata = {
                    "text": sent.to_original_text(),
                    "doc_id": pmid,
                    "sentence_id": str(sentence_id),
                    "relations": "|".join([
                        ";".join([
                            str(subj_start + 1),
                            str(subj_end),
                            str(obj_start + 1),
                            str(obj_end),
                            relation,
                        ]) for subj_start, subj_end, obj_start, obj_end,
                        relation in relations_in_sent
                    ]),
                }

                tokenlists.append(
                    conllu.TokenList(tokens=token_dicts, metadata=metadata))

                sentence_id += 1

        return tokenlists
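
This variant decides between the two NER tag rows with a has_overlap helper that is not part of the snippet. A minimal sketch of the assumed behaviour, treating entity spans as half-open (start, end) token ranges:

    @staticmethod
    def has_overlap(span_a, span_b):
        # Hypothetical sketch: True if the two half-open token spans share
        # at least one index; None stands for "no previous span".
        if span_a is None or span_b is None:
            return False
        return span_a[0] < span_b[1] and span_b[0] < span_a[1]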