Ejemplo n.º 1
0
def postprocess_brat_conllu(corpus, save_to=None):
    """Moves embedded brat tags from service tokens to the MISC field of the
    real tokens they enclose.

    Tokens whose FORM is ``None`` act as markers: one with ``TAGS_BRAT[0]``
    in its MISC field opens a tag, one with ``TAGS_BRAT[1]`` closes it. Both
    kinds of markers are dropped from the sentence. Every token with a FORM
    that is met while a tag is open gets ``TAG_BRAT + <tag>`` set to ``'Yes'``
    in its MISC field. Other tokens without a FORM are kept as they are. The
    ``text`` item is removed from the metadata of every sentence.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored. If ``None``
                    (default), the function returns the result as a generator
                    of Parsed CoNLL-U data
    :return: ``None`` if *save_to* is given; otherwise, the processed corpus
             passed through ``Conllu.fix()``
    """
    start_key, end_key = TAGS_BRAT[0], TAGS_BRAT[1]

    def process():
        # A string is treated as a path to a saved corpus; anything else is
        # expected to be already parsed (sentence, metadata) pairs.
        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            sent_ = []
            tags = []  # tags that are open at the current position
            for token in sent:
                misc = token['MISC']

                if token['FORM'] is not None:
                    # A real token: label it with every tag that is open.
                    for tag in tags:
                        misc[TAG_BRAT + tag] = 'Yes'
                    sent_.append(token)

                elif start_key in misc:
                    # Start marker. Compare the tag itself (not the name of
                    # the MISC key) with the open tags, so that a tag opened
                    # twice is stored once and is closed by a single end
                    # marker.
                    tag = misc[start_key]
                    if tag not in tags:
                        tags.append(tag)

                elif end_key in misc:
                    # End marker. Closing a tag that is not open is ignored.
                    try:
                        tags.remove(misc[end_key])
                    except ValueError:
                        pass
                    # The marker is dropped, so its spacing info has to be
                    # carried over to the last token that was kept.
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']

                else:
                    # An empty token that is not a brat marker: keep as is.
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)
Ejemplo n.º 2
0
def postprocess_brat_conllu(corpus, save_to=None):
    """Does postprocessing for the *corpus* with embedded brat annotations
    which already was preliminarily prepared by Toxine's TextPreprocessor.

    Tokens whose FORM is ``None`` act as markers: one with ``BRAT_START_TAG``
    in its MISC field opens a text-bound (``T``) annotation, one with
    ``BRAT_END_TAG`` closes all open annotations of the given id. Both kinds
    of markers are dropped from the sentence. Every token with a FORM that is
    met while annotations are open gets, for each of them:

    * ``BRAT_TAG + <T-id>`` set to the entity name;
    * ``BRAT_TAG + <id>`` for every annotation attached to the entity
      (ids starting with ``R``, ``*``, ``E``, ``A``, ``N`` or ``#``) set to
      the entity id followed by the annotation's fields, joined with ``SEP3``.

    Other tokens without a FORM are kept as they are. The ``text`` metadata
    item is removed from every sentence; brat markup matched by ``RE_BRAT``
    is stripped from ``par_text`` if that item is present.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format.
    :param save_to: a path where result will be stored. If ``None`` (default),
                    the function returns the result as a generator of Parsed
                    CoNLL-U data.
    :return: ``None`` if *save_to* is given; otherwise, the processed corpus
             passed through ``Conllu.fix()``.
    :raises AssertionError: if a start marker repeats an annotation that is
                            already open, or an entity id doesn't start with
                            ``T``.
    :raises ValueError: if an attached annotation has an unknown type.
    """
    def unmask(text):
        # Undo the escaping of free-text fields (titles, notes). The order
        # matters: '__' must become a space before '\_' becomes '_'.
        return text.replace(r'\{}'.format(BRAT_TEXT_BOUND_START_MARK[-1]),
                            BRAT_TEXT_BOUND_START_MARK[-1]) \
                   .replace(r'\{}'.format(SEP1), SEP1) \
                   .replace('__', ' ').replace(r'\_', '_')

    def annotate(misc, ann):
        # Write one open annotation (the entity and everything attached to
        # it) into the MISC field of a token.
        parts = ann.split(SEP1 + SEP1)
        entity, attached = parts[0], parts[1:]
        tid, entity_name = entity.split(SEP2)
        assert tid.startswith('T'), \
            'ERROR: Unrecognized annotation {}'.format(parts)
        misc[BRAT_TAG + tid] = entity_name

        for item in attached:
            if item.startswith('R'):
                item_id, name, role = item.split(SEP2)
                val = tid + SEP3 + name + SEP3 + role
            elif item.startswith('*'):
                item_id, name = item.split(SEP2)
                val = tid + SEP3 + name
            elif item.startswith(('E', 'A')):
                # The third field (a role for 'E', a value for 'A') may be
                # empty; then it is left out of the result.
                item_id, name, extra = item.split(SEP2)
                val = tid + SEP3 + name
                if extra:
                    val += SEP3 + extra
            elif item.startswith('N'):
                # The title is free text and may contain SEP2 itself, so
                # only the first three separators are significant.
                item_id, service_name, service_id, title = \
                    item.split(SEP2, maxsplit=3)
                val = tid + SEP3 + service_name \
                    + SEP3 + service_id + SEP3 + unmask(title)
            elif item.startswith('#'):
                item_id, note = item.split(SEP2, maxsplit=1)
                val = tid + SEP3 + unmask(note)
            else:
                raise ValueError('ERROR: Unknown annotation '
                                 'type')
            misc[BRAT_TAG + item_id] = val

    def process():
        # A string is treated as a path to a saved corpus; anything else is
        # expected to be already parsed (sentence, metadata) pairs.
        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            if 'par_text' in meta:
                meta['par_text'] = RE_BRAT.sub('', meta['par_text'])
            sent_ = []
            anns = []  # annotations that are open at the current position
            for token in sent:
                misc = token['MISC']

                if token['FORM'] is not None:
                    # A real token: label it with every open annotation.
                    for ann in anns:
                        annotate(misc, ann)
                    sent_.append(token)

                elif BRAT_START_TAG in misc:
                    # Start marker. Check the annotation itself (not the
                    # name of the MISC key) against the open annotations.
                    ann = misc[BRAT_START_TAG]
                    assert ann not in anns, \
                        'ERROR: Annotation "{}" is already open'.format(ann)
                    assert ann[0] == 'T', \
                        'ERROR: Invalid annotation type "{}"'.format(ann)
                    anns.append(ann)

                elif BRAT_END_TAG in misc:
                    # End marker: close every open annotation with this id.
                    if anns:
                        prefix = misc[BRAT_END_TAG] + SEP2
                        anns = [x for x in anns if not x.startswith(prefix)]
                    # The marker is dropped, so its spacing info has to be
                    # carried over to the last token that was kept.
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']

                else:
                    # An empty token that is not a brat marker: keep as is.
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)