Example #1
0
def tokenize(num_links, isdialog=True, norm_punct=False):
    """Tokenize chunk files and store every one of them in CoNLL-U format.

    The list of chunks is taken from ``get_file_list(CHUNKS_DIR, num_links)``.
    The target path of a chunk is its own path with ``CHUNKS_DIR`` replaced
    by ``CONLL_DIR``. A chunk whose target file already exists is not
    processed again but is still counted; the loop stops after
    ``min(CONLL_FOR_SOURCE, <number of chunks>)`` counted chunks.

    :param num_links: passed as is to ``get_file_list()``.
    :param isdialog: if ``True``, every non-empty line of a chunk must consist
                     of a speaker and a sentence separated by a TAB. An empty
                     speaker means "the same speaker as in the previous
                     line". The ordinal number of the speaker is stored to
                     the ``speaker`` metadata of sentences that start a
                     paragraph.
    :param norm_punct: passed as is to ``TextPreprocessor.do_all()``.
    """
    preprocessor = TextPreprocessor()
    chunk_paths = get_file_list(CHUNKS_DIR, num_links)
    max_conll = min(CONLL_FOR_SOURCE, len(chunk_paths))
    chunk_no = 1
    texts_processed = 0
    for chunk_path in chunk_paths:
        conll_path = chunk_path.replace(CHUNKS_DIR, CONLL_DIR)
        assert conll_path != chunk_path, 'ERROR: invalid path to chunk file'
        if not os.path.isfile(conll_path):
            with open(chunk_path, 'rt', encoding='utf-8') as f_in:
                text = norm_text(f_in.read())
            if not text:
                # NB: an empty chunk doesn't advance ``chunk_no``
                continue
            pars = text.split('\n')

            if isdialog:
                rows = [line.split('\t') for line in pars if line]
                speakers, pars = [], []
                curr_speaker = None
                for speaker, sentence in rows:
                    # an empty speaker inherits the previous one
                    if speaker:
                        curr_speaker = speaker
                    speakers.append(curr_speaker)
                    pars.append(sentence)
                # speaker name -> its ordinal number (as str), numbered in
                # the order of first appearance
                speaker_list = {
                    name: str(no)
                    for no, name in enumerate(OrderedDict.fromkeys(speakers),
                                              start=1)
                }

            doc_id = fn_to_id(conll_path)
            preprocessor.new_doc(doc_id=doc_id, metadata=[])
            preprocessor.new_pars(pars, doc_id=doc_id)
            preprocessor.do_all(tag_phone=False,
                                tag_date=False,
                                norm_punct=norm_punct,
                                silent=True)
            conll = list(preprocessor.save(doc_id=doc_id))
            preprocessor.remove_doc(doc_id)

            if isdialog:
                speaker_iter = iter(speakers)
                for _, meta in conll:
                    # sentences without any alphanumeric character don't
                    # consume a speaker
                    if any(ch.isalnum() for ch in meta['text']) \
                   and 'newpar id' in meta:
                        meta['speaker'] = speaker_list[next(speaker_iter)]

            Conllu.save(conll, conll_path, log_file=None)
            print('\r{} (of {})'.format(chunk_no, max_conll), end='')
            texts_processed += 1
        chunk_no += 1
        if chunk_no > max_conll:
            break
    if texts_processed:
        # finish the progress line
        print()
Example #2
0
    def split_corpus(corpus,
                     split=[.8, .1, .1],
                     save_split_to=None,
                     seed=None,
                     silent=False):
        """Shuffle a *corpus* and cut it into parts of the requested sizes.

        :param corpus: a name of file in CoNLL-U format or list/iterator of
                       sentences in Parsed CoNLL-U
        :param split: list of sizes of the necessary *corpus* parts. If at
                      least one value is of float type, all values are
                      treated as proportions of the *corpus* (their sum must
                      not exceed 1); otherwise, they are lengths of the new
                      corpora in sentences (their sum must not exceed the
                      length of the *corpus*). If the rounded proportional
                      sizes differ from the *corpus* length by exactly one
                      sentence, the last part is adjusted by that sentence
        :param save_split_to: list of file names to save the parts to. Can be
                              `None` (default; don't save parts to files) or
                              its length must be equal to the length of
                              *split*
        :param seed: the value passed to ``random.seed()`` before shuffling
        :param silent: if True, loading is done with ``log_file=None``
        :return: a list of new corpora
        """
        assert save_split_to is None or len(save_split_to) == len(split), \
               'ERROR: lengths of split and save_split_to must be equal'
        isfloat = any(isinstance(size, float) for size in split)
        if isfloat:
            assert sum(split) <= 1, \
                   "ERROR: sum of split can't be greater that 1"

        sentences = list(
            Conllu.load(corpus, log_file=None if silent else LOG_FILE))
        total = len(sentences)

        if isfloat:
            # proportions -> absolute lengths
            sizes = [round(total * share) for share in split]
            diff = total - sum(sizes)
            if abs(diff) == 1:
                sizes[-1] += diff
        else:
            sizes = split
        assert sum(sizes) <= total, \
               "ERROR: sum of split can't be greater that corpus length"

        random.seed(seed)
        random.shuffle(sentences)

        parts = []
        start = 0
        for part_no, size in enumerate(sizes):
            part = sentences[start:start + size]
            start += size
            if save_split_to:
                Conllu.save(part, save_split_to[part_no])
            parts.append(part)
        return parts
Example #3
0
def make_ne_tags(corpus, save_to=None, keep_originals=True):
    """Convert MISC:bratT entities of the *corpus* to MISC:NE entities.

    For every token, the value of the first MISC feature whose name starts
    with ``BRAT_TAG + 'T'`` is stored to the ``TAG_NE`` feature. If further
    such features of the same token have a different value, a warning is
    issued and those values are ignored (only one MISC:NE entity per token
    is allowed).

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format.
    :param save_to: a path where result will be stored. If ``None`` (default),
                    the function returns the result as a generator of Parsed
                    CoNLL-U data.
    :param keep_originals: If ``True`` (default), the original MISC:bratT
                           features stay intact. Otherwise, all of them are
                           removed from the tokens that got MISC:NE.
    """
    brat_prefix = BRAT_TAG + 'T'

    def process():
        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent_no, (sent, meta) in enumerate(sentences):
            for token in sent:
                misc = token['MISC']
                entity = None
                brat_feats = []
                for feat, val in misc.items():
                    if not feat.startswith(brat_prefix):
                        continue
                    if entity and entity != val:
                        warnings.warn(
                            'Multiple brat entities in sent '
                            '{} (sent_id = {}), token {} ("{}"):'.format(
                                sent_no, meta['sent_id'], token['ID'],
                                token['FORM'])
                          + ': Entities {} and {}. Ignore the last one'
                                .format(entity, val))
                    else:
                        entity = val
                    brat_feats.append(feat)
                if entity:
                    if not keep_originals:
                        # removal is postponed till the end of iteration
                        # over ``misc``
                        for feat in brat_feats:
                            misc.pop(feat)
                    misc[TAG_NE] = entity
            yield sent, meta

    result = process()
    if not save_to:
        return result
    Conllu.save(result, save_to, fix=False)
Example #4
0
 def process():
     """Yield sentences of the enclosing *corpus* with MISC:NE set from the
     MISC features whose names start with ``TAG``.

     The value of the first such feature of a token wins; for every further
     feature with a different value a warning is issued. Unless
     ``keep_originals`` is set, all the matched features are removed from
     the tokens that received ``TAG_NE``.
     """
     source = Conllu.load(corpus) if isinstance(corpus, str) else corpus
     for sent_idx, (sent, meta) in enumerate(source):
         for token in sent:
             misc = token['MISC']
             brat_items = [(feat, val) for feat, val in misc.items()
                               if feat.startswith(TAG)]
             chosen = None
             for _, val in brat_items:
                 if chosen and chosen != val:
                     warnings.warn(
                         'Multiple brat entities in sent '
                         '{} (sent_id = {}), token {} ("{}"):'.format(
                             sent_idx, meta['sent_id'], token['ID'],
                             token['FORM'])
                       + ': Entities {} and {}. Ignore the last one'
                             .format(chosen, val))
                     continue
                 chosen = val
             if not chosen:
                 continue
             if not keep_originals:
                 for feat, _ in brat_items:
                     misc.pop(feat)
             misc[TAG_NE] = chosen
         yield sent, meta
Example #5
0
 def process():
     """Yield sentences of the enclosing *corpus* where the brat flags of
     every token are replaced by one MISC:NE feature.

     A brat flag is a MISC feature with the value 'Yes' whose name starts
     with ``TAG_BRAT``. The first flag of a token gives the entity name (the
     flag name without the ``TAG_BRAT`` prefix); for every further flag a
     warning is issued. All the flags found are removed from the token.
     """
     source = Conllu.load(corpus) if isinstance(corpus, str) else corpus
     for sent_idx, (sent, meta) in enumerate(source):
         prefix_len = len(TAG_BRAT)
         for token in sent:
             misc = token['MISC']
             primary = None
             extras = []
             for feat, val in misc.items():
                 if not (feat.startswith(TAG_BRAT) and val == 'Yes'):
                     continue
                 if not primary:
                     primary = feat
                     continue
                 warnings.warn(
                     'Multiple brat entities in sent '
                     '{} (sent_id = {}), token {} ("{}"):'.format(
                         sent_idx, meta['sent_id'], token['ID'],
                         token['FORM'])
                   + ': Entities {} and {}. Ignore the last one'
                         .format(primary, feat))
                 extras.append(feat)
             if primary:
                 misc.pop(primary)
                 for feat in extras:
                     misc.pop(feat)
                 misc[TAG_NE] = primary[prefix_len:]
         yield sent, meta
Example #6
0
 def process():
     """Yield sentences of the enclosing *corpus* with brat marker tokens
     converted to MISC flags of the tokens they enclose.

     Tokens with ``FORM`` equal to ``None`` are service ones: if their MISC
     contains ``TAGS_BRAT[0]``, the tag becomes open; if it contains
     ``TAGS_BRAT[1]``, the tag becomes closed. Such tokens are not copied to
     the result. Every other token receives the ``TAG_BRAT + <tag>`` = 'Yes'
     MISC feature for each tag open at that moment. The 'text' metadata is
     dropped.
     """
     for sent, meta in Conllu.load(corpus) \
                           if isinstance(corpus, str) else \
                       corpus:
         meta.pop('text', None)
         sent_ = []
         tags = []
         for token in sent:
             misc = token['MISC']
             if token['FORM'] is not None:
                 # a regular token: mark it with all currently open tags
                 for tag in tags:
                     misc[TAG_BRAT + tag] = 'Yes'
                 sent_.append(token)
             elif TAGS_BRAT[0] in misc:
                 # NOTE(review): the check compares the feature *name*, not
                 # its value, with the open tags - confirm that is intended
                 if TAGS_BRAT[0] not in tags:
                     tags.append(misc[TAGS_BRAT[0]])
             elif TAGS_BRAT[1] in misc:
                 # closing a tag that was never opened is silently ignored
                 closed_tag = misc[TAGS_BRAT[1]]
                 if closed_tag in tags:
                     tags.remove(closed_tag)
                 # the removed marker passes its SpaceAfter to the
                 # preceding token
                 if sent_ and 'SpaceAfter' in misc:
                     sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']
             else:
                 sent_.append(token)
         yield sent_, meta
Example #7
0
 def save_conllu(*args, **kwargs):
     """Wrapper for ``Conllu.save()``.

     The extra keyword argument *silent* is not passed further: if it's
     true, ``log_file`` is forced to ``None``; otherwise, ``log_file``
     defaults to ``LOG_FILE`` unless the caller specified it.
     """
     if kwargs.pop('silent', None):
         kwargs['log_file'] = None
     else:
         kwargs.setdefault('log_file', LOG_FILE)
     return Conllu.save(*args, **kwargs)
Example #8
0
def postprocess_brat_conllu(corpus, save_to=None):
    """Convert embedded brat marker tokens of the *corpus* to MISC flags.

    Tokens with ``FORM`` equal to ``None`` are treated as service ones: a
    token with ``TAGS_BRAT[0]`` in MISC opens a tag, a token with
    ``TAGS_BRAT[1]`` closes it. Such tokens are removed from sentences, and
    every regular token receives the MISC feature ``TAG_BRAT + <tag>`` with
    the value 'Yes' for each tag open at its position. The 'text' metadata
    of sentences is dropped.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored (with
                    ``fix=True``). If ``None`` (default), the function
                    returns the result of ``Conllu.fix()`` applied to the
                    processed data
    """
    def process():
        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            sent_ = []
            tags = []
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is not None:
                    # a regular token: mark it with all currently open tags
                    for tag in tags:
                        misc[TAG_BRAT + tag] = 'Yes'
                    sent_.append(token)
                elif TAGS_BRAT[0] in misc:
                    # NOTE(review): the check compares the feature *name*,
                    # not its value, with the open tags - confirm that is
                    # intended
                    if TAGS_BRAT[0] not in tags:
                        tags.append(misc[TAGS_BRAT[0]])
                elif TAGS_BRAT[1] in misc:
                    try:
                        tags.remove(misc[TAGS_BRAT[1]])
                    except ValueError:
                        # the tag was never opened: nothing to close
                        pass
                    # the removed marker passes its SpaceAfter to the
                    # preceding token
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']
                else:
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)
Example #9
0
def make_ne_tags(corpus, save_to=None):
    """Replace brat flags of the *corpus* tokens with MISC:NE entities.

    A brat flag is a MISC feature with the value 'Yes' whose name starts
    with ``TAG_BRAT``. The name of the first flag of a token (without the
    ``TAG_BRAT`` prefix) becomes the value of the ``TAG_NE`` feature. If
    several flags are linked to one token, a warning is issued for every
    extra flag and only the first one is used. All the flags found are
    removed from the token.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored to. If ``None``
                    (default), the function returns the result as a generator
                    of Parsed CoNLL-U data
    """
    def process():
        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent_no, (sent, meta) in enumerate(sentences):
            prefix_len = len(TAG_BRAT)
            for token in sent:
                misc = token['MISC']
                first_flag = None
                other_flags = []
                for feat, val in misc.items():
                    is_flag = feat.startswith(TAG_BRAT) and val == 'Yes'
                    if not is_flag:
                        continue
                    if first_flag:
                        warnings.warn(
                            'Multiple brat entities in sent '
                            '{} (sent_id = {}), token {} ("{}"):'.format(
                                sent_no, meta['sent_id'], token['ID'],
                                token['FORM'])
                          + ': Entities {} and {}. Ignore the last one'
                                .format(first_flag, feat))
                        other_flags.append(feat)
                    else:
                        first_flag = feat
                if first_flag:
                    for flag in [first_flag] + other_flags:
                        misc.pop(flag)
                    misc[TAG_NE] = first_flag[prefix_len:]
            yield sent, meta

    result = process()
    if not save_to:
        return result
    Conllu.save(result, save_to, fix=False)
Example #10
0
def get_conllu_fields(corpus=None,
                      fields=None,
                      word2idx=None,
                      unk_token=None,
                      with_empty=False,
                      silent=False):
    """Split corpus in CoNLL-U format to separate lists of tokens and tags.

    Tokens with empty FORM and multiword tokens (with a hyphen in ID) are
    always skipped.

    :param corpus: the corpus in CoNLL-U or Parsed CoNLL-U format: a path to
        a file, a callable that returns the sentences, or an iterable of
        sentences. A sentence may be a list of tokens or a tuple whose first
        element is such a list.
    :param fields: list of CoNLL-U fields but 'FORM' to extract. Every item
        has a form ``<FIELD>``, ``<FIELD>:<key>`` or
        ``<FIELD>:<key>:<default>``; the latter two address a key inside a
        dict-like field.
    :type fields: list
    :param word2idx: Word to Index dict. If not None, all words not from dict
        will be skipped or replaced with *unk_token*
    :type word2idx: dict({word: int})
    :param unk_token: replacement for tokens that are not present in
        *word2idx*.
    :type unk_token: str
    :param with_empty: don't skip empty sentences: a sentence without any
        suitable token is represented by an empty tuple in every returned
        list.
    :param silent: suppress output.
    :return: splitted corpus: a list of sentences (tuples of forms) if
        *fields* is empty; otherwise, a tuple of such lists, one for forms
        and one for each of the *fields*.
    :rtype: tuple(list([list([str|OrderedDict])]))
    """
    if fields is None:
        fields = []

    if isinstance(corpus, str):
        corpus = Conllu.load(corpus, **({'log_file': None} if silent else {}))
    elif callable(corpus):
        corpus = corpus()

    # the specs don't depend on a token, so they are parsed only once
    specs = [field.split(':') for field in fields]
    n_columns = len(specs) + 1
    sents = tuple([] for _ in range(n_columns))

    def is_known(form):
        return not word2idx or form in word2idx

    def get_value(token, spec):
        if len(spec) >= 3:
            return token[spec[0]].get(spec[1], spec[2])
        if len(spec) == 2:
            return token[spec[0]].get(spec[1])
        return token[spec[0]]

    for sent in corpus:
        if isinstance(sent, tuple):
            sent = sent[0]
        rows = [
            (token['FORM'] if is_known(token['FORM']) else unk_token,
             *(get_value(token, spec) for spec in specs))
            for token in sent
            if token['FORM'] and '-' not in token['ID']
           and (is_known(token['FORM']) or unk_token)
        ]
        if rows:
            # transpose: rows of tokens -> one tuple per column
            columns = zip(*rows)
        elif with_empty:
            # zip() of nothing yields nothing, so the placeholders for an
            # empty sentence have to be added explicitly
            columns = [()] * n_columns
        else:
            continue
        for column_sents, column in zip(sents, columns):
            column_sents.append(column)

    return sents if fields else sents[0]
Example #11
0
def embed_conllu_fields(corpus,
                        fields,
                        values,
                        empties=None,
                        nones=None,
                        silent=False):
    """Write *values* to the *fields* of the *corpus* tokens.

    The function is a generator: nothing is done until the iteration
    starts. Tokens are changed in place, and every processed sentence is
    yielded in the same form as it was received.

    :param corpus: a path to a CoNLL-U file, a callable that returns the
        sentences, or an iterable of sentences. A sentence may be a list of
        tokens or a tuple whose first element is such a list.
    :param fields: a field spec (str) or a list of them. A spec has a form
        ``<FIELD>``, ``<FIELD>:<key>`` or ``<FIELD>:<key>:<default>``. With
        a non-empty ``<key>``, the value is stored under that key of the
        dict-like field; a value equal to ``<default>`` removes the key
        instead. With an empty ``<key>``, the value is stored to the field
        itself; a value equal to ``<default>`` sets the field to ``None``.
    :type fields: str|list
    :param values: per-sentence lists of per-token values. If *fields* is a
        list, a token value must be a sequence with one item per field.
        ``None`` values are skipped. Note that *values* is modified in
        place if *empties* or *nones* are specified.
    :param empties: indexes at which empty lists are inserted to *values*.
    :param nones: pairs ``(<sentence idx>, <token idx>)`` at which ``None``
        is inserted to *values*.
    :param silent: suppress output.
    """
    if isinstance(corpus, str):
        corpus = Conllu.load(corpus, **({'log_file': None} if silent else {}))
    elif callable(corpus):
        corpus = corpus()

    if empties:
        for i in empties:
            values.insert(i, [])
    if nones:
        for i, j in nones:
            values[i].insert(j, None)

    is_single = isinstance(fields, str)
    # the specs don't depend on a token, so they are parsed only once
    specs = [field.split(':') for field in ([fields] if is_single else fields)]

    def embed(token, spec, val_):
        name = spec[0]
        if len(spec) < 2:
            token[name] = val_
            return
        key = spec[1]
        is_default = len(spec) >= 3 and val_ == spec[2]
        if key:
            if is_default:
                token[name].pop(key, None)
            else:
                token[name][key] = val_
        else:
            token[name] = None if is_default else val_

    for sentence, vals in zip(corpus, values):
        sent = sentence[0] if isinstance(sentence, tuple) else sentence
        for token, val in zip(sent, vals):
            if val is None:
                # a placeholder (e.g. inserted via *nones*): the token keeps
                # all its fields, however many of them were requested
                continue
            for spec, val_ in zip(specs, [val] if is_single else val):
                if val_ is not None:
                    embed(token, spec, val_)
        yield sentence
Example #12
0
import sys

from _utils_add import _path, _sub_idx, DATA_DIR_NAME


# Exactly two command line arguments are required. An explicit check is used
# instead of ``assert`` because assertions are removed when Python runs with
# the -O option.
if len(sys.argv) != 3:
    sys.exit('ERROR: Syntax is: {} <domain> <source>'.format(sys.argv[0]))
domain, source = sys.argv[1:]


def setdir_(*suffixes):
    """Return the path of a directory inside the data directory, creating
    the directory if it doesn't exist yet.

    The path is built by joining the first ``_sub_idx`` components of
    ``_path``, ``DATA_DIR_NAME`` and the given *suffixes*.

    :param suffixes: path components to append after the data directory.
    :return: the resulting path.
    """
    dir_ = os.path.join(*_path[:_sub_idx], DATA_DIR_NAME, *suffixes)
    # exist_ok=True avoids the gap between checking for the directory and
    # creating it
    os.makedirs(dir_, exist_ok=True)
    return dir_


# Source CoNLL-U files, their brat counterparts and the output location
ORIG_DIR = setdir_('conll')
BRAT_DIR = setdir_('brat', 'conll')
OUT_DIR = setdir_('..', 'corpus', 'ner', 'conll')

# Merge every original file of the given domain/source with its brat
# counterpart and save the result with the '.conllu' extension
pattern = ORIG_DIR + '/{}/{}/*.txt'.format(domain, source)
for fn in glob.glob(pattern, recursive=True):
    print(fn)
    brat_fn = fn.replace(ORIG_DIR, BRAT_DIR)
    # '[:-4]' cuts the '.txt' extension off
    out_fn = fn.replace(ORIG_DIR, OUT_DIR)[:-4] + '.conllu'
    out_dir = os.path.dirname(out_fn)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    merged = Conllu.merge(fn, brat_fn, ignore_new_meta=True)
    Conllu.save(merged, out_fn)
Example #13
0
#-*- encoding: utf-8 -*-

from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

# Directory with the source files (a machine-specific absolute path)
CONLL_DIR = r'C:\prj-git\_mine\ru_corner\_data\conll\newswire'
# NOTE(review): not used in the visible part of the script
EDITED_DIR = '_0'

# The substring by which the 'par_text' and 'text' metadata are split below
TOKEN = '%'

# NOTE(review): the pattern contains no '**', so recursive=True has no effect
for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))

    # For the current paragraph: one flag per TOKEN occurrence telling
    # whether the text right before that occurrence ends with a space.
    # The list is kept for the following sentences until the next sentence
    # that has 'par_text' in its metadata
    end_spaces = []
    for sentence in corpus:
        sent, meta = sentence
        if 'par_text' in meta:
            parts = meta['par_text'].split(TOKEN)
            # the last part follows the last TOKEN, so it is left out
            end_spaces = [x[-1:] == ' ' for x in parts[:-1]]

        # nothing to do if no TOKEN is known for the paragraph
        if not end_spaces:
            continue

        if 'text' not in meta:
            continue

        # NOTE(review): the code that uses *parts* is outside of the visible
        # part of the script
        parts = meta['text'].split(TOKEN)
Example #14
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and make its articles look like the output of
some speech recognition software. Save the result as CoNLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

# get Wikipedia (an already present copy is not overwritten)
download_wikipedia(overwrite=False)
# tokenize the articles and save them adjusted for speech
articles = TokenizedWikipedia().articles()
Conllu.save(articles, 'wiki_speech.conllu', fix=True,
            adjust_for_speech=True, log_file=None)
Example #15
0
def conllu_to_brat(corpus, txt_fn, ann_fn=None, spaces=3, short_spaces=1):
    """Converts *corpus* in CoNLL-U format to txt and ann files used by brat.

    Every sentence is written to *txt_fn* as one line of token forms.
    Sentences are separated by a line break; one more line break is added
    before a sentence that has 'newpar id' in its metadata. Tokens with a
    dot in the ID field are skipped.

    :param corpus: the corpus to convert. It is passed as is to
                   ``Conllu.load()``.
    :param txt_fn: a path to the brat txt file.
    :param ann_fn: a path to the brat ann file. If ``None`` (default), an
                   extension of *txt_fn* file will be changed to '.ann'.
    :param spaces: number of spaces to use as word delimiter.
    :param short_spaces: number of spaces to use as word delimiter
                         inside multi-tokens (when ID field has a hyphen
                         inside).

    Note, that we create empty `.ann` files. Use this function to get initial
    data for annotation.
    """
    fn, fe = os.path.splitext(txt_fn)
    if fe != '.txt':
        print('WARNING: Extension of txt_fn must be ".txt"', file=sys.stderr)
    if ann_fn is None:
        ann_fn = fn + '.ann'
    _, fe = os.path.splitext(ann_fn)
    if fe != '.ann':
        print('WARNING: Extension of ann_fn must be ".ann"', file=sys.stderr)

    # the ann file is only opened for writing, i.e. it is created empty
    with io.open(txt_fn, 'wt', encoding='utf-8', newline='\n') as out_f, \
         open(ann_fn, 'w'):
        for sent_no, sent in enumerate(
                Conllu.load(corpus, fix=False, log_file=None)):
            if sent_no:
                # end the line of the previous sentence
                print(file=out_f)
                if 'newpar id' in sent[1]:
                    print(file=out_f)
            # is_next: at least one token of the sentence is already written
            is_next = None
            # short_start, short_end: bounds of the current multi-token;
            # short_start is reset as soon as the first token after the
            # range line is reached
            short_start = short_end = None
            for tok in sent[0]:
                id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
                if '.' in id_:
                    continue
                if '-' in id_:
                    # a multi-token range line: remember the bounds, write
                    # nothing
                    short_start, short_end, *_ = id_.split('-')
                    continue
                if is_next:
                    # the short delimiter is used only between the tokens of
                    # one multi-token (i.e. not before its first token)
                    print(' ' * (short_spaces
                                 if short_end and not short_start else spaces),
                          end='',
                          file=out_f)
                has_entity = False
                for feat, value in misc.items():
                    if feat.startswith('Entity'):
                        # only one 'Entity*' feature per token is allowed
                        assert not has_entity
                        # workaround because brat can't display emojies
                        # correctly
                        form = '[emo]' if feat == 'EntityEmoji' else value
                        has_entity = True
                is_next = True
                if short_end:
                    short_start = None
                if id_ == short_end:
                    # the last token of the multi-token is reached
                    short_end = None
                print(form, end='', file=out_f)
Example #16
0
def extract_conllu_fields(corpus,
                          fields=None,
                          word2idx=None,
                          unk_token=None,
                          with_empty=False,
                          return_nones=False,
                          silent=False):
    """Split corpus in CoNLL-U format to separate lists of tokens and tags.

    Tokens with empty FORM and multiword tokens (with a hyphen in ID) are
    always filtered out; sentences without any suitable token are not
    included in the result.

    :param corpus: the corpus in CoNLL-U or Parsed CoNLL-U format: a path to
        a file, a callable that returns the sentences, or an iterable of
        sentences.
    :param fields: list of CoNLL-U fields but 'FORM' to extract. Every item
        has a form ``<FIELD>``, ``<FIELD>:<key>`` or
        ``<FIELD>:<key>:<default>``. With an empty ``<key>``, ``<default>``
        replaces an empty value of the field itself.
    :type fields: list|str
    :param word2idx: Word to Index dict. If not None, all words not from dict
        will be skipped or replaced with *unk_token*
    :type word2idx: dict({word: int})
    :param unk_token: replacement for tokens that are not present in
        *word2idx*.
    :type unk_token: str
    :param with_empty: not used by the function.
    :param silent: suppress output.
    :param return_nones: return indexes of filtered sentences and tokens
    :return: splitted corpus
    :rtype: tuple(list([list([str|OrderedDict])])), [ list([<empty sent idx]),
            list([tuple(<empty token sent idx>, <empty token idx>)]) ]
    """
    if fields is None:
        fields = []
    elif isinstance(fields, str):
        fields = [fields]

    if isinstance(corpus, str):
        load_kwargs = {'log_file': None} if silent else {}
        corpus = Conllu.load(corpus, **load_kwargs)
    elif callable(corpus):
        corpus = corpus()

    specs = [field.split(':') for field in fields]
    sents = tuple([] for _ in range(len(fields) + 1))
    empties, nones = [], []

    def is_known(form):
        return not word2idx or form in word2idx

    def is_kept(token):
        form = token['FORM']
        return form and '-' not in token['ID'] \
           and (is_known(form) or unk_token)

    def get_value(token, spec):
        if len(spec) >= 3 and spec[1]:
            return token[spec[0]].get(spec[1], spec[2])
        if len(spec) == 2:
            return token[spec[0]].get(spec[1])
        if len(spec) >= 3:
            return token[spec[0]] or spec[2]
        return token[spec[0]]

    for sent_no, sent in enumerate(corpus):
        if isinstance(sent, tuple):
            sent = sent[0]

        rows = []
        for tok_no, token in enumerate(sent):
            if is_kept(token):
                form = token['FORM']
                rows.append((form if is_known(form) else unk_token,
                             *(get_value(token, spec) for spec in specs)))
            elif return_nones:
                nones.append((sent_no, tok_no))

        if rows:
            # transpose: rows of tokens -> one tuple per column
            for column_sents, column in zip(sents, zip(*rows)):
                column_sents.append(column)
        elif return_nones:
            empties.append(sent_no)

    if not (fields or return_nones):
        return sents[0]
    if return_nones:
        return (*sents, empties, nones)
    return (*sents,)
Example #17
0
            f'{ENDING_TPL}{HYPHEN}', f'(?:{re_end_})?{HYPHEN}').replace(
                ENDING_TPL, f'(?:{re_end_})?').replace(HYPHEN, rf'${HYPHEN}^')
          + '$').split(HYPHEN), y) for x, y in tpl
    ]


re_cons_ = get_re(conjoints)
re_diss_ = get_re(disjoints)
#print(re_cons_)
#print(re_diss_)

# One entry per template: two compiled regexes (the first two patterns of
# the template), the length of the template's second item, and a flag that
# is True for the templates made from `disjoints`. The templates made from
# `conjoints` go first.
rex = [
    (re.compile(item[0][0]), re.compile(item[0][1]), len(item[1]),
     is_disjoint)
    for templates, is_disjoint in ((re_cons_, False), (re_diss_, True))
    for item in templates
]

# A hyphen token with MISC:SpaceAfter set to 'Yes'
hyphen_tok = Conllu.from_sentence(['-'])[0]
hyphen_tok['MISC']['SpaceAfter'] = 'Yes'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))

    end_spaces = []
    for sentence in corpus:
        sent, meta = sentence

        sub_tokens = []
        multi_end_id = None
        for tok_idx, tok in enumerate(sent):
            id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
            if '-' in id_:
Example #18
0
    def process():
        """Yield sentences of the enclosing *corpus* with embedded brat
        annotations moved to the MISC field of the tokens they cover.

        Tokens with ``FORM`` equal to ``None`` are service ones: a token with
        ``BRAT_START_TAG`` in MISC opens an annotation, a token with
        ``BRAT_END_TAG`` closes all open annotations that start with the
        given id followed by ``SEP2``. Such tokens are not copied to the
        result. Every regular token receives the ``BRAT_TAG + <id>`` MISC
        features for the entity and the linked annotations of each
        annotation open at its position. The 'text' metadata is dropped;
        matches of ``RE_BRAT`` are removed from the 'par_text' metadata.
        """
        def unmask(text):
            # restore the characters masked in free-text parts
            return text.replace(r'\{}'.format(BRAT_TEXT_BOUND_START_MARK[-1]),
                                BRAT_TEXT_BOUND_START_MARK[-1]) \
                       .replace(r'\{}'.format(SEP1), SEP1) \
                       .replace('__', ' ').replace(r'\_', '_')

        for sent, meta in Conllu.load(corpus) \
                              if isinstance(corpus, str) else \
                          corpus:
            meta.pop('text', None)
            if 'par_text' in meta:
                meta['par_text'] = RE_BRAT.sub('', meta['par_text'])
            sent_ = []
            anns = []
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is None:
                    if BRAT_START_TAG in misc:
                        # NOTE(review): the check compares the feature
                        # *name*, not its value, with the open annotations -
                        # confirm that is intended
                        assert BRAT_START_TAG not in anns
                        assert misc[BRAT_START_TAG][0] == 'T', \
                            'ERROR: Invalid annotation type "{}"' \
                                .format(misc[BRAT_START_TAG])
                        anns.append(misc[BRAT_START_TAG])
                    elif BRAT_END_TAG in misc:
                        if anns:
                            # drop every open annotation of the closed id
                            prefix = misc[BRAT_END_TAG] + SEP2
                            anns = [x for x in anns
                                        if not x.startswith(prefix)]
                        # the removed marker passes its SpaceAfter to the
                        # preceding token
                        if sent_ and 'SpaceAfter' in misc:
                            sent_[-1]['MISC']['SpaceAfter'] = \
                                misc['SpaceAfter']
                    else:
                        sent_.append(token)
                else:
                    for ann in anns:
                        # <entity>, then the annotations linked to it
                        parts = ann.split(SEP1 + SEP1)
                        entity, linked = parts[0], parts[1:]
                        tid, name = entity.split(SEP2)
                        assert tid.startswith('T'), \
                            'ERROR: Unrecognized annotation {}'.format(parts)
                        misc[BRAT_TAG + tid] = name
                        for sub in linked:
                            if sub.startswith('R'):
                                ann_id, sub_name, role = sub.split(SEP2)
                                val = tid + SEP3 + sub_name + SEP3 + role
                            elif sub.startswith('*'):
                                ann_id, sub_name = sub.split(SEP2)
                                val = tid + SEP3 + sub_name
                            elif sub.startswith('E'):
                                ann_id, sub_name, role = sub.split(SEP2)
                                val = tid + SEP3 + sub_name
                                if role:
                                    val += SEP3 + role
                            elif sub.startswith('A'):
                                ann_id, sub_name, value = sub.split(SEP2)
                                val = tid + SEP3 + sub_name
                                if value:
                                    val += SEP3 + value
                            elif sub.startswith('N'):
                                # the title is free text: it may contain
                                # SEP2, so the number of splits is limited
                                ann_id, service_name, service_id, title = \
                                    sub.split(SEP2, maxsplit=3)
                                val = tid + SEP3 + service_name \
                                    + SEP3 + service_id + SEP3 + unmask(title)
                            elif sub.startswith('#'):
                                ann_id, note = sub.split(SEP2, maxsplit=1)
                                val = tid + SEP3 + unmask(note)
                            else:
                                raise ValueError('ERROR: Unknown annotation '
                                                 'type')
                            misc[BRAT_TAG + ann_id] = val
                    sent_.append(token)
            yield sent_, meta
Example #19
0
#!/usr/bin/python -u
#-*- encoding: utf-8 -*-

from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

# Directory with the files to check (a machine-specific absolute path)
DIR = r'C:\prj-git\_mine\ru_corner\_data\conll\newswire'
# NOTE(review): not used in the visible part of the script
TOKEN = '%'
# The report file; the loop below prints its findings there.
# NOTE(review): no close() is visible in this part of the script
log = open('_splitted', 'wt', encoding='utf-8')

# The last file something was reported for; it is used to separate the
# reports of different files with an empty line
parent_fn = None
for fn in glob.glob(DIR + '/*/*.txt', recursive=True):
    corpus = list(Conllu.load(fn, fix=False, log_file=None))
    path = Path(fn)

    for idx, sentence in enumerate(corpus):
        sent, meta = sentence

        prev_id = None
        for idx_, tok in enumerate(sent):
            id_ = tok['ID']
            if id_ == prev_id:
                if parent_fn and parent_fn != fn:
                   print(file=log)
                parent_fn = fn
                print('{} ({}) - {} : {} / {} "{} {}"'
                          .format(meta['sent_id'], idx, id_,
                                  path.parent.name, path.name,
Example #20
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project: Text Preprocessing pipeline
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and save articles as CONLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

# download Wikipedia if it's not done yet
download_wikipedia(overwrite=False)
# tokenize the articles...
articles = TokenizedWikipedia().articles()
# ...and save them in CoNLL-U format
Conllu.save(articles, 'wiki.conllu', fix=False, log_file=None)
Example #21
0
def postprocess_brat_conllu(corpus, save_to=None):
    """Does postprocessing for the *corpus* with embedded brat annotations
    which already was preliminarily prepared by Toxine's TextPreprocessor.

    Tokens without ``FORM`` that carry a brat start or end tag are removed
    from the sentences. Every annotation is copied to the ``MISC`` field of
    all the tokens with ``FORM`` placed between its start and end tags. The
    ``text`` meta field is dropped, and everything that matches ``RE_BRAT``
    is removed from the ``par_text`` meta field.

    Note that the tokens and metadata of the *corpus* are changed in place.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the previously
                   saved corpus in CoNLL-U format.
    :param save_to: a path where result will be stored. If ``None`` (default),
                    the function returns the result as a generator of Parsed
                    CoNLL-U data.
    :return: the result of ``Conllu.fix()`` for the processed corpus if
             *save_to* is empty; ``None`` otherwise.
    :raises ValueError: if an annotation attached to an entity has unknown
                        type.
    """
    def process():
        def unmask(text):
            # Remove the backslash put before the special characters, turn
            # the double underscore into a space and `\_` into `_`. The
            # order of the replacements is important.
            return text.replace(r'\{}'.format(BRAT_TEXT_BOUND_START_MARK[-1]),
                                BRAT_TEXT_BOUND_START_MARK[-1]) \
                       .replace(r'\{}'.format(SEP1), SEP1) \
                       .replace('__', ' ').replace(r'\_', '_')

        sentences = Conllu.load(corpus) if isinstance(corpus, str) else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            if 'par_text' in meta:
                meta['par_text'] = RE_BRAT.sub('', meta['par_text'])
            sent_ = []
            # annotations that are opened with a start tag and not closed
            # yet; the list is reset for every sentence
            anns = []
            for token in sent:
                misc = token['MISC']

                if token['FORM'] is None:
                    if BRAT_START_TAG in misc:
                        # NOTE(review): this checks the name of the tag, not
                        # the annotation itself; probably
                        # `misc[BRAT_START_TAG] not in anns` was meant -
                        # confirm before changing.
                        assert BRAT_START_TAG not in anns
                        assert misc[BRAT_START_TAG].startswith('T'), \
                            'ERROR: Invalid annotation type "{}"' \
                                .format(misc[BRAT_START_TAG])
                        anns.append(misc[BRAT_START_TAG])
                    elif BRAT_END_TAG in misc:
                        # close all the annotations of the entity which id
                        # is specified in the end tag
                        prefix = misc[BRAT_END_TAG] + SEP2
                        anns = [x for x in anns if not x.startswith(prefix)]
                        # the end tag token is dropped, so its SpaceAfter
                        # mark is moved to the previous kept token
                        if sent_ and 'SpaceAfter' in misc:
                            sent_[-1]['MISC']['SpaceAfter'] = \
                                misc['SpaceAfter']
                    else:
                        # not a brat tag: keep the token as is
                        sent_.append(token)
                    continue

                for ann in anns:
                    # the first part is the entity, the rest are
                    # the annotations attached to it
                    entity, *linked = ann.split(SEP1 + SEP1)
                    tid, name = entity.split(SEP2)
                    assert tid.startswith('T'), \
                        'ERROR: Unrecognized annotation {}'.format(ann)
                    misc[BRAT_TAG + tid] = name
                    for link in linked:
                        if link.startswith('R'):
                            ann_id, ann_name, role = link.split(SEP2)
                            val = tid + SEP3 + ann_name + SEP3 + role
                        elif link.startswith('*'):
                            ann_id, ann_name = link.split(SEP2)
                            val = tid + SEP3 + ann_name
                        elif link.startswith(('E', 'A')):
                            # the last field (a role for 'E', a value for
                            # 'A') is added only if it's not empty
                            ann_id, ann_name, extra = link.split(SEP2)
                            val = tid + SEP3 + ann_name
                            if extra:
                                val += SEP3 + extra
                        elif link.startswith('N'):
                            # the title may contain SEP2, so it's the tail
                            ann_id, service_name, service_id, title = \
                                link.split(SEP2, maxsplit=3)
                            val = tid + SEP3 + service_name \
                                + SEP3 + service_id + SEP3 + unmask(title)
                        elif link.startswith('#'):
                            # the note may contain SEP2, so it's the tail
                            ann_id, note = link.split(SEP2, maxsplit=1)
                            val = tid + SEP3 + unmask(note)
                        else:
                            raise ValueError('ERROR: Unknown annotation '
                                             'type')
                        misc[BRAT_TAG + ann_id] = val
                sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
        return None
    return Conllu.fix(res)
Example #22
0
#-*- encoding: utf-8 -*-

from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

CONLL_DIR = '_0'
EDITED_DIR = '_1'

TOKEN = '$'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))

    start_spaces = []
    for sentence in corpus:
        sent, meta = sentence
        if 'par_text' in meta:
            parts = meta['par_text'].split(TOKEN)
            start_spaces = [x[:1] == ' ' for x in parts[1:]]

        if not start_spaces:
            continue

        if 'text' not in meta:
            continue

        parts = meta['text'].split(TOKEN)