def tokenize(num_links, isdialog=True, norm_punct=False):
    tp = TextPreprocessor()
    chunk_fns = get_file_list(CHUNKS_DIR, num_links)
    max_conll = min(CONLL_FOR_SOURCE, len(chunk_fns))
    chunk_no, texts_processed = 1, 0
    for chunk_fn in chunk_fns:
        conll_fn = chunk_fn.replace(CHUNKS_DIR, CONLL_DIR)
        assert conll_fn != chunk_fn, 'ERROR: invalid path to chunk file'
        if not os.path.isfile(conll_fn):
            with open(chunk_fn, 'rt', encoding='utf-8') as f_in:
                text = norm_text(f_in.read())
            if not text:
                continue
            pars = text.split('\n')
            if isdialog:
                text = [x.split('\t') for x in pars if x]
                curr_speaker = None
                speakers, pars = [], []
                for speaker, sentence in text:
                    if speaker:
                        if speaker != curr_speaker:
                            curr_speaker = speaker
                    else:
                        speaker = curr_speaker
                    speakers.append(curr_speaker)
                    pars.append(sentence)
                speaker_list = {
                    x: str(i)
                    for i, x in enumerate(OrderedDict(zip(speakers,
                                                          speakers)),
                                          start=1)
                }
            doc_id = fn_to_id(conll_fn)
            tp.new_doc(doc_id=doc_id, metadata=[])
            tp.new_pars(pars, doc_id=doc_id)
            tp.do_all(tag_phone=False, tag_date=False,
                      norm_punct=norm_punct, silent=True)
            conll = list(tp.save(doc_id=doc_id))
            tp.remove_doc(doc_id)
            if isdialog:
                speakers = iter(speakers)
                for sentence in conll:
                    sent, meta = sentence
                    if not any(x.isalnum() for x in meta['text']):
                        continue
                    if 'newpar id' in meta:
                        meta['speaker'] = speaker_list[next(speakers)]
            Conllu.save(conll, conll_fn, log_file=None)
            print('\r{} (of {})'.format(chunk_no, max_conll), end='')
            texts_processed += 1
        chunk_no += 1
        if chunk_no > max_conll:
            break
    if texts_processed:
        print()
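# Usage sketch (assumes the module-level CHUNKS_DIR, CONLL_DIR and
# CONLL_FOR_SOURCE constants, and the get_file_list()/norm_text()/
# fn_to_id() helpers, are defined elsewhere in this module): tokenize
# the chunk files of the first 100 links, treating each non-empty line
# as a "speaker\tsentence" dialog turn.
tokenize(100, isdialog=True)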
def split_corpus(corpus, split=(.8, .1, .1), save_split_to=None, seed=None,
                 silent=False):
    """Split the *corpus* in the given proportion.

    :param corpus: a name of a file in CoNLL-U format or a list/iterator
        of sentences in Parsed CoNLL-U
    :param split: list of sizes of the resulting *corpus* parts. Values
        of int type are interpreted as lengths of the new corpora in
        sentences; float values are proportions of the given *corpus*.
        The types of the *split* values can't be mixed: they are either
        all int, or all float. The sum of float values must be less than
        or equal to 1; the sum of int values can't be greater than the
        length of the *corpus*
    :param save_split_to: list of file names to save the results of the
        *corpus* splitting. Can be `None` (default; don't save parts to
        files); otherwise, its length must be equal to the length of
        *split*
    :param seed: a seed for the random number generator, to make the
        shuffle reproducible
    :param silent: if ``True``, suppress output
    :return: a list of new corpora
    """
    assert save_split_to is None or len(save_split_to) == len(split), \
        'ERROR: lengths of split and save_split_to must be equal'
    isfloat = len([x for x in split if isinstance(x, float)]) > 0
    if isfloat:
        assert sum(split) <= 1, \
            "ERROR: sum of split can't be greater than 1"
    corpus = list(Conllu.load(corpus,
                              log_file=None if silent else LOG_FILE))
    corpus_len = len(corpus)
    if isfloat:
        split = list(map(lambda x: round(corpus_len * x), split))
        diff = corpus_len - sum(split)
        if abs(diff) == 1:
            split[-1] += diff
    assert sum(split) <= corpus_len, \
        "ERROR: sum of split can't be greater than corpus length"
    random.seed(seed)
    random.shuffle(corpus)
    res = []
    pos_b = 0
    for i, sp in enumerate(split):
        pos_e = pos_b + sp
        corpus_ = corpus[pos_b:pos_e]
        pos_b = pos_e
        if save_split_to:
            Conllu.save(corpus_, save_split_to[i])
        res.append(corpus_)
    return res
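# A minimal usage sketch (assuming 'corpus.conllu' exists; all file names
# here are hypothetical): shuffle the corpus reproducibly and split it
# into train/dev/test parts of 80%/10%/10%.
train, dev, test = split_corpus(
    'corpus.conllu', split=[.8, .1, .1],
    save_split_to=['train.conllu', 'dev.conllu', 'test.conllu'], seed=42)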
def make_ne_tags(corpus, save_to=None, keep_originals=True):
    """Process the *corpus* in CoNLL-U or Parsed CoNLL-U format such that
    MISC:bratT entities are converted to the MISC:NE entities supported by
    MorDL. Note that if several bratT entities are linked to one token,
    only the first one will be used (only one MISC:NE entity is allowed
    per token).

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format.
    :param save_to: a path where the result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data.
    :param keep_originals: if ``True`` (default), original MISC:bratT
        entities are kept intact. Otherwise, they are removed.
    """
    TAG = BRAT_TAG + 'T'

    def process():
        for i, (sent, meta) in enumerate(Conllu.load(corpus)
                                             if isinstance(corpus, str) else
                                         corpus):
            for token in sent:
                misc = token['MISC']
                ne = None
                ne_excess = set()
                for feat, val in misc.items():
                    if feat.startswith(TAG):
                        if ne and ne != val:
                            warnings.warn(
                                'Multiple brat entities in sent {} '
                                '(sent_id = {}), token {} ("{}"): '
                                'Entities {} and {}. Ignoring the last one'
                                    .format(i, meta['sent_id'], token['ID'],
                                            token['FORM'], ne, val))
                        else:
                            ne = val
                        ne_excess.add(feat)
                if ne:
                    if not keep_originals:
                        for ne_ in list(ne_excess):
                            misc.pop(ne_)
                    misc[TAG_NE] = ne
            yield sent, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=False)
    else:
        return res
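# Usage sketch (file names are hypothetical): convert embedded MISC:bratT
# entities to MISC:NE tags and drop the brat originals.
make_ne_tags('corpus_brat.conllu', save_to='corpus_ne.conllu',
             keep_originals=False)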
def save_conllu(*args, **kwargs):
    """Wrapper for ``Conllu.save()``."""
    silent = kwargs.pop('silent', None)
    if silent:
        kwargs['log_file'] = None
    elif 'log_file' not in kwargs:
        kwargs['log_file'] = LOG_FILE
    return Conllu.save(*args, **kwargs)
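# Usage sketch (the corpus variable and file name are hypothetical):
# save a corpus while suppressing the progress output.
save_conllu(corpus, 'corpus.conllu', silent=True)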
def postprocess_brat_conllu(corpus, save_to=None):
    """Convert a corpus in text format into CoNLL-U format. Embedded brat
    entities will be placed into the MISC field.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data
    """
    def process():
        for sent, meta in Conllu.load(corpus) \
                              if isinstance(corpus, str) else \
                          corpus:
            meta.pop('text', None)
            sent_ = []
            tags = []
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is None:
                    if TAGS_BRAT[0] in misc:
                        if misc[TAGS_BRAT[0]] not in tags:
                            tags.append(misc[TAGS_BRAT[0]])
                    elif TAGS_BRAT[1] in misc:
                        try:
                            tags.remove(misc[TAGS_BRAT[1]])
                        except ValueError:
                            pass
                    # pseudo-tokens are dropped; transfer their SpaceAfter
                    # to the preceding real token
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']
                    else:
                        sent_.append(token)
                else:
                    for tag in tags:
                        misc[TAG_BRAT + tag] = 'Yes'
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)
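# Usage sketch (file names are hypothetical): move the brat tags carried
# by Toxine's pseudo-tokens into the MISC fields of the real tokens.
postprocess_brat_conllu('corpus_raw.conllu', save_to='corpus_brat.conllu')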
def make_ne_tags(corpus, save_to=None):
    """Replace brat entities in the corpus in CoNLL-U or Parsed CoNLL-U
    format with the MISC:NE entities supported by MorDL. Note that if
    several brat entities are linked to one token, only the first one
    will be used.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data
    """
    def process():
        tag_brat_len = len(TAG_BRAT)
        for i, (sent, meta) in enumerate(Conllu.load(corpus)
                                             if isinstance(corpus, str) else
                                         corpus):
            for token in sent:
                misc = token['MISC']
                ne = None
                ne_excess = set()
                for feat, val in misc.items():
                    if feat.startswith(TAG_BRAT) and val == 'Yes':
                        if ne:
                            warnings.warn(
                                'Multiple brat entities in sent {} '
                                '(sent_id = {}), token {} ("{}"): '
                                'Entities {} and {}. Ignoring the last one'
                                    .format(i, meta['sent_id'], token['ID'],
                                            token['FORM'], ne, feat))
                            ne_excess.add(feat)
                        else:
                            ne = feat
                if ne:
                    for ne_ in [ne] + list(ne_excess):
                        misc.pop(ne_)
                    misc[TAG_NE] = ne[tag_brat_len:]
            yield sent, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=False)
    else:
        return res
def get_conllu_fields(corpus=None, fields=None, word2idx=None,
                      unk_token=None, with_empty=False, silent=False):
    """Split a corpus in CoNLL-U format into separate lists of tokens and
    tags.

    :param corpus: the corpus in CoNLL-U or Parsed CoNLL-U format.
    :param fields: list of CoNLL-U fields besides 'FORM' to extract.
    :type fields: list
    :param word2idx: Word to Index dict. If not ``None``, all words not
        in the dict will be skipped or replaced with *unk_token*
    :type word2idx: dict({word: int})
    :param unk_token: replacement for tokens that are not present in
        *word2idx*.
    :type unk_token: str
    :param with_empty: don't skip empty sentences.
    :param silent: suppress output.
    :return: the split corpus
    :rtype: tuple(list([list([str|OrderedDict])]))
    """
    if fields is None:
        fields = []

    if isinstance(corpus, str):
        corpus = Conllu.load(corpus,
                             **({'log_file': None} if silent else {}))
    elif callable(corpus):
        corpus = corpus()

    sents = tuple([] for _ in range(len(fields) + 1))

    for sent in corpus:
        if isinstance(sent, tuple):
            sent = sent[0]
        for i, field in enumerate(zip(*[
            (x['FORM'] if not word2idx or x['FORM'] in word2idx else
             unk_token,
             *[x[y[0]].get(y[1], y[2]) if len(y) >= 3 else
               x[y[0]].get(y[1]) if len(y) == 2 else
               x[y[0]]
                   for y in [y.split(':') for y in fields]])
                for x in sent
                    if x['FORM'] and '-' not in x['ID']
                   and (not word2idx or x['FORM'] in word2idx or unk_token)
        ])):
            if field or with_empty:
                sents[i].append(field)

    return sents if fields else sents[0]
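# Usage sketch (assuming 'train.conllu' exists): extract word forms along
# with UPOS tags and the MISC:NE feature, falling back to 'O' for tokens
# without an NE annotation (the 'FIELD:feat:default' spec).
forms, upos, nes = get_conllu_fields('train.conllu',
                                     fields=['UPOS', 'MISC:NE:O'])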
def embed_conllu_fields(corpus, fields, values, empties=None, nones=None,
                        silent=False):
    """Write *values* back into the given *fields* of the *corpus*,
    re-inserting placeholders for the sentences and tokens that were
    skipped during extraction (*empties* and *nones*). Yields the updated
    sentences in Parsed CoNLL-U format."""
    if isinstance(corpus, str):
        corpus = Conllu.load(corpus,
                             **({'log_file': None} if silent else {}))
    elif callable(corpus):
        corpus = corpus()

    if empties:
        for i in empties:
            values.insert(i, [])
    if nones:
        for i, j in nones:
            values[i].insert(j, None)

    for sentence, vals in zip(corpus, values):
        sent = sentence[0] if isinstance(sentence, tuple) else sentence
        for token, val in zip(sent, vals):
            for field, val_ in [[fields, val]] \
                                   if isinstance(fields, str) else \
                               zip(fields, val):
                field = field.split(':')
                if val_ is not None:
                    if len(field) >= 2:
                        if len(field) >= 3 and val_ == field[2]:
                            if field[1]:
                                token[field[0]].pop(field[1], None)
                            else:
                                token[field[0]] = None
                        else:
                            if field[1]:
                                token[field[0]][field[1]] = val_
                            else:
                                token[field[0]] = val_
                    else:
                        token[field[0]] = val_
        yield sentence
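# Usage sketch (the file names and new_upos are hypothetical): overwrite
# the UPOS field of every token with externally produced tags and save
# the result.
save_conllu(embed_conllu_fields('dev.conllu', 'UPOS', new_upos),
            'dev_tagged.conllu')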
import glob
import os
import sys

from corpuscula import Conllu
from _utils_add import _path, _sub_idx, DATA_DIR_NAME

assert len(sys.argv) == 3, \
    'ERROR: Syntax is: {} <domain> <source>'.format(sys.argv[0])
domain, source = sys.argv[1:]

def setdir_(*suffixes):
    dir_ = os.path.join(*_path[:_sub_idx], DATA_DIR_NAME, *suffixes)
    if not os.path.isdir(dir_):
        os.makedirs(dir_)
    return dir_

ORIG_DIR = setdir_('conll')
BRAT_DIR = setdir_('brat', 'conll')
OUT_DIR = setdir_('..', 'corpus', 'ner', 'conll')

for fn in glob.glob(ORIG_DIR + '/{}/{}/*.txt'.format(domain, source),
                    recursive=True):
    print(fn)
    brat_fn = fn.replace(ORIG_DIR, BRAT_DIR)
    out_fn = fn.replace(ORIG_DIR, OUT_DIR)[:-4] + '.conllu'
    out_dir = os.path.dirname(out_fn)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    Conllu.save(Conllu.merge(fn, brat_fn, ignore_new_meta=True), out_fn)
#-*- encoding: utf-8 -*-
from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

CONLL_DIR = r'C:\prj-git\_mine\ru_corner\_data\conll\newswire'
EDITED_DIR = '_0'
TOKEN = '%'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))
    end_spaces = []
    for sentence in corpus:
        sent, meta = sentence
        if 'par_text' in meta:
            parts = meta['par_text'].split(TOKEN)
            end_spaces = [x[-1:] == ' ' for x in parts[:-1]]
        if not end_spaces:
            continue
        if 'text' not in meta:
            continue
        parts = meta['text'].split(TOKEN)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and make its articles look like the output of
some speech recognition software. Save the result as CoNLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

download_wikipedia(overwrite=False)
Conllu.save(TokenizedWikipedia().articles(), 'wiki_speech.conllu',
            fix=True, adjust_for_speech=True, log_file=None)
def conllu_to_brat(corpus, txt_fn, ann_fn=None, spaces=3, short_spaces=1):
    """Convert the *corpus* in CoNLL-U format to the txt and ann files
    used by brat.

    :param corpus: corpus in CoNLL-U or Parsed CoNLL-U format.
    :param txt_fn: a path to the brat txt file.
    :param ann_fn: a path to the brat ann file. If ``None`` (default),
        the extension of the *txt_fn* file will be changed to '.ann'.
    :param spaces: number of spaces to use as the word delimiter.
    :param short_spaces: number of spaces to use as the word delimiter
        inside multiword tokens (when the ID field has a hyphen inside).

    Note that we create empty `.ann` files. Use this function to get
    initial data for annotation."""
    fn, fe = os.path.splitext(txt_fn)
    if fe != '.txt':
        print('WARNING: Extension of txt_fn must be ".txt"',
              file=sys.stderr)
    if ann_fn is None:
        ann_fn = fn + '.ann'
    _, fe = os.path.splitext(ann_fn)
    if fe != '.ann':
        print('WARNING: Extension of ann_fn must be ".ann"',
              file=sys.stderr)
    # the second open() just creates an empty .ann file
    with io.open(txt_fn, 'wt', encoding='utf-8', newline='\n') as out_f, \
         open(ann_fn, 'w'):
        for sent_no, sent in enumerate(Conllu.load(corpus, fix=False,
                                                   log_file=None)):
            if sent_no:
                print(file=out_f)
            if 'newpar id' in sent[1]:
                print(file=out_f)
            is_next = None
            short_start = short_end = None
            for tok in sent[0]:
                id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
                if '.' in id_:
                    continue
                if '-' in id_:
                    short_start, short_end, *_ = id_.split('-')
                    continue
                if is_next:
                    print(' ' * (short_spaces
                                     if short_end and not short_start else
                                 spaces),
                          end='', file=out_f)
                has_entity = False
                for feat, value in misc.items():
                    if feat.startswith('Entity'):
                        assert not has_entity
                        # workaround because brat can't display emojis
                        # correctly
                        form = '[emo]' if feat == 'EntityEmoji' else value
                        has_entity = True
                is_next = True
                if short_end:
                    short_start = None
                    if id_ == short_end:
                        short_end = None
                print(form, end='', file=out_f)
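# Usage sketch (file names are hypothetical): dump a corpus to a brat
# document pair; 'example.ann' is created empty, ready for annotation.
conllu_to_brat('corpus.conllu', 'example.txt')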
def extract_conllu_fields(corpus, fields=None, word2idx=None,
                          unk_token=None, with_empty=False,
                          return_nones=False, silent=False):
    """Split a corpus in CoNLL-U format into separate lists of tokens and
    tags.

    :param corpus: the corpus in CoNLL-U or Parsed CoNLL-U format.
    :param fields: list of CoNLL-U fields besides 'FORM' to extract.
    :type fields: list|str
    :param word2idx: Word to Index dict. If not ``None``, all words not
        in the dict will be skipped or replaced with *unk_token*
    :type word2idx: dict({word: int})
    :param unk_token: replacement for tokens that are not present in
        *word2idx*.
    :type unk_token: str
    :param with_empty: don't skip empty sentences.
    :param silent: suppress output.
    :param return_nones: return indexes of filtered sentences and tokens
    :return: the split corpus
    :rtype: tuple(list([list([str|OrderedDict])]),
        [list([<empty sent idx>]),
         list([tuple(<empty token sent idx>, <empty token idx>)])])
    """
    if fields is None:
        fields = []
    elif isinstance(fields, str):
        fields = [fields]

    if isinstance(corpus, str):
        corpus = Conllu.load(corpus,
                             **({'log_file': None} if silent else {}))
    elif callable(corpus):
        corpus = corpus()

    sents = tuple([] for _ in range(len(fields) + 1))
    empties, nones = [], []

    for i, sent in enumerate(corpus):
        if isinstance(sent, tuple):
            sent = sent[0]
        isempty = True
        for j, field in enumerate(zip(*[
            (x['FORM'] if not word2idx or x['FORM'] in word2idx else
             unk_token,
             *[x[y[0]].get(y[1], y[2]) if len(y) >= 3 and y[1] else
               x[y[0]].get(y[1]) if len(y) == 2 else
               x[y[0]] or y[2] if len(y) >= 3 else
               x[y[0]]
                   for y in [y.split(':') for y in fields]])
                for x in sent
                    if x['FORM'] and '-' not in x['ID']
                   and (not word2idx or x['FORM'] in word2idx or unk_token)
        ])):
            sents[j].append(field)
            isempty = False
        if isempty and return_nones:
            empties.append(i)
        if return_nones:
            for j, x in enumerate(sent):
                if not (x['FORM'] and '-' not in x['ID']
                    and (not word2idx or x['FORM'] in word2idx
                      or unk_token)):
                    nones.append((i, j))

    return (*sents, *((empties, nones) if return_nones else [])) \
               if fields or return_nones else \
           sents[0]
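# A round-trip sketch (the file names and the predict() step are
# hypothetical): extract forms and UPOS together with the indexes of
# skipped sentences/tokens, retag the forms, and embed the new tags back.
forms, upos, empties, nones = extract_conllu_fields(
    'dev.conllu', fields='UPOS', return_nones=True)
new_upos = predict(forms)  # hypothetical tagger
save_conllu(embed_conllu_fields('dev.conllu', 'UPOS', new_upos,
                                empties=empties, nones=nones),
            'dev_tagged.conllu')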
                f'{ENDING_TPL}{HYPHEN}',
                f'(?:{re_end_})?{HYPHEN}').replace(
                ENDING_TPL, f'(?:{re_end_})?').replace(
                HYPHEN, rf'${HYPHEN}^') + '$').split(HYPHEN), y)
           for x, y in tpl]

re_cons_ = get_re(conjoints)
re_diss_ = get_re(disjoints)
#print(re_cons_)
#print(re_diss_)
rex = [(re.compile(x[0][0]), re.compile(x[0][1]), len(x[1]), False)
           for x in re_cons_] \
    + [(re.compile(x[0][0]), re.compile(x[0][1]), len(x[1]), True)
           for x in re_diss_]

hyphen_tok = Conllu.from_sentence(['-'])[0]
hyphen_tok['MISC']['SpaceAfter'] = 'Yes'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))
    end_spaces = []
    for sentence in corpus:
        sent, meta = sentence
        sub_tokens = []
        multi_end_id = None
        for tok_idx, tok in enumerate(sent):
            id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
            if '-' in id_:
#!/usr/bin/python -u
#-*- encoding: utf-8 -*-
from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

DIR = r'C:\prj-git\_mine\ru_corner\_data\conll\newswire'
TOKEN = '%'

log = open('_splitted', 'wt', encoding='utf-8')
parent_fn = None
for fn in glob.glob(DIR + '/*/*.txt', recursive=True):
    corpus = list(Conllu.load(fn, fix=False, log_file=None))
    path = Path(fn)
    for idx, sentence in enumerate(corpus):
        sent, meta = sentence
        prev_id = None
        for idx_, tok in enumerate(sent):
            id_ = tok['ID']
            if id_ == prev_id:
                if parent_fn and parent_fn != fn:
                    print(file=log)
                parent_fn = fn
                print('{} ({}) - {} : {} / {} "{} {}"'
                          .format(meta['sent_id'], idx, id_,
                                  path.parent.name, path.name,
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project: Text Preprocessing pipeline
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and save the articles as CoNLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

# download wikipedia if it's not done yet
download_wikipedia(overwrite=False)
# tokenize and save the articles
Conllu.save(TokenizedWikipedia().articles(), 'wiki.conllu',
            fix=False, log_file=None)
def postprocess_brat_conllu(corpus, save_to=None):
    """Do postprocessing of the *corpus* with embedded brat annotations
    that was preliminarily prepared by Toxine's TextPreprocessor.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format.
    :param save_to: a path where the result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data.
    """
    def process():

        def unmask(text):
            return text.replace(
                       r'\{}'.format(BRAT_TEXT_BOUND_START_MARK[-1]),
                       BRAT_TEXT_BOUND_START_MARK[-1]) \
                       .replace(r'\{}'.format(SEP1), SEP1) \
                       .replace('__', ' ').replace(r'\_', '_')

        for sent, meta in Conllu.load(corpus) \
                              if isinstance(corpus, str) else \
                          corpus:
            meta.pop('text', None)
            if 'par_text' in meta:
                meta['par_text'] = RE_BRAT.sub('', meta['par_text'])
            sent_ = []
            anns = []
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is None:
                    if BRAT_START_TAG in misc:
                        assert BRAT_START_TAG not in anns
                        assert misc[BRAT_START_TAG][0] == 'T', \
                            'ERROR: Invalid annotation type "{}"' \
                                .format(misc[BRAT_START_TAG])
                        anns.append(misc[BRAT_START_TAG])
                    elif BRAT_END_TAG in misc:
                        # close all annotations opened with this id
                        prefix = misc[BRAT_END_TAG] + SEP2
                        anns = [x for x in anns
                                    if not x.startswith(prefix)]
                    # pseudo-tokens are dropped; transfer their SpaceAfter
                    # to the preceding real token
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']
                    else:
                        sent_.append(token)
                else:
                    for ann in anns:
                        ann = ann.split(SEP1 + SEP1)
                        entity, ann_ = ann[0], ann[1:]
                        tid, name = entity.split(SEP2)
                        assert tid.startswith('T'), \
                            'ERROR: Unrecognized annotation {}'.format(ann)
                        misc[BRAT_TAG + tid] = name
                        for ann in ann_:
                            if ann.startswith('R'):
                                ann_id, name, role = ann.split(SEP2)
                                misc[BRAT_TAG + ann_id] = \
                                    tid + SEP3 + name + SEP3 + role
                            elif ann.startswith('*'):
                                ann_id, name = ann.split(SEP2)
                                misc[BRAT_TAG + ann_id] = \
                                    tid + SEP3 + name
                            elif ann.startswith('E'):
                                ann_id, name, role = ann.split(SEP2)
                                val = tid + SEP3 + name
                                if role:
                                    val += SEP3 + role
                                misc[BRAT_TAG + ann_id] = val
                            elif ann.startswith('A'):
                                ann_id, name, value = ann.split(SEP2)
                                val = tid + SEP3 + name
                                if value:
                                    val += SEP3 + value
                                misc[BRAT_TAG + ann_id] = val
                            elif ann.startswith('N'):
                                ann_id, service_name, service_id, title = \
                                    ann.split(SEP2, maxsplit=3)
                                misc[BRAT_TAG + ann_id] = \
                                    tid + SEP3 + service_name \
                                  + SEP3 + service_id \
                                  + SEP3 + unmask(title)
                            elif ann.startswith('#'):
                                ann_id, note = ann.split(SEP2, maxsplit=1)
                                misc[BRAT_TAG + ann_id] = \
                                    tid + SEP3 + unmask(note)
                            else:
                                raise ValueError('ERROR: Unknown '
                                                 'annotation type')
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)
#-*- encoding: utf-8 -*-
from copy import deepcopy
from corpuscula import Conllu
import glob
import os
from pathlib import Path

CONLL_DIR = '_0'
EDITED_DIR = '_1'
TOKEN = '$'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))
    start_spaces = []
    for sentence in corpus:
        sent, meta = sentence
        if 'par_text' in meta:
            parts = meta['par_text'].split(TOKEN)
            start_spaces = [x[:1] == ' ' for x in parts[1:]]
        if not start_spaces:
            continue
        if 'text' not in meta:
            continue
        parts = meta['text'].split(TOKEN)