#!/usr/bin/env python3
"""
UD morphology and features
"""

import os
import re
from collections import namedtuple
from collections import defaultdict
import json

from util import import_xml_lib, get_fnames, get_children

# XML library object returned by the project helper
# (presumably an ElementTree-compatible module -- confirm in util).
ET = import_xml_lib()

# Raw string: '\s' inside a plain literal is an invalid escape sequence and
# triggers a warning on modern Python; the compiled pattern is unchanged.
whitespace_re = re.compile(r'\s+')
# Matches ':"', ':-' or ',-'.
opening_cite_re = re.compile('(:["-]|,-)')

# Read the exceptions table with an explicit encoding so the result does not
# depend on the platform's locale default (JSON text is UTF-8 by specification).
with open('PROPN_exc.json', encoding='utf-8') as data_file:
    load_exceptions = json.load(data_file)

# One morphological feature. Judging by the `gender` instance below:
#   ud_id   -- feature name string (e.g. 'Gender')
#   regex   -- compiled pattern that finds the source tag
#   convert -- dict mapping a matched source tag to the target value
#   default -- fallback value (None for `gender`); exact use is not visible
#              here -- TODO confirm against the consuming code
Feature = namedtuple('Feature', ['ud_id', 'regex', 'convert', 'default'])
# A part of speech: an identifier plus its features
# (presumably a collection of Feature objects -- confirm against usage).
Pos = namedtuple('Pos', ['ud_id', 'feats'])

# Folder names (presumably input and output directories -- confirm against usage).
ifolder = 'Syntax'
ofolder = 'Morphology'

# nominal
# Gender: whole-word tags МУЖ / ЖЕН / СРЕД map to Masc / Fem / Neut.
gender = Feature(
    'Gender',
    re.compile(r'\b((МУЖ)|(ЖЕН)|(СРЕД))\b'),
    {
        'МУЖ': 'Masc',
        'ЖЕН': 'Fem',
        'СРЕД': 'Neut',
    },
    None,
)

# NOTE(review): the next definition is cut off in the visible source; its
# opening is preserved verbatim below and must be completed from the full file.
# anim = Feature('Animacy', re.compile(r'\b((ОД)|(НЕОД))\b'), {
#!/usr/bin/env python3
"""
Small fixes.

1. delete sentences from targets_for_deletion list
2. replaces 'ё' -> 'е'
3. replaces PART -> ADV for lemmas 'уже', 'еще', 'почти', 'также', 'чуть'
"""

import os

from util import import_xml_lib, get_fnames

# XML library object returned by the project helper
# (presumably an ElementTree-compatible module -- confirm in util).
ET = import_xml_lib(verbose=True)

# Maps a .tgt file name to an integer (presumably the index of the sentence
# to delete from that file -- confirm against the code that consumes it).
targets_for_deletion = {
    'Problema_vybora.tgt': 5,
    'Vyzhivshii_kamikadze.tgt': 255,
    'Ukroshchenie_stroptivogo_naukograda.tgt': 47,
    'Korp_622.tgt': 13,
    'Korp_624.tgt': 51,
    'Pravilo_75.tgt': 48,
    'V_perevode_s_nebesnogo.tgt': 41,
}


def safe_str(text):
    """
    Return a normalised copy of `text`: surrounding whitespace stripped,
    lower-cased, and every 'ё' replaced by 'е'. `None` yields ''.
    """
    if text is None:
        return ''
    normalised = text.strip().lower()
    return normalised.replace('ё', 'е')


def fix_token(fname, i, j, token, word, lemma, newfeat, logfile):
    """
    Overwrite the token's FEAT and LEMMA attributes and log the change.

    The attributes are written to `logfile` twice: once as they were before
    the change (with file name, sentence number `i` and token number `j`)
    and once after `newfeat` and `lemma` have been assigned.
    """
    # Snapshot the attributes before they are modified.
    before = str(token.attrib)
    header = 'Fixed {}, sentence {}, token {}:\n\t{} {}\n'.format(
        fname, i, j, before, word)
    logfile.write(header)

    token.attrib['FEAT'] = newfeat
    token.attrib['LEMMA'] = lemma

    after = str(token.attrib)
    logfile.write('\t{} {}\n'.format(after, word))


# NOTE(review): the next definition is cut off in the visible source; its
# header is preserved verbatim below and must be completed from the full file.
# def munch(ifiles, ofiles):