Example #1
0
#!/usr/bin/env python3
""" UD morphology and features """

import os
import re
from collections import namedtuple
from collections import defaultdict
import json

from util import import_xml_lib, get_fnames, get_children
# XML library chosen by the project helper in util; presumably an
# ElementTree-compatible module (hence the ET alias) — confirm in util.
ET = import_xml_lib()

# Regex patterns are written as raw strings so that backslashes reach the
# regex engine untouched; '\s' in a plain literal is an invalid string escape
# that newer Python versions warn about.
whitespace_re = re.compile(r'\s+')  # a run of one or more whitespace characters
opening_cite_re = re.compile(r'(:["-]|,-)')  # matches ':"', ':-' or ',-'

# Parsed content of PROPN_exc.json, loaded once at import time.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's default locale encoding.
with open('PROPN_exc.json', encoding='utf-8') as data_file:
    load_exceptions = json.load(data_file)

# Feature record: UD feature name, regex that finds the source tag,
# source-tag -> UD-value conversion table, and a default value.
Feature = namedtuple('Feature', 'ud_id regex convert default')
# Pos record: an identifier (ud_id) together with its feats.
Pos = namedtuple('Pos', 'ud_id feats')

# Folder names; presumably input and output directories — confirm at the use site.
ifolder, ofolder = 'Syntax', 'Morphology'

# nominal
# Gender: source tags МУЖ / ЖЕН / СРЕД map to Masc / Fem / Neut; no default.
gender = Feature(
    ud_id='Gender',
    regex=re.compile(r'\b((МУЖ)|(ЖЕН)|(СРЕД))\b'),
    convert={'МУЖ': 'Masc', 'ЖЕН': 'Fem', 'СРЕД': 'Neut'},
    default=None,
)
anim = Feature('Animacy', re.compile(r'\b((ОД)|(НЕОД))\b'), {
#!/usr/bin/env python3

""" Small fixes.

1. Delete the sentences listed in targets_for_deletion.
2. Replace 'ё' with 'е'.
3. Replace PART with ADV for the lemmas 'уже', 'еще', 'почти', 'также', 'чуть'.
"""

import os

from util import import_xml_lib, get_fnames
# XML library chosen by the project helper in util (called with verbose=True);
# presumably an ElementTree-compatible module (hence the ET alias) — confirm in util.
ET = import_xml_lib(verbose=True)

# File name -> integer identifying the sentence to delete in that file
# (presumably a sentence number or ID — confirm against the code that uses it).
targets_for_deletion = {
    'Problema_vybora.tgt': 5,
    'Vyzhivshii_kamikadze.tgt': 255,
    'Ukroshchenie_stroptivogo_naukograda.tgt': 47,
    'Korp_622.tgt': 13,
    'Korp_624.tgt': 51,
    'Pravilo_75.tgt': 48,
    'V_perevode_s_nebesnogo.tgt': 41,
}

def safe_str(text):
    """
    Normalize a possibly missing string for comparison.

    Returns '' when text is None; otherwise the text with surrounding
    whitespace stripped, lowercased, and every 'ё' replaced by 'е'.
    """
    if text is None:
        return ''
    lowered = text.strip().lower()
    return lowered.replace('ё', 'е')

def fix_token(fname, i, j, token, word, lemma, newfeat, logfile):
    """
    Overwrite a token's FEAT and LEMMA attributes and log the change.

    fname, i and j are only used in the log message (file name, sentence
    and token positions). token must expose a mutable ``attrib`` mapping;
    its state is written to logfile before and after the change, each time
    followed by word. newfeat becomes attrib['FEAT'] and lemma becomes
    attrib['LEMMA']. Returns None.
    """
    attrs = token.attrib
    before = str(attrs)
    logfile.write(
        'Fixed {}, sentence {}, token {}:\n\t{} {}\n'.format(fname, i, j, before, word)
    )
    # Same assignment order as always: FEAT first, then LEMMA.
    for key, value in (('FEAT', newfeat), ('LEMMA', lemma)):
        attrs[key] = value
    after = str(attrs)
    logfile.write('\t{} {}\n'.format(after, word))

def munch(ifiles, ofiles):