def main(ifiles, ofiles):
    """Drop 'COM' tokens from every sentence and renumber what is left.

    For each (input, output) pair the XML file is parsed, and in every ``S``
    element under the last child of the root:

    1. every ``W`` whose first ``FEAT`` item is ``'COM'`` is collected,
       unless the head returned by ``get_info`` has the text ``'FANTOM'``;
    2. the children of each collected token (as returned by
       ``get_children_attrib``) get the head's ``ID`` as their ``DOM``;
    3. each collected token is removed, and every ``ID`` / ``DOM`` that
       pointed past the removed position is decremented by one.

    Args:
        ifiles: iterable of input XML paths.
        ofiles: iterable of output paths, paired with ``ifiles`` by ``zip``.

    Returns:
        None. The modified tree is written to each output path as UTF-8.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            compounds = []
            for token in sentence.findall('W'):
                if token.attrib.get('FEAT', 'EMPTY').split()[0] != 'COM':
                    continue
                # Only the head token is needed from get_info's 7-tuple.
                _, _, _, head_token, _, _, _ = get_info(token, sentence)
                children = get_children_attrib(sentence, token.attrib['ID'])
                if head_token.text != 'FANTOM':
                    compounds.append((token, head_token, children))

            # Re-attach the dependents of every compound token to its head
            # before any token is removed, so no DOM is left dangling.
            for _, head_word, children in compounds:
                for child in children:
                    child['DOM'] = head_word.attrib['ID']

            for wordf, _, _ in compounds:
                # Read the ID now: earlier removals may already have shifted it.
                removed_id = int(wordf.attrib['ID'])
                for word in sentence.findall('W'):
                    if int(word.attrib['ID']) == removed_id:
                        sentence.remove(word)
                        break
                # Close the numbering gap left by the removed token.
                for item in sentence.findall('W'):
                    if int(item.attrib['ID']) > removed_id:
                        item.attrib['ID'] = str(int(item.attrib['ID']) - 1)
                    dom = item.attrib.get('DOM')
                    # A token without DOM has no head reference to shift;
                    # previously this case raised KeyError.
                    if dom is None or dom == '_root':
                        continue
                    if int(dom) >= removed_id:
                        item.attrib['DOM'] = str(int(dom) - 1)
        tree.write(ofname, encoding='utf-8')
Exemple #2
0
 def get_tile(self, cid):
     """Fetch info about a tiled tiff, or retrieve a specific tile.

     Looks up the container ``cid`` with read-only access and finds its
     first file record whose ``filetype`` is ``'montage'``. If the request
     carries non-empty ``z``, ``x`` and ``y`` query parameters, the matching
     tile is written to the response as ``image/jpeg``; otherwise the result
     of ``util.get_info`` for the montage file is returned.

     Responds with 404 via ``self.abort`` when the container has no montage
     file (presumably ``abort`` raises, since the code after it needs
     ``montage_info`` -- confirm against the handler base class).
     """
     _id = bson.ObjectId(cid)
     container, _ = self._get(_id, 'ro')  # need at least read access to view tiles
     # A container without a 'files' entry, or a file record without a
     # 'filetype', counts as "no montage" so the request ends in the 404
     # below rather than in a TypeError/KeyError.
     montage_info = None
     for f in container.get('files') or []:
         if f.get('filetype') == 'montage':
             montage_info = f
             break
     if not montage_info:
         self.abort(404, 'montage zip not found')
     fn = montage_info['filename']
     fp = os.path.join(self.app.config['data_path'], cid[-3:], cid, fn)
     z = self.request.GET.get('z')
     x = self.request.GET.get('x')
     y = self.request.GET.get('y')
     if not (z and x and y):
         # No complete tile coordinate given: return the montage info instead.
         return util.get_info(fp)
     self.response.content_type = 'image/jpeg'
     tile = util.get_tile(fp, int(z), int(x), int(y))
     if tile:
         self.response.write(tile)
Exemple #3
0
 def get_tile(self, cid):
     """Fetch info about a tiled tiff, or retrieve a specific tile.

     Looks up the container ``cid`` with read-only access and finds its
     first file record whose ``filetype`` is ``'montage'``. If the request
     carries non-empty ``z``, ``x`` and ``y`` query parameters, the matching
     tile is written to the response as ``image/jpeg``; otherwise the result
     of ``util.get_info`` for the montage file is returned.

     Responds with 404 via ``self.abort`` when the container has no montage
     file (presumably ``abort`` raises, since the code after it needs
     ``montage_info`` -- confirm against the handler base class).
     """
     _id = bson.ObjectId(cid)
     container, _ = self._get(
         _id, 'ro')  # need at least read access to view tiles
     # A container without a 'files' entry, or a file record without a
     # 'filetype', counts as "no montage" so the request ends in the 404
     # below rather than in a TypeError/KeyError.
     montage_info = None
     for f in container.get('files') or []:
         if f.get('filetype') == 'montage':
             montage_info = f
             break
     if not montage_info:
         self.abort(404, 'montage zip not found')
     fn = montage_info['filename']
     fp = os.path.join(self.app.config['data_path'], cid[-3:], cid, fn)
     z = self.request.GET.get('z')
     x = self.request.GET.get('x')
     y = self.request.GET.get('y')
     if not (z and x and y):
         # No complete tile coordinate given: return the montage info instead.
         return util.get_info(fp)
     self.response.content_type = 'image/jpeg'
     tile = util.get_tile(fp, int(z), int(x), int(y))
     if tile:
         self.response.write(tile)
Exemple #4
0
def get_all_info(filepath):
    """Read the tag fields of an audio file and return them sanitized.

    The file extension selects a loader from ``FORMATS``. When that loader
    raises ``ID3NoHeaderError`` the file is opened as ``MP3``, given tags,
    saved, and re-read through ``EasyID3``.

    Returns:
        A 7-tuple ``(artist, album, date, tracknumber, title, ext,
        is_compil)``. The first five are passed through ``sanitize`` (the
        track number through ``trackFormat`` first), ``ext`` is the raw
        extension, and ``is_compil`` falls back to ``False`` when the
        'compilation' lookup is falsy.

    Raises:
        UnsupportedFileTypeError: if the extension is not a key of
            ``FORMATS``.
    """
    extension = os.path.splitext(filepath)[1]
    if extension not in FORMATS:
        raise UnsupportedFileTypeError()

    try:
        tags = FORMATS[extension](filepath)
    except ID3NoHeaderError:
        # No ID3 header yet: write one through MP3, then reopen.
        untagged = MP3(filepath)
        untagged.add_tags()
        untagged.save()
        tags = EasyID3(filepath)

    artist, album, date = (get_info(tags, key)
                           for key in ('artist', 'album', 'date'))
    tracknumber = get_info(tags, 'tracknumber', '')
    title = get_info(tags, 'title')
    is_compil = get_info(tags, 'compilation') or False

    cleaned = [sanitize(artist), sanitize(album), sanitize(date)]
    cleaned.append(sanitize(trackFormat(tracknumber)))
    cleaned.append(sanitize(title))
    return tuple(cleaned) + (extension, is_compil)
Exemple #5
0
def munch(ifiles, ofiles):
    """Rewrite numeral-related FEAT/LINK/DOM attributes in each input file.

    Every file in ``ifiles`` is parsed with ``ET.parse``; the ``W`` elements
    inside the ``S`` children of the root's last child are modified in
    place in three passes, and the tree is then written to the output path
    paired with the input by ``zip``.

    * Part 1: a word whose first FEAT item is 'PR' and whose LEMMA is in
      ``lemmas_adv`` gets FEAT 'ADV'; its LINK becomes 'advmod' when a LINK
      exists and is not in ``conjrels``.
    * Part 2 (run twice): for a word whose LEMMA is in ``fix`` and whose
      FEAT lacks 'МН', NUM dependents are relabelled 'compound', the word
      may be relabelled 'nummod:gov', and the word may be re-headed under a
      genitive ('РОД') noun child whose LEMMA is in ``lemmas``.
    * Part 3: NUM words are relabelled 'nummod' / 'nummod:gov', and a word
      whose LEMMA is in ``lemmas_compare`` is re-headed under a genitive
      noun child that has no 'PR' dependents.

    Relies on names defined outside this function: ``ET``, ``get_info``,
    ``get_children_attrib``, ``lemmas_adv``, ``conjrels``, ``fix``,
    ``go_away``, ``lemmas`` and ``lemmas_compare``.

    Args:
        ifiles: iterable of input XML paths.
        ofiles: iterable of output paths, paired with ``ifiles`` by ``zip``.

    Returns:
        None.
    """
    # part 1
    # NOTE(review): `count` is incremented below but never read in this
    # function.
    count = 0
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get(
                        'FEAT',
                        'EMPTY').split()[0] == 'PR' and word.attrib.get(
                            'LEMMA', 'EMPTY') in lemmas_adv:
                    count += 1
                    word.attrib['FEAT'] = 'ADV'
                    # Relations listed in conjrels are left untouched.
                    if 'LINK' in word.attrib and word.attrib[
                            'LINK'] not in conjrels:
                        word.attrib['LINK'] = 'advmod'

        # part 2
        # (the code does not cover all cases on the first iteration,
        # therefore this part needs to be repeated twice)
        for r in range(2):
            for sent in root[-1].findall('S'):
                for candidate in sent.findall('W'):
                    if candidate.attrib.get(
                            'LEMMA', 'EMPTY'
                    ) in fix and 'МН' not in candidate.attrib['FEAT']:
                        # Only head_token etc. are unpacked here; none of
                        # the unpacked values is used further in part 2.
                        link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                            candidate, sent)
                        is_root = (candidate.attrib.get('DOM',
                                                        'EMPTY') == '_root')
                        ch = candidate.attrib['ID']
                        parent = candidate.attrib['DOM']
                        away = False
                        # True when the candidate's own LINK is in conjrels
                        # and must therefore not be relabelled.
                        do_not_change = False
                        deprel = ''
                        if 'LINK' in candidate.attrib:
                            deprel = candidate.attrib['LINK']
                        if 'LINK' in candidate.attrib and candidate.attrib[
                                'LINK'] in conjrels:
                            do_not_change = True
                        # Skip the candidate entirely when its head's LEMMA
                        # is in go_away.
                        for token in sent.findall('W'):
                            if token.attrib.get(
                                    'ID',
                                    'EMPTY') == parent and token.attrib.get(
                                        'LEMMA', 'EMPTY') in go_away:
                                away = True
                        if away:
                            continue
                        # NOTE(review): the entries are indexed and assigned
                        # like dicts; presumably they are the live attrib
                        # dicts of the W elements -- confirm against
                        # get_children_attrib.
                        children = get_children_attrib(sent, ch)
                        if len(children) == 0:
                            # No dependents: look at the siblings instead.
                            # If exactly one sibling is NUM, attach it to
                            # the candidate as 'compound'.
                            ch_parent = get_children_attrib(sent, parent)
                            check = 0
                            for elem in ch_parent:
                                if elem['FEAT'].split()[0] == 'NUM':
                                    check += 1
                            if check == 1:
                                for elem in ch_parent:
                                    if elem['FEAT'].split()[0] == 'NUM':
                                        elem['DOM'] = ch
                                        elem['LINK'] = 'compound'
                                        if not do_not_change:
                                            candidate.attrib[
                                                'LINK'] = 'nummod:gov'
                        if len(children) == 1 and children[0]['FEAT'].split(
                        )[0] == 'NUM':  #only one and it is NUM
                            children[0]['LINK'] = 'compound'
                            if not do_not_change:
                                candidate.attrib['LINK'] = 'nummod:gov'
                        if any(child['FEAT'].split()[0] == 'NUM'
                               for child in children
                               ) and len(children) > 1:  # NUM among others
                            if not do_not_change:
                                candidate.attrib['LINK'] = 'nummod:gov'
                            for elem in children:
                                if elem['FEAT'].split()[0] == 'NUM':
                                    elem['LINK'] = 'compound'
                        # A single 'A' dependent: nothing more to do for
                        # this candidate.
                        if len(children) == 1 and children[0]['FEAT'].split(
                        )[0] == 'A':
                            continue
                        if any(child['FEAT'].split()[0] == 'S'
                               for child in children):
                            # Collect genitive noun children whose LEMMA is
                            # in `lemmas`; relabel NUM children on the way.
                            list_of_nouns = []
                            numgov = False
                            for elem in children:
                                genetive = False
                                if 'РОД' in elem['FEAT']:
                                    genetive = True
                                if elem['FEAT'].split()[0] == 'S' and elem[
                                        'LEMMA'] in lemmas and genetive:
                                    list_of_nouns.append(elem)
                                if elem['FEAT'].split()[0] == 'NUM':
                                    elem['LINK'] = 'compound'
                                    numgov = True
                            if len(list_of_nouns) > 0:
                                # The first collected noun becomes the new
                                # head of the candidate.
                                trace = list_of_nouns[0]['ID']
                                for elem in children:
                                    if elem['ID'] == trace:
                                        if is_root:
                                            # The noun takes over the root
                                            # slot; the candidate inherits
                                            # the noun's former LINK.
                                            elem['DOM'] = '_root'
                                            candidate.attrib['LINK'] = elem[
                                                'LINK']
                                            del elem['LINK']
                                        else:
                                            elem['DOM'] = parent
                                            if deprel != '':
                                                elem['LINK'] = deprel
                                            else:
                                                # Debug output: a non-root
                                                # candidate without a LINK.
                                                print(candidate.attrib, elem)
                                        candidate.attrib['DOM'] = elem['ID']
                                        if numgov:
                                            if not do_not_change:
                                                candidate.attrib[
                                                    'LINK'] = 'nummod:gov'
                                        # Move the remaining non-NUM children
                                        # under the new head. This inner loop
                                        # reuses the name `elem` of the
                                        # enclosing loop.
                                        for elem in children:
                                            if elem['ID'] != trace and elem[
                                                    'FEAT'].split(
                                                    )[0] != 'NUM':
                                                elem['DOM'] = trace

        # part 3
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                    word, sent)
                if link not in ['предик'
                                ] + conjrels and word.attrib['DOM'] != '_root':
                    if 'NUM' in pos and word.attrib['LEMMA'] == 'один':  # the lemma 'один'
                        word.attrib['LINK'] = 'nummod'
                    # NOTE(review): this check also fires for the lemma
                    # handled just above and can overwrite its 'nummod'.
                    if link != 'EMPTY' and 'NUM' in pos:
                        if 'ВИН' in feats or 'ИМ' in feats:
                            word.attrib['LINK'] = 'nummod:gov'
                        else:
                            word.attrib['LINK'] = 'nummod'
                    if word.attrib.get('LEMMA', 'EMPTY') in lemmas_compare:
                        children = get_children_attrib(sent, word.attrib['ID'])
                        for elem in children:
                            if 'FEAT' in elem and elem['FEAT'].startswith(
                                    'S ') and 'РОД' in elem['FEAT']:
                                grandchildren = get_children_attrib(
                                    sent, elem['ID'])
                                # Swap head and dependent only when the noun
                                # has no dependent with 'PR' in its FEAT.
                                if not any('PR' in grchild['FEAT']
                                           for grchild in grandchildren):
                                    elem['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = elem['ID']
                                    word.attrib['LINK'] = 'nummod:gov'
        tree.write(ofname, encoding="UTF-8")
    return
def munch(ifiles, ofiles):
    """
    Re-attach words tagged 'PR' in every file of the ifiles list.

    Each input file is parsed with ``ET.parse``. For every ``W`` whose
    first FEAT item is 'PR' (inside the ``S`` children of the root's last
    child) the DOM/LINK attributes of the word and of its dependents are
    rewritten in place:

    * without dependents (and with a head whose POS is not 'CONJ') the word
      gets ``relation(head_pos)`` as LINK; the lemma 'за' is additionally
      re-attached one level up;
    * with dependents, the first dependent that has LINK 'предл' and a
      FEAT head in ``safe + ['V']`` becomes the new head of the word; when
      no such dependent exists a chain of special cases is tried and,
      failing those, debug information is printed.

    The tree is written to the output path paired with the input by
    ``zip``. Relies on ``ET``, ``get_info``, ``get_children_attrib``,
    ``relation`` and ``safe`` from outside this function.

    Returns None.
    """
    # NOTE(review): temp_info is never used in this function.
    temp_info = []
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('FEAT', 'EMPTY').split()[0] == 'PR':
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                        word, sent)
                    # NOTE(review): entries are used like dicts (indexing,
                    # .get, .pop); presumably the live attrib dicts of the
                    # dependents -- confirm against get_children_attrib.
                    children = get_children_attrib(sent, word.attrib['ID'])
                    if children == [] and head_pos != 'CONJ':  #CONJ is converted during syntax phase

                        # only 'за' is converted here
                        if word.attrib['LEMMA'] == 'за':
                            if head_token.attrib['DOM'] == '_root':
                                # small fix for one sentence
                                # (hard-coded head ID)
                                word.attrib['DOM'] = '3'
                            else:
                                word.attrib['DOM'] = head_token.attrib['DOM']

                        new_dep = relation(head_pos)
                        word.attrib['LINK'] = new_dep
                    elif len(children) >= 1:
                        # Decide whether a dependent may be promoted to head.
                        if any(ch['LINK'] == 'предл' for ch in children):
                            new_head_found = True

                        elif any(ch['LINK'] == 'сочин' for ch in children):
                            # Coordination: gather the coordinated 'PR'
                            # items, either directly or below a 'CONJ'.
                            list_of_coord_candidates = []
                            candidate_coord = [
                                ch for ch in children if ch['LINK'] == 'сочин'
                            ]
                            for cand in candidate_coord:
                                if cand['FEAT'] == 'PR':
                                    list_of_coord_candidates.append(cand)
                                elif cand['FEAT'] == 'CONJ':
                                    sub_children = get_children_attrib(
                                        sent, cand['ID'])
                                    check_pr = [
                                        ch for ch in sub_children
                                        if ch['FEAT'] == 'PR'
                                    ]
                                    list_of_coord_candidates += check_pr
                            # True when every gathered item has a dependent
                            # with LINK 'предл' (also True for an empty
                            # list).
                            new_head_found = all(
                                any(elem['LINK'] == 'предл'
                                    for elem in get_children_attrib(
                                        sent, item['ID']))
                                for item in list_of_coord_candidates)
                        else:
                            new_head_found = True

                        if new_head_found:
                            for child in children:
                                if child['FEAT'].split(
                                )[0] in safe + ['V'] and child.get(
                                        'LINK', 'EMPTY') == 'предл':

                                    # Swap: the child takes the word's head,
                                    # the word hangs off the child.
                                    child['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = child['ID']

                                    if child['DOM'] == '_root':
                                        child.pop('LINK')
                                    else:
                                        child['LINK'] = word.attrib['LINK']
                                    word.attrib['LINK'] = relation(
                                        child['FEAT'].split()[0])

                                    # The other dependents follow the new
                                    # head.
                                    for elem in children:
                                        if elem['ID'] != child['ID']:
                                            elem['DOM'] = child['ID']
                                    break
                            else:
                                # Reached only when the loop above did not
                                # break, i.e. no dependent was promoted.
                                if any(elem['LINK'] == 'сочин'
                                       for elem in children):
                                    continue
                                elif len(children
                                         ) == 1 and children[0]['LINK'] in [
                                             'разъяснит', 'огранич'
                                         ]:
                                    continue
                                elif len(children) == 1 and children[0][
                                        'FEAT'] == 'CONJ МЕТА':
                                    # `child` is the loop variable left over
                                    # from the finished for-loop; with a
                                    # single dependent it is children[0].
                                    child['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = child['ID']
                                    child['LINK'] = word.attrib['LINK']
                                    word.attrib['LINK'] = 'case'

                                elif len(children) >= 1 and any(
                                        elem['LINK'] == 'предик'
                                        for elem in children):
                                    continue
                                elif word.text == 'кроме' and children[0][
                                        'LEMMA'] == 'как':
                                    sub_ch = get_children_attrib(
                                        sent, children[0]['ID'])
                                    children[0]['LINK'] = 'fixed'
                                    sub_ch[0]['DOM'] = children[0]['DOM']

                                elif children[0]['LEMMA'] == 'минус':
                                    # The first grandchild becomes the head;
                                    # the word and its dependent hang off it.
                                    sub_ch = get_children_attrib(
                                        sent, children[0]['ID'])
                                    sub_ch[0]['DOM'] = word.attrib['DOM']
                                    sub_ch[0]['LINK'] = word.attrib['LINK']
                                    word.attrib['DOM'] = sub_ch[0]['ID']
                                    word.attrib['LINK'] = 'case'
                                    children[0]['DOM'] = sub_ch[0]['ID']
                                    children[0]['FEAT'] = 'S ЕД МУЖ ИМ НЕОД'
                                    children[0]['LINK'] = 'nmod'

                                elif word.text == 'вроде' and children[0][
                                        'LEMMA'] == 'при':
                                    children[0]['LINK'] = 'об-аппоз'
                                elif word.text == 'Около' and children[0][
                                        'LEMMA'] == 'назад':
                                    # Same re-heading as the 'минус' case,
                                    # with the dependent retagged as ADV.
                                    sub_ch = get_children_attrib(
                                        sent, children[0]['ID'])
                                    sub_ch[0]['DOM'] = word.attrib['DOM']
                                    sub_ch[0]['LINK'] = word.attrib['LINK']
                                    word.attrib['DOM'] = sub_ch[0]['ID']
                                    word.attrib['LINK'] = 'case'
                                    children[0]['DOM'] = sub_ch[0]['ID']
                                    children[0]['FEAT'] = 'ADV'
                                    children[0]['LINK'] = 'advmod'
                                else:
                                    # These prints are for debugging and are
                                    # normally silent; if they fire,
                                    # something went wrong. They dump the
                                    # word, its dependents and the whole
                                    # sentence.
                                    print(word.attrib.get('ID', 'EMPTY'),
                                          word.text,
                                          word.attrib.get('FEAT', 'EMPTY'))
                                    print(*[(ch.get('ID', 'EMPTY'),
                                             ch.get('LEMMA', 'EMPTY'),
                                             ch.get('FEAT', 'EMPTY'),
                                             ch.get('LINK', 'EMPTY'))
                                            for ch in children],
                                          sep=' ')
                                    print('+' * 20)
                                    print(*[
                                        (token.attrib.get('ID',
                                                          'EMPTY'), token.text,
                                         token.attrib.get('DOM', 'EMPTY'),
                                         token.attrib.get('FEAT', 'EMPTY'),
                                         token.attrib.get('LINK',
                                                          'EMPTY'), token.tail)
                                        for token in sent
                                    ],
                                          sep='\n')
                                    print('*' * 20)
                        else:
                            continue

        tree.write(ofname, encoding="UTF-8")
import re
import sys
from subprocess import run

import bs4
import pyminizip

from util import (clean_tags, get_arg, get_function, get_info, get_soup,
                  set_info, to_md)

# NOTE(review): presumably parses the command line, with the string used as
# the program description -- confirm against util.get_arg.
arg = get_arg('Genera un epub de un programa electoral')

# Matches the run of spaces at the end of every line (MULTILINE).
re_rtrim = re.compile(r" +$", re.MULTILINE)
# Matches leading whitespace that ends in one or more newlines, anchored at
# the start of the text.
re_ltrim = re.compile(r"^\s*\n+")

# NOTE(review): presumably the project's info/metadata object -- confirm
# against util.get_info.
yml = get_info(autocomplete=True)

# Module-level flag read by fprint to decide whether leading blank lines of
# the next text are stripped.
isLastLineBlank = False


def fprint(txt, *args, re_clean=None, **kargs):
    """Clean up ``txt`` before output.

    Only the clean-up steps are visible in this snippet; whatever the
    function does with the cleaned text, and with ``args`` / ``kargs``, is
    not shown here.

    Args:
        txt: text to clean; a ``bs4.Tag`` is first converted with ``to_md``.
        re_clean: optional compiled pattern whose matches are removed from
            the text.
    """
    global isLastLineBlank
    if isinstance(txt, bs4.Tag):
        txt = to_md(txt)
    if re_clean is not None:
        txt = re_clean.sub("", txt)
    # Drop trailing spaces on every line.
    txt = re_rtrim.sub("", txt)
    if isLastLineBlank:
        # The module flag says the previous line was blank: strip leading
        # blank lines, and stop if nothing is left.
        txt = re_ltrim.sub("", txt)
        if len(txt) == 0:
            return
    wubi_dictionary = wf.read()

# Walk every word-list file in input_dir. Each line is expected to start
# with "<index> <word>"; the word is looked up through util.get_info and
# each of its characters is looked up in the wubi dictionary text.
input_files = os.listdir(input_dir)
for input_file in input_files:
    # The first digit 1-7 in the file name is taken as the HSK level.
    hsk_level = re.search(r"[1-7]",
                          input_file).group()  # use 7 for 7 through 9
    with open("{}{}".format(input_dir, input_file), "r",
              encoding="utf-8-sig") as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(r"(\d+) ([^(|\s]+)", line)
            index = match.group(1)
            word = match.group(2)

            logging.info("Getting info for '{}'.".format(word))
            word_info = util.get_info(word)
            word_info["hsk_level"] = hsk_level

            # find the wubi strokes for each character
            wubi = []
            for char in word:
                # this pops up a couple times, ignore it (for now)
                try:
                    keys = re.search(r"^{}\t([a-z]+)$".format(char),
                                     wubi_dictionary, re.MULTILINE).group(1)
                except (AttributeError, re.error):
                    # AttributeError: no dictionary line for this character
                    # (re.search returned None). re.error: the character is
                    # not a valid pattern on its own.
                    logging.warning(
                        "Unable to find wubi strokes for '{}', skipping.".
                        format(char))
                    # Was the bare expression `next`, which does nothing;
                    # skipping the character needs `continue`.
                    continue
Exemple #9
0
                v = True
            info.pdf[k] = v

    fecha = None
    for k, v in info.pdf.items():
        if v and k.endswith("Date"):
            d = datetime.strptime(v, "%a %b %d %H:%M:%S %Y %Z")
            if fecha is None or d < fecha:
                fecha = d
    if fecha:
        info.fecha = fecha.date()  # .strftime('%Y-%m-%d')


# One (info.yml path, containing directory, parsed info) triple for every
# "*/info.yml" found relative to the current working directory.
indices = []
for c in glob("*/info.yml"):
    indices.append((c, os.path.dirname(c), get_info(c, autocomplete=False)))

# Process the entries in sorted order (tuples sort by the info.yml path
# first).
for path_info, codigo, info in sorted(indices):
    print("Descargando %s: %s" % (codigo, info.url))
    # Working directory "<codigo>/wks"; created if missing.
    pth = codigo+"/wks"
    os.makedirs(pth, exist_ok=True)
    book = "book"
    out = pth + "/" + book

    # Output paths that share the "<codigo>/wks/book" stem.
    pdf = out + ".pdf"
    xml = out + ".xml"
    htm = out + ".html"
    flag = False

    if info.url.endswith(".pdf"):
        if not os.path.isfile(pdf):
Exemple #10
0
def munch(ifiles, ofiles):
    """
    Process all files in ifiles list.
    Output into ofiles list.
    """
    dict_of_fixed = get_fixed_rel()
    dict_of_lemmas = get_verb_lemmas()
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for k, sentence in enumerate(root[-1].findall('S')):
            for j, token in enumerate(sentence.findall('W')):
                if (
                        token.attrib['LEMMA'], token.attrib['FEAT']
                ) in dict_of_fixed and token.text != 'FANTOM':  # поменять сам токен
                    current_position = int(token.attrib['ID'])
                    shift_num = len(dict_of_fixed[token.attrib['LEMMA'],
                                                  token.attrib['FEAT']][0]) - 1
                    shift_position = current_position - 1
                    for tok in sentence.findall('W'):
                        if '.' not in str(tok.attrib['ID']):
                            if int(tok.attrib['ID']) > current_position:
                                tok.attrib['ID'] = str(
                                    int(tok.attrib['ID']) + shift_num)
                        else:
                            if float(tok.attrib['ID']) > float(
                                    current_position):
                                tok.attrib['ID'] = str(
                                    round(
                                        float(tok.attrib['ID']) +
                                        float(shift_num), 1))

                        if '.' not in str(tok.attrib['DOM']):
                            if tok.attrib['DOM'] != '_root':
                                if int(tok.attrib['DOM']) > current_position:
                                    tok.attrib['DOM'] = str(
                                        int(tok.attrib['DOM']) + shift_num)
                        else:
                            if float(tok.attrib['DOM']) > float(
                                    current_position):
                                tok.attrib['DOM'] = str(
                                    round(
                                        float(tok.attrib['DOM']) +
                                        float(shift_num), 1))

                        first_position = ''
                        if tok.attrib['ENH'].startswith('E:E'):
                            first_position = 'E:E'
                            change_enh = tok.attrib['ENH'][3:].split(':')[0]
                        elif tok.attrib['ENH'].startswith('E'):
                            first_position = 'E'
                            change_enh = tok.attrib['ENH'][1:].split(':')[0]
                        else:
                            change_enh = tok.attrib['ENH'].split(':')[0]

                        if '.' not in str(change_enh):
                            #print(change_enh)
                            if int(change_enh) > current_position:
                                change_enh = int(change_enh) + shift_num
                                tok.attrib['ENH'] = first_position + str(
                                    change_enh
                                ) + ':' + tok.attrib['ENH'].split(':')[1]
                        else:
                            if float(change_enh) > float(current_position):
                                change_enh = round(
                                    float(change_enh) + float(shift_num), 1)
                                tok.attrib['ENH'] = first_position + str(
                                    change_enh
                                ) + ':' + tok.attrib['ENH'].split(':')[1]

                    if 'LINK' in token.attrib:
                        temp_rel = token.attrib['LINK']  # а если нет link
                    else:
                        temp_rel = '_root'
                    temp_dom = token.attrib['DOM']
                    temp_text = token.text.replace('.', '. ').split()[0]
                    temp_tail = token.tail

                    no_dot = (j == len(sentence.findall('W')) - 1)
                    if not no_dot and temp_tail.startswith('.'):
                        temp_tail = temp_tail.lstrip('.')
                    sentence.remove(token)

                    starting_position = current_position

                    for i, elem in enumerate(
                            dict_of_fixed[token.attrib['LEMMA'],
                                          token.attrib['FEAT']][0]):
                        tag = ET.fromstring('<W></W>')
                        tag.attrib['ID'] = str(current_position)
                        tag.attrib['LEMMA'] = elem[1]
                        tag.attrib['OLD'] = 'EMPTY'
                        tag.attrib['FEAT'] = elem[2]

                        head_position = dict_of_fixed[token.attrib['LEMMA'],
                                                      token.attrib['FEAT']][1]
                        if i == head_position:
                            # this is the head token of the group
                            tag.attrib['DOM'] = str(temp_dom)
                            if elem[3] == '%':
                                tag.attrib['LINK'] = temp_rel
                            else:
                                tag.attrib['LINK'] = elem[3]
                        else:
                            tag.attrib['LINK'] = elem[3]
                            tag.attrib['DOM'] = str(starting_position +
                                                    head_position)

                        # if DOM happened to become _root, remove LINK
                        if tag.attrib['DOM'] == '_root':
                            del tag.attrib['LINK']

                        if i == 0:
                            tag.text = temp_text
                        else:
                            tag.text = elem[0]

                        if i == len(
                                dict_of_fixed[token.attrib['LEMMA'],
                                              token.attrib['FEAT']][0]) - 1:
                            tag.tail = temp_tail
                            if no_dot:
                                tag.text = tag.text.rstrip('.')
                                tag.attrib['LEMMA'] = tag.attrib[
                                    'LEMMA'].strip('.')
                        else:
                            tag.tail = ' \n'

                        if 'LINK' not in tag.attrib and tag.attrib[
                                'DOM'] == '_root':
                            tag.attrib['ENH'] = '0:root'
                        else:
                            tag.attrib['ENH'] = str(
                                tag.attrib['DOM']) + ':' + tag.attrib['LINK']

                        sentence.insert(shift_position, tag)
                        current_position += 1
                        shift_position += 1

            sorted_tokens = sorted(
                sentence.findall('W'),
                key=lambda x: float(x.attrib.get('ID', '100500')))
            while len(sentence.findall('W')) != 0:
                sentence.remove(sentence.findall('W')[-1])
            while len(sentence.findall('LF')) != 0:
                sentence.remove(sentence.findall('LF')[-1])
            for token in sorted_tokens:
                sentence.append(token)

        for sentence in root[-1].findall('S'):
            for i, token in enumerate(sentence.findall('W')):
                if i == 0:
                    token.text = token.text[0].upper() + token.text[1:]
                token.attrib['LEMMA'] = token.attrib['LEMMA'].replace('|', ',')
                if '|' in token.text:
                    token.text = token.text.replace('|', ',')
                if token.attrib['LEMMA'].endswith('-знак'):
                    token.attrib['LEMMA'] = token.text
                    token.attrib['FEAT'] = 'SYM'

        # small fixes from Olga 04.04.2018
        for sentence in root[-1].findall('S'):
            for i, token in enumerate(sentence.findall('W')):
                if token.attrib['LEMMA'] in glue:
                    if ifname.split('/')[
                            -1] == '2011Petrushka.xml' and sentence.attrib[
                                'ID'] == '141':
                        sentence.findall('W')[39].attrib['DOM'] = '46'
                        sentence.findall('W')[40].attrib['DOM'] = '40'
                        sentence.findall('W')[40].attrib['LINK'] = 'fixed'
                        sentence.findall('W')[41].attrib['DOM'] = '46'
                        for h in range(39, 42):
                            sentence.findall(
                                'W')[h].attrib['ENH'] = sentence.findall('W')[
                                    h].attrib['DOM'] + ':' + sentence.findall(
                                        'W')[h].attrib['LINK']
                    elif len(sentence.findall('W')) > i+1 and \
                       sentence.findall('W')[i+1].attrib['LEMMA'] == 'бы' and \
                       sentence.findall('W')[i+1].attrib['LINK'] != 'fixed':

                        sentence.findall('W')[i + 1].attrib['LINK'] = 'fixed'
                        if token.attrib['DOM'] == sentence.findall('W')[
                                i + 1].attrib['ID']:
                            token.attrib['DOM'] = sentence.findall('W')[
                                i + 1].attrib['DOM']
                            token.attrib['ENH'] = ':'.join(
                                (token.attrib['DOM'], token.attrib['LINK']))

                        sentence.findall('W')[i + 1].attrib[
                            'DOM'] = sentence.findall('W')[i].attrib['ID']
                        sentence.findall('W')[i + 1].attrib['ENH'] = ':'.join(
                            (sentence.findall('W')[i + 1].attrib['DOM'],
                             sentence.findall('W')[i + 1].attrib['LINK']))

                    elif len(sentence.findall('W')) > i+1 and \
                       sentence.findall('W')[i+1].attrib['LEMMA'] == 'бы' and \
                       sentence.findall('W')[i+1].attrib['DOM'] != sentence.findall('W')[i].attrib['ID']:
                        print(token.attrib['ID'])
                        print(*[(token.attrib['ID'], token.text,
                                 token.attrib['DOM'],
                                 token.attrib.get('LINK', 'EMPTY'), token.tail)
                                for token in sentence],
                              sep='\n')
                        print('*' * 20)
                elif token.attrib['LEMMA'] == 'второе' and (
                        'ADJ' in token.attrib['FEAT']
                        or 'nmod' in token.attrib.get('LINK', 'EMPTY')):

                    token.attrib['LEMMA'] = 'второй'
                    token.attrib['FEAT'] = token.attrib['FEAT'].replace(
                        'Animacy=Inan|', '').replace('NOUN', 'ADJ')
                    if token.attrib.get('LINK', 'EMPTY') == 'nmod':
                        token.attrib['LINK'] = 'amod'
                        token.attrib['ENH'] = token.attrib['ENH'].replace(
                            'nmod', 'amod')

                elif token.attrib[
                        'LEMMA'] == 'вооружать' and 'ADJ' in token.attrib[
                            'FEAT']:
                    token.attrib['LEMMA'] = 'вооруженный'
                elif token.attrib[
                        'LEMMA'] == 'весь' and 'PRON' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = ' '.join(
                        ['DET'] + token.attrib['FEAT'].split()[1:])
                elif token.attrib[
                        'LEMMA'] == 'главное' and 'ADV' in token.attrib['FEAT']:
                    token.attrib[
                        'FEAT'] = 'NOUN Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing'
                elif token.attrib[
                        'LEMMA'] == 'дома' and 'NOUN' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = 'дом'
                elif token.attrib[
                        'LEMMA'] == 'звонок' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = token.attrib['FEAT'].replace(
                        'Degree=Pos', 'Animacy=Inan|Case=Gen').replace(
                            'ADJ', 'NOUN').replace('|Variant=Short', '')
                elif token.attrib[
                        'LEMMA'] == 'многие' and 'ADJ' in token.attrib['FEAT']:
                    token.attrib['FEAT'] = 'NUM'
                    token.attrib['LEMMA'] = 'много'
                elif token.attrib[
                        'LEMMA'] == 'легкий' and 'NOUN' in token.attrib['FEAT']:
                    token.attrib['LEMMA'] = 'легкие'
                elif token.attrib['LEMMA'] == 'плюс' and 'SYM' in token.attrib[
                        'FEAT']:
                    token.attrib['LEMMA'] = '+'
                elif token.attrib['LEMMA'] == 'ли':
                    if token.attrib['LINK'] == 'conj':
                        token.attrib['LINK'] = 'advmod'
                    if token.attrib['LINK'] == 'discourse':
                        token.attrib['LINK'] = 'fixed'
                    token.attrib['ENH'] = token.attrib['ENH'].split(
                        ':')[0] + ':' + token.attrib['LINK']
                elif token.attrib['LEMMA'] == 'значит':
                    token.attrib['LINK'] = 'discourse'
                    token.attrib['ENH'] = token.attrib['ENH'].split(
                        ':')[0] + ':' + token.attrib['LINK']
                elif token.attrib['LEMMA'] == 'один' and 'ADJ' in token.attrib[
                        'FEAT']:
                    token.attrib['FEAT'] = ' '.join(
                        ['DET'] + token.attrib['FEAT'].split()[1:])

        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):
                if token.attrib[
                        'LEMMA'] in dict_of_lemmas and 'Aspect=Perf' in token.attrib[
                            'FEAT']:
                    token.attrib['LEMMA'] = dict_of_lemmas[
                        token.attrib['LEMMA']]

        #this is test for debug purposes:
        #for sent in root[-1].findall('S'):

        #for wt in sent.findall('W'):
        #   link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(wt, sent)
        #   if wt.attrib.get('LINK') == 'acl:relcl' and head_token.attrib.get('LINK') == 'obl':
        #      print(wt.attrib['ID'], ifname, sent.attrib['ID'])

        # Change lemma capitalisation
        for sent in root[-1].findall('S'):

            for word in sent.findall('W'):
                link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                    word, sent)
                if pos == 'PROPN' and word.attrib['ID'] == '1':

                    if word.attrib['LEMMA'] not in {
                            'формула', 'чижик', 's', 'ps', 'f**k', 'да', 'ох'
                    }:
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    elif word.attrib['LEMMA'] == 'чижик':
                        word.attrib['FEAT'] = word.attrib['FEAT'].replace(
                            'PROPN', 'NOUN')
                    elif word.attrib['LEMMA'] == 'ох':
                        word.attrib['FEAT'] = 'PART'
                    elif word.attrib['LEMMA'] in {'s', 'ps', 'f**k', 'да'}:
                        word.attrib['FEAT'] = word.attrib['FEAT'].replace(
                            'PROPN', 'X')
                    elif word.attrib['LEMMA'] == 'формула':
                        if sent[1].text == '1':
                            #sent[1].attrib['LINK'] = 'fixed' maybe later for all occurances
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                        else:
                            word.attrib['FEAT'] = word.attrib['FEAT'].replace(
                                'PROPN', 'NOUN')
                elif pos == 'PROPN' and word.attrib['ID'] != '1':
                    if word.text.isupper():
                        word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                    else:
                        word.attrib['LEMMA'] = word.attrib['LEMMA'].title()

                if word.text.istitle(
                ) and pos != 'PROPN' and word.attrib['ID'] != '1':
                    if word.text not in uncertain:
                        word.attrib['FEAT'] = 'PROPN' + (
                            word.attrib['FEAT'] + '\t').split('\t',
                                                              maxsplit=1)[1]
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    elif word.text in uncertain and word.attrib['LEMMA'].lower(
                    ) in certain:
                        word.attrib['FEAT'] = 'PROPN' + (
                            word.attrib['FEAT'] + '\t').split('\t',
                                                              maxsplit=1)[1]
                        if word.text.isupper():
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].upper()
                        else:
                            word.attrib['LEMMA'] = word.attrib['LEMMA'].title()
                    else:
                        pass  # TODO: сделать ветку для неразобранных

        tree.write(ofname, encoding="utf-8")
    return
def _collect_nid_chains(sent):
    """Return the maximal runs of consecutive 'W' tokens of *sent* tagged 'NID'."""
    chains = []
    current = []
    for word in sent.findall('W'):
        if 'NID' in word.attrib.get('FEAT', 'EMPTY').split():
            current.append(word)
        elif current:
            chains.append(current)
            current = []
    if current:
        chains.append(current)
    return chains


def _follow_dom_path(start, chain):
    """Return [start, dependent, dependent-of-dependent, ...] found inside *chain*.

    At every step the first element of *chain* whose DOM equals the ID of the
    current path end is appended; the walk stops when no such element exists.
    """
    path = [start]
    head_id = start.attrib['ID']
    while True:
        for elem in chain:
            if elem.attrib['DOM'] == head_id:
                path.append(elem)
                head_id = elem.attrib['ID']
                break
        else:
            return path


def main(ifname_list, ofname_list):
    """Re-tag proper nouns, NID chains and a few pronouns in every input file.

    Args:
        ifname_list: paths of the XML files to read.
        ofname_list: paths the processed trees are written to; paired with
            ``ifname_list`` via ``zip`` (extra entries on either side are
            ignored).

    The function first scans *all* inputs to count, per lemma, how often it is
    tagged PROPN and how often it is not. Then, file by file, it:

    1. turns title-cased NOUN/ADJ tokens into PROPN when their lemma is tagged
       PROPN more often than not across the corpus;
    2. groups consecutive 'NID' tokens into chains and hands them to
       ``revertLink`` / ``assign``;
    3. re-tags the lemmas 'все', 'это', 'то' as PRON when ``get_info`` reports
       them as PROPN or NOUN;
    4. writes the tree to the matching output path.
    """
    # Corpus pass 1: how often is each lemma tagged PROPN?
    proper_detected = defaultdict(int)
    for ifname, _ in zip(ifname_list, ofname_list):
        root = ET.parse(ifname).getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if 'PROPN' in word.attrib.get('FEAT', 'EMPTY').split():
                    proper_detected[word.attrib['LEMMA']] += 1

    # Corpus pass 2: how often do those same lemmas appear WITHOUT PROPN?
    not_proper_detected = defaultdict(int)
    for ifname, _ in zip(ifname_list, ofname_list):
        root = ET.parse(ifname).getroot()
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                lemma = word.attrib['LEMMA']
                if lemma in proper_detected:
                    if 'PROPN' not in word.attrib.get('FEAT', 'EMPTY').split():
                        not_proper_detected[lemma] += 1

    for ifname, ofname in zip(ifname_list, ofname_list):
        tree = ET.parse(ifname)
        root = tree.getroot()

        # Promote title-cased NOUN/ADJ tokens to PROPN by majority vote.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                feats = word.attrib.get('FEAT', 'EMPTY').split()
                if (word.text is not None and word.text.istitle()
                        and ('NOUN' in feats or 'ADJ' in feats)
                        and word.attrib['LEMMA'] in proper_detected
                        and proper_detected[word.attrib['LEMMA']] >
                        not_proper_detected[word.attrib['LEMMA']]):
                    feats[0] = 'PROPN'
                    word.attrib['FEAT'] = ' '.join(feats)

        # Re-link and re-tag chains of consecutive NID tokens.
        for sent in root[-1].findall('S'):
            for nid_chain in _collect_nid_chains(sent):
                if len(nid_chain) == 1:
                    assign(nid_chain)
                    continue

                ids = [elem.attrib['ID'] for elem in nid_chain]
                # Candidates are chain members whose head lies outside the chain.
                candidates = [
                    elem for elem in nid_chain
                    if elem.attrib['DOM'] not in ids
                ]
                if len(candidates) == 1:
                    assign(revertLink(candidates[0].attrib['DOM'], nid_chain))
                    continue

                for item in candidates:
                    # Always search the ORIGINAL chain: the previous version
                    # rebound the chain variable to revertLink's result, so
                    # every candidate after the first looked for its
                    # dependents in the wrong list.
                    sub_chain = _follow_dom_path(item, nid_chain)
                    if len(sub_chain) > 1:
                        assign(revertLink(item.attrib['DOM'], sub_chain))
                    else:
                        assign(sub_chain)

        # 'все' / 'это' / 'то' analysed as nouns are really pronouns.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                pos = get_info(word, sent)[1]
                if word.attrib.get('LEMMA', 'EMPTY') in [
                        'все', 'это', 'то'
                ] and pos in ['PROPN', 'NOUN']:
                    feats_temp = word.attrib['FEAT'].split(' ')
                    # Keep the first feature group when there is one; a bare
                    # POS tag used to raise IndexError here.
                    word.attrib['FEAT'] = ' '.join(['PRON'] + feats_temp[1:2])

        tree.write(ofname, encoding="UTF-8")
Exemple #12
0
# Hand-made corrections applied when an elided ('FANTOM') 'не' token has no
# children: {sentence ID: {token ID: attributes to overwrite}}.
# NOTE(review): the sentence IDs are not tied to a file name — confirm they
# cannot collide across input files.
_FANTOM_LEAF_PATCHES = {
    '217': {
        '11': {'DEL': 'YES'},
        '12': {'LEMMA': 'нечего'},
    },
    '94': {
        '11': {'DEL': 'YES'},
    },
    '169': {
        '6': {'DOM': '14'},
        '9': {'DEL': 'YES'},
        '10': {'LEMMA': 'некого'},
        '11': {'DOM': '13'},
        '12': {'DEL': 'YES'},
        '13': {'LEMMA': 'негде', 'DOM': '10'},
    },
}


def _infinitive_grandchildren(sentence, children):
    """Return the children of the first 'VerbForm=Inf' child, or [] if none."""
    for child in children:
        if 'VerbForm=Inf' in child.attrib['FEAT']:
            return get_children(sentence, child.attrib['ID'])
    return []


def _has_no_adp_child(sentence, item):
    """Return True when no direct child of *item* has ADP as its POS tag."""
    return all(kid.attrib['FEAT'].split()[0] != 'ADP'
               for kid in get_children(sentence, item.attrib['ID']))


def _redirect_dependents(sentence, old_id, new_id):
    """Re-point every 'W' token whose DOM equals *old_id* to *new_id*."""
    for renum in sentence.findall('W'):
        if renum.attrib['DOM'] == old_id:
            renum.attrib['DOM'] = new_id


def _promote_to_root(sentence, token, node):
    """Make *node* the sentence root in place of *token* and mark *token* for deletion."""
    node.attrib['LEMMA'] = suspicious[node.attrib['LEMMA']]
    node.attrib['DOM'] = '_root'
    del node.attrib['LINK']
    token.attrib['DEL'] = 'YES'
    _redirect_dependents(sentence, token.attrib['ID'], node.attrib['ID'])


def _absorb_pronoun(sentence, token, children, join_text):
    """Fold the first 'suspicious' grandchild pronoun into the overt 'не' *token*.

    Only the first grandchild (under the first infinitive child) whose lemma
    is in ``suspicious`` is considered, and it is absorbed only if it governs
    no ADP. When *join_text* is true the surface forms are concatenated too.
    """
    for item in _infinitive_grandchildren(sentence, children):
        if item.attrib['LEMMA'] in suspicious:
            if _has_no_adp_child(sentence, item):
                token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                if join_text:
                    token.text = token.text + item.text
                token.attrib['FEAT'] = item.attrib['FEAT']
                item.attrib['DEL'] = 'YES'
            break


def munch(ifiles, ofiles):
    """
    Process all files in ifiles list.
    Output into ofiles list.

    For every input/output pair the tree is parsed, modified in place and
    written to the output path. The modifications are:

    * step 0: verbal 'не' tokens are merged with a dependent pronoun whose
      lemma is a key of ``suspicious`` (the merged lemma is the mapped value);
      the token made redundant is marked with a ``DEL`` attribute;
    * steps 2-4: tokens are renumbered so that IDs stay consecutive once the
      ``DEL``-marked tokens are dropped, DOM references are remapped, and the
      marked tokens are removed;
    * finally 'бы' / 'б' / 'чтобы' / 'чтоб' whose head lemma is not in
      ``forbidden_head`` get ' Mood=Cnd' appended (SCONJ/PART) or lose
      ' Foreign=Yes' (any other POS).
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            # step 0: detect and re-annotate 'не'
            for token in sentence.findall('W'):
                if token.attrib['LEMMA'] != 'не' or 'VERB' not in token.attrib[
                        'FEAT']:
                    continue
                head_token = get_info(token, sentence)[3]
                children = get_children(sentence, token.attrib['ID'])
                fantom_child = any(ch.text == 'FANTOM' for ch in children)

                if token.text != 'FANTOM':
                    # Overt 'не': absorb the pronoun. The surface form is only
                    # concatenated when no child is an elided token. A missing
                    # infinitive child now means "nothing to do" instead of
                    # reusing a stale grandchildren list (or a NameError).
                    _absorb_pronoun(sentence, token, children,
                                    join_text=not fantom_child)

                elif children == []:
                    # Elided 'не' without dependents: hand-made corrections.
                    patches = _FANTOM_LEAF_PATCHES.get(sentence.attrib['ID'])
                    if patches:
                        for elem in sentence.findall('W'):
                            updates = patches.get(elem.attrib['ID'])
                            if updates:
                                elem.attrib.update(updates)

                elif fantom_child:
                    # Elided 'не' with an elided child: hard-coded token IDs.
                    for elem in sentence.findall('W'):
                        if elem.attrib['ID'] == '11':
                            elem.attrib['DEL'] = 'YES'
                        if elem.attrib['ID'] == '2':
                            elem.attrib['LEMMA'] = suspicious[
                                elem.attrib['LEMMA']]
                            elem.attrib['DOM'] = '_root'
                            del elem.attrib['LINK']
                        if elem.attrib['DOM'] == '1':
                            # Was `==` (a no-op comparison); dependents of
                            # token 1 must be re-attached to the new root.
                            elem.attrib['DOM'] = '2'

                else:
                    # Elided 'не' whose children are all overt tokens.
                    if all('VerbForm=Inf' not in ch.attrib['FEAT']
                           for ch in children
                           ) and sentence.attrib['ID'] == '440':
                        for elem in sentence.findall('W'):
                            if elem.attrib['ID'] == '16':
                                elem.attrib['DOM'] = '18'
                            if elem.attrib['ID'] == '17':
                                elem.attrib['DEL'] = 'YES'
                            if elem.attrib['ID'] == '18':
                                elem.attrib['LEMMA'] = suspicious[
                                    elem.attrib['LEMMA']]

                    for elem in children:
                        if 'VerbForm=Inf' not in elem.attrib['FEAT']:
                            continue
                        gr_children = get_children(sentence,
                                                   elem.attrib['ID'])
                        if head_token is None:
                            # The elided 'не' is the root: promote the pronoun
                            # and drop the elided token.
                            for item in gr_children:
                                if item.attrib['LEMMA'] in suspicious:
                                    if _has_no_adp_child(sentence, item):
                                        _promote_to_root(sentence, token, item)
                                    break
                            else:
                                # No pronoun under the infinitive: look among
                                # the direct children instead.
                                for broken in children:
                                    if broken.attrib['LEMMA'] in suspicious:
                                        _promote_to_root(
                                            sentence, token, broken)
                        else:
                            # The elided 'не' has a head: it takes over the
                            # pronoun's form and the pronoun is dropped.
                            for item in gr_children:
                                if item.attrib['LEMMA'] in suspicious:
                                    if _has_no_adp_child(sentence, item):
                                        token.attrib['LEMMA'] = suspicious[
                                            item.attrib['LEMMA']]
                                        token.attrib['FEAT'] = item.attrib[
                                            'FEAT']
                                        token.text = item.text
                                        item.attrib['DEL'] = "YES"
                                        _redirect_dependents(
                                            sentence, item.attrib['ID'],
                                            token.attrib['ID'])

        for sentence in root[-1].findall('S'):
            # step 2: collect token numbers old:new (a DEL-marked token maps
            # to the number of the last kept token before it)
            numbering = {}
            token_number = 0
            for token in sentence.findall('W'):
                if 'DEL' not in token.attrib:
                    token_number += 1
                numbering[token.attrib['ID']] = str(token_number)

            # step 3: assign new numbers
            for word in sentence.findall('W'):
                word.attrib['ID'] = numbering[word.attrib['ID']]
                if word.attrib['DOM'] != '_root':
                    word.attrib['DOM'] = numbering[word.attrib['DOM']]

            # step 4: remove tokens (findall returns a list, so removing
            # while looping is safe)
            for elem in sentence.findall('W'):
                if 'DEL' in elem.attrib:
                    sentence.remove(elem)

        for sentence in root[-1].findall('S'):
            for token in sentence.findall('W'):  # Mood=Cnd fix
                if token.attrib['LEMMA'] in {'бы', 'б', 'чтобы', 'чтоб'}:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                        token, sentence)
                    try:
                        if head_token.attrib['LEMMA'] not in forbidden_head:
                            if pos in {'SCONJ', 'PART'}:
                                token.attrib['FEAT'] = token.attrib[
                                    'FEAT'] + ' Mood=Cnd'
                            else:
                                token.attrib['FEAT'] = token.attrib[
                                    'FEAT'].replace(' Foreign=Yes', '')
                    except Exception:
                        # Best-effort diagnostics (e.g. a token without a
                        # head); no longer swallows KeyboardInterrupt or
                        # SystemExit like the former bare `except:`.
                        print('Something went wrong')
                        print(*[(elem.text, elem.tail.rstrip('\n'),
                                 elem.attrib) for elem in sentence],
                              sep='\n')
                        print()

        tree.write(ofname, encoding="UTF-8")
    return
def munch(ifiles, ofiles):
    """
    Process all files in ifiles list.
    Output into ofiles list.

    Pass 1 handles tokens whose POS tag is 'COM': depending on the POS of the
    head (as reported by ``get_info``) the token is glued onto the head's
    lemma and surface form, or the whole sentence is discarded. Pass 2 turns
    NODETYPE-marked tokens into 'FANTOM' tokens and discards sentences that
    either contain two 'предик' dependents of the same head or have a
    childless root whose POS tag is in ``garbage``.

    A discarded sentence is emptied with ``Element.clear()``; the (now empty)
    'S' element itself stays in the tree.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()

        # Pass 1: resolve compound parts ('COM').
        for sentence in root[-1].findall('S'):
            remove_sentence = False
            for token in sentence.findall('W'):
                if token.attrib.get('FEAT', 'EMPTY').split()[0] != 'COM':
                    continue
                (link, pos, feats, head_token, head_pos, head_feats,
                 head_root, nodetype) = get_info(
                     token, sentence, get_nodetype=True)

                if nodetype:
                    token.attrib['FEAT'] = head_token.attrib['FEAT']
                elif head_pos == 'COM':
                    # Compound part attached to another compound part:
                    # only reported.
                    print(ifname, sentence.attrib['ID'])
                elif head_pos in ['PR', 'NUM', 'CONJ']:
                    remove_sentence = True
                elif head_pos == 'V':
                    if token.text in ['не', 'полу']:
                        head_token.attrib['LEMMA'] = token.attrib[
                            'LEMMA'] + head_token.attrib['LEMMA']
                        head_token.text = token.text + head_token.text
                    else:
                        remove_sentence = True
                elif head_pos == 'A':
                    head_token.attrib['LEMMA'] = token.attrib[
                        'LEMMA'] + head_token.attrib['LEMMA']
                    head_token.text = token.text + head_token.text
                elif head_pos == 'S':
                    if head_token.attrib['LEMMA'] not in [
                            'слово', 'фактор', 'циклон', 'янус', 'буква',
                            'орбита', 'мониторинг', 'спектроскопия'
                    ]:
                        # For nouns the text between the two parts (the
                        # token's tail, stripped) is kept in the glued form.
                        joiner = token.tail.strip()
                        head_token.attrib['LEMMA'] = token.attrib[
                            'LEMMA'] + joiner + head_token.attrib['LEMMA']
                        head_token.text = token.text + joiner + head_token.text
                    else:
                        remove_sentence = True
                else:
                    print(ifname, sentence.attrib['ID'])
            if remove_sentence:
                sentence.clear()

        # Pass 2: FANTOM tokens and sentence-level sanity filters.
        for sentence in root[-1].findall('S'):
            # Reset per sentence: the flag used to leak from the previous
            # sentence (and from pass 1), so one bad sentence caused every
            # following sentence to be cleared as well.
            remove_sentence = False
            for token in sentence.findall('W'):
                if 'NODETYPE' in token.attrib:
                    token.text = 'FANTOM'
                    del token.attrib['NODETYPE']
                    if 'LEMMA' not in token.attrib:
                        token.attrib['LEMMA'] = 'FANTOM'

                # Two 'предик' dependents of one head -> discard the sentence.
                if 'LINK' in token.attrib and token.attrib['LINK'] == 'предик':
                    dom = int(token.attrib['DOM'])
                    number = int(token.attrib['ID'])
                    for item in sentence.findall('W'):
                        if 'LINK' in item.attrib and item.attrib['LINK'] == 'предик' and \
                                int(item.attrib['ID']) != number and int(item.attrib['DOM']) == dom:
                            remove_sentence = True

                # This handles dangling (standalone) prepositions/conjunctions:
                # a root token with a `garbage` POS tag and no children.
                feat = token.attrib.get('FEAT', '')
                if feat and feat.split()[0] in garbage and token.attrib[
                        'DOM'] == '_root':
                    children = get_children_attrib(sentence,
                                                   token.attrib['ID'])
                    if children == []:
                        remove_sentence = True
            if remove_sentence:
                sentence.clear()

        tree.write(ofname, encoding='utf-8')
    return
Exemple #14
0
#c = get_cosine_sim("AI is our friend and it has been friendly", "AI and humans have always been friendly")
# print(c)
# sys.exit()

# Recompute the per-directory statistics only when `reload` is set.
if reload:
    datas = []
    # Every directory that contains an info.yml is visited, in sorted order.
    for y in sorted(glob("*/info.yml")):
        d = os.path.dirname(y)
        # "psoe110" is explicitly excluded.
        if d in ("psoe110", ):
            continue
        print("Analizando %s" % d)
        # Go back to `cwd` before entering `d`
        # (presumably `cwd` is the directory the glob was run from — confirm).
        os.chdir(cwd)
        os.chdir(d)

        data = get_info(autocomplete=True)
        soup = get_soup(data.output + ".html")
        body = soup.find("body")
        # Collapse runs of spaces so they do not inflate the character count.
        body_txt = re.sub(r"  +", " ", body.get_text()).strip()
        body_slp = body_txt.split()

        data.pages = get_pages(data.output + ".html")
        data.caracteres = len(body_txt)  # characters in the body text
        data.palabras = len(body_slp)  # whitespace-separated words
        data.parrafos = len(body.findAll(["p", "li"]))  # <p> and <li> elements
        data.capitulos = len(body.findAll(["h1"]))  # , "h2"]))
        data.root = d
        # Size in bytes of each generated output file, keyed by extension.
        filesize = data.get("filesize", {})
        for k in ("md", "html", "epub"):
            filesize[k] = os.path.getsize(data.output + '.' + k)
        for k in ("pdf", "html", "xml"):
Exemple #15
0
def munch(ifiles, ofiles):
    """Rewrite numeral/quantifier relations in each input XML file.

    Every ``(ifname, ofname)`` pair from ``zip(ifiles, ofiles)`` is parsed
    with ``ET.parse``; the ``W`` elements of every ``S`` element found under
    the last child of the root are modified in place in four passes, and
    the tree is then written to ``ofname``:

    1. Words whose first FEAT item is ``PR`` and whose LEMMA is in
       ``lemmas_adv`` get FEAT ``ADV`` (and LINK ``advmod`` when they have a
       LINK that is not in ``conjrels``).
    2. Words whose LINK is ``количест`` or ``аппрокс-колич`` get LINK
       ``nummod`` or ``nummod:gov``.
    3. Words whose LEMMA is in ``big_num + big_num_fem`` are re-attached
       under their ``квазиагент`` child; for the ``колич-вспом`` link a
       fixed list of sentences is patched by hand instead.
    4. Words whose LEMMA is in ``lemmas_to_check`` and that have children
       are re-attached according to their LINK.

    ``get_info`` and ``get_children_attrib`` are helpers defined elsewhere.
    The children returned by ``get_children_attrib`` are indexed here by
    'LINK', 'DOM', 'ID', 'FEAT' and 'LEMMA' and written to in place;
    presumably they are the attribute mappings of the dependent ``W``
    elements -- confirm against the helper.

    Args:
        ifiles: input file names (``endswith`` is called on each of them).
        ofiles: output file names, paired with ``ifiles`` by ``zip``.

    NOTE(review): no return statement is visible in this part of the file;
    the end of the function lies beyond the reviewed lines.
    """
    # part 1
    # Within the visible code `count` is only touched by commented-out
    # debug statements.
    count = 0
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        # Pass 1: PR words with a lemma from lemmas_adv become ADV.
        # Note that the whole FEAT value is replaced by 'ADV'.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get(
                        'FEAT',
                        'EMPTY').split()[0] == 'PR' and word.attrib.get(
                            'LEMMA', 'EMPTY') in lemmas_adv:
                    word.attrib['FEAT'] = 'ADV'
                    if 'LINK' in word.attrib and word.attrib[
                            'LINK'] not in conjrels:
                        word.attrib['LINK'] = 'advmod'

        # Pass 2: relabel 'количест' / 'аппрокс-колич' links as
        # 'nummod' or 'nummod:gov'.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('LINK',
                                   'EMPTY') in ['количест', 'аппрокс-колич']:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                        word, sent)
                    # The checks below are substring tests on the
                    # concatenated features (joined without a separator).
                    feats_str = ''.join(feats)

                    if 'NUM' in pos and word.attrib['LEMMA'] == 'один':
                        word.attrib['LINK'] = 'nummod'
                    elif word.attrib['LEMMA'].endswith('1'):
                        word.attrib['LINK'] = 'nummod'
                    elif feats_str == 'NUM':
                        word.attrib['LINK'] = 'nummod'
                    elif 'ИМ' in feats_str:
                        word.attrib['LINK'] = 'nummod:gov'
                    elif 'ВИНОД' in feats_str:
                        word.attrib['LINK'] = 'nummod:gov'
                    # NOTE(review): 'НЕОД' contains 'ОД', so the second
                    # test is already implied by the first one.
                    elif 'ОД' not in feats_str and 'НЕОД' not in feats_str and 'ВИН' in feats_str:
                        # The word itself carries no 'ОД' marker, so the
                        # head's features decide between the two labels.
                        head_feats_str = ''.join(head_feats)
                        if 'ОД' in head_feats_str:
                            word.attrib['LINK'] = 'nummod:gov'
                        else:
                            word.attrib['LINK'] = 'nummod'
                    else:
                        word.attrib['LINK'] = 'nummod'

        # Pass 3: words whose lemma is in big_num / big_num_fem.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                if word.attrib.get('LEMMA', 'EMPTY') in big_num + big_num_fem:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(
                        word, sent)
                    if link == 'колич-вспом':  # we need to deal with it later
                        if head_token.attrib['FEAT'].split(' ')[
                                0] == 'A':  # will be converted in syntax.py
                            pass
                        else:
                            #fix sentences
                            # Hand-written corrections for individual
                            # sentences, selected by file-name suffix and
                            # sentence ID. sent[i] addresses the i-th child
                            # element of the sentence by position, while
                            # the assigned DOM values are ID strings.
                            if ifname.endswith('newsYa_16.xml'
                                               ) and sent.attrib['ID'] == '31':
                                sent[44].attrib['DOM'] = '47'
                                sent[44].attrib['LINK'] = 'колич-вспом'
                            elif ifname.endswith(
                                    '2014Na_dvukh_voinakh_2.xml'
                            ) and sent.attrib['ID'] == '232':
                                sent[8].attrib['DOM'] = '12'
                                sent[9].attrib['DOM'] = '12'
                            elif ifname.endswith(
                                    '2014Na_dvukh_voinakh_1.xml'
                            ) and sent.attrib['ID'] == '397':
                                sent[2].attrib['DOM'] = '5'
                            elif ifname.endswith(
                                    '2003Opasnaya_blizost.xml'
                            ) and sent.attrib['ID'] == '5':
                                sent[18].attrib['DOM'] = '14'
                                sent[18].attrib['LINK'] = 'компл-аппоз'
                                sent[17].attrib['DOM'] = '19'
                                sent[17].attrib['LINK'] = 'nummod'
                                sent[16].attrib['LINK'] = 'колич-вспом'
                                sent[14].attrib['DOM'] = '18'
                                sent[14].attrib['LINK'] = 'колич-вспом'
                            elif ifname.endswith(
                                    '2003Bolshie_peremeny.xml'
                            ) and sent.attrib['ID'] == '6':
                                sent[11].attrib['DOM'] = '5'
                                sent[11].attrib['LINK'] = 'обст'
                                sent[9].attrib['DOM'] = '12'
                                sent[9].attrib['LINK'] = 'nummod'
                                sent[8].attrib['LINK'] = 'колич-вспом'
                                sent[6].attrib['DOM'] = '10'
                                sent[6].attrib['LINK'] = 'колич-вспом'
                                sent[5].attrib['DOM'] = '12'
                            elif ifname.endswith(
                                    '2003Bolshie_peremeny.xml'
                            ) and sent.attrib['ID'] == '45':
                                sent[14].attrib['DOM'] = '20'
                                sent[15].attrib['DOM'] = '19'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[17].attrib['LINK'] = 'колич-вспом'
                                sent[18].attrib['DOM'] = '20'
                                sent[18].attrib['LINK'] = 'nummod:gov'
                                sent[19].attrib['LINK'] = 'обст'
                                sent[19].attrib['DOM'] = '26'
                                sent[21].attrib['DOM'] = '24'
                                sent[21].attrib['LINK'] = 'колич-вспом'
                                sent[22].attrib['DOM'] = '24'
                            elif ifname.endswith(
                                    '2003Tyurma_dlya_svekrovei.xml'
                            ) and sent.attrib['ID'] == '18':
                                sent[12].attrib['DOM'] = '18'
                                sent[13].attrib['DOM'] = '17'
                                sent[13].attrib['LINK'] = 'колич-вспом'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[16].attrib['DOM'] = '18'
                                sent[16].attrib['LINK'] = 'nummod'
                                sent[17].attrib['DOM'] = '9'
                                sent[17].attrib['LINK'] = '3-компл'
                            elif ifname.endswith(
                                    '2014Vladimir_Vladimirovich.xml'
                            ) and sent.attrib['ID'] == '96':
                                sent[15].attrib['DOM'] = '19'
                                sent[15].attrib['LINK'] = 'колич-вспом'
                                sent[17].attrib['LINK'] = 'колич-вспом'
                                sent[18].attrib['DOM'] = '20'
                                sent[18].attrib['LINK'] = 'nummod'
                                sent[19].attrib['DOM'] = '15'
                                sent[19].attrib['LINK'] = '1-компл'
                            else:
                                print('Unaccounted entry:',
                                      ifname,
                                      sent.attrib['ID'],
                                      file=sys.stderr)
                    else:
                        feats_str = ''.join(feats)
                        children = get_children_attrib(sent, word.attrib['ID'])

                        # A 'предик' word with a 'квазиагент' child is
                        # left untouched.
                        if link == 'предик' and any(
                                child['LINK'] == 'квазиагент'
                                for child in children):
                            continue

                        # Choose the label the word receives if it is
                        # re-attached below.
                        if 'РОД' in feats_str:
                            new_link = 'nummod'
                        elif word.attrib.get('LEMMA', 'EMPTY') in big_num and (
                                'ИМ' in feats_str or 'ВИН' in feats_str):
                            new_link = 'nummod:gov'
                        elif word.attrib.get(
                                'LEMMA', 'EMPTY') in big_num_fem and (
                                    'ИМ' in feats_str or 'ВИНОД' in feats_str):
                            new_link = 'nummod:gov'
                        else:
                            # new_link is only read below for 'квазиагент'
                            # children, so it is not assigned when there
                            # are none.
                            if all(child['LINK'] != 'квазиагент'
                                   for child in children):
                                # Not interested in this condition
                                pass
                            else:
                                # With several 'квазиагент' children the
                                # last one decides the value.
                                for child in children:
                                    if child['LINK'] == 'квазиагент':
                                        if 'РОД' in child[
                                                'FEAT'] or '$' in child[
                                                    'LEMMA']:
                                            new_link = 'nummod:gov'
                                        else:
                                            new_link = 'nummod'

                        # Swap the word with its 'квазиагент' child: the
                        # child takes over the word's link and head, the
                        # word becomes a dependent of the child, and the
                        # word's other children (except nummod /
                        # nummod:gov ones) move under the child.
                        for child_token in children:
                            if child_token['LINK'] == 'квазиагент':
                                child_token['LINK'] = link
                                child_token['DOM'] = word.attrib['DOM']

                                word.attrib['LINK'] = new_link
                                word.attrib['DOM'] = child_token['ID']
                                for ch in children:
                                    if ch['ID'] != child_token['ID'] and ch[
                                            'LINK'] not in [
                                                'nummod', 'nummod:gov'
                                            ]:
                                        ch['DOM'] = child_token['ID']

        # Pass 4: quantifier-like words whose lemma is in lemmas_to_check.
        for sent in root[-1].findall('S'):
            for word in sent.findall('W'):
                # 'сколько' / 'несколько' linked by 'присвяз' or
                # 'соч-союзн' are only re-tagged as NUM.
                if word.attrib.get('LEMMA', '') in [
                        'сколько', 'несколько'
                ] and word.attrib.get('LINK',
                                      'EMPTY') in ['присвяз', 'соч-союзн']:
                    word.attrib['FEAT'] = 'NUM'
                if word.attrib.get(
                        'LEMMA',
                        'EMPTY') in lemmas_to_check and word.attrib.get(
                            'LINK', 'EMPTY') not in [
                                'огранич', 'присвяз', 'соч-союзн', 'nummod',
                                'nummod:gov'
                            ]:
                    children = get_children_attrib(sent, word.attrib['ID'])
                    # Only words that have children are restructured; the
                    # branch taken depends on the word's LINK.
                    if len(children) != 0:
                        if word.attrib.get('LINK',
                                           '') == 'предик' and word.attrib.get(
                                               'LEMMA', '') != 'сколько':
                            pass  # do nothing
                        elif word.attrib['DOM'] == '_root' or word.attrib[
                                'LINK'] in ['вспом']:

                            if word.attrib['DOM'] == '_root' and any(
                                    ch['LINK'] == '1-компл' for ch in children
                            ) and 'СРАВ' not in word.attrib.get('FEAT', ''):
                                # Only a fixed list of
                                # '<file name>_<sentence ID>' keys is
                                # restructured here.
                                # NOTE(review): split('/')[1] takes the
                                # second '/'-separated component, so this
                                # presumably expects ifname of the form
                                # 'dir/file.xml' (it raises IndexError when
                                # there is no '/') -- confirm.
                                if (ifname + '_' + sent.attrib['ID']
                                    ).split('/')[1] in [
                                        'uppsalaBitov_3.xml_454',
                                        '2007Pylesos.xml_77',
                                        '2009Nebesnye_formatsii.xml_8',
                                        '2012Chto_delat_posle_24_dekabrya.xml_9',
                                        '2003Nelzya_sebya_delit.xml_49',
                                        '2006Dobretsov.xml_57',
                                        '2011Mariam_Petrosyan.xml_296',
                                        '2003Lyubit_drakona.xml_122',
                                        'uppsalaKorp_220.xml_112',
                                        '2005Sluzhit_by_rad.xml_82',
                                        '2009Final_Ligi_Chempionov.xml_30',
                                        '2003Zhores.xml_336',
                                        '2003Opasnaya_blizost.xml_74'
                                    ]:

                                    # The '1-компл' child takes the word's
                                    # head ('_root' here) and loses its
                                    # LINK; the word and its other children
                                    # (except 'огранич' / 'колич-огран')
                                    # are attached under that child.
                                    for ch in children:
                                        if ch['LINK'] == '1-компл':
                                            ch['DOM'] = word.attrib['DOM']
                                            for child in children:
                                                if child['ID'] != ch[
                                                        'ID'] and child[
                                                            'LINK'] not in {
                                                                'огранич',
                                                                'колич-огран'
                                                            }:
                                                    child['DOM'] = ch['ID']
                                            word.attrib['DOM'] = ch['ID']
                                            del ch['LINK']
                                            word.attrib['LINK'] = 'nummod:gov'
                                    word.attrib['FEAT'] = 'NUM'
                            else:
                                pass
                        # Only 'сколько' can reach this branch: every other
                        # 'предик' lemma was consumed by the first branch.
                        elif word.attrib.get(
                                'LINK', '') == 'предик' and word.attrib.get(
                                    'LEMMA', '') in ['сколько', 'несколько']:
                            # These FEAT tests are substring checks on the
                            # raw FEAT string.
                            if len(children) == 1 and (
                                    'S' in children[0]['FEAT']
                                    or 'A ' in children[0]['FEAT']):
                                children[0]['LINK'] = 'предик'
                                children[0]['DOM'] = word.attrib['DOM']
                                word.attrib['DOM'] = children[0]['ID']
                                word.attrib['LINK'] = 'nummod:gov'
                                word.attrib['FEAT'] = 'NUM'

                            else:
                                for child in children:
                                    if 'S ' in child['FEAT']:
                                        child['LINK'] = 'предик'
                                        child['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = child['ID']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                                        if len(children) == 3:
                                            for ch in children:
                                                if ch['LINK'] == 'соч-союзн':
                                                    ch['DOM'] = child['ID']

                        elif all(ch['LINK'] in
                                 ['огранич', 'колич-огран', 'вспом', 'case']
                                 for ch in children):
                            pass  # don't need to do anything

                        elif word.attrib.get('LINK', '') == 'обст':
                            if word.attrib.get('LEMMA', '') == 'немного':
                                pass  # do nothing
                            elif word.attrib['LEMMA'] in [
                                    'больше', 'столько', 'мало', 'много'
                            ]:
                                if len(children) == 1:
                                    # `malo` records whether the lemma is
                                    # 'мало' or 'много'; only then may the
                                    # word be relabelled as nummod:gov.
                                    if word.attrib['LEMMA'] in [
                                            'мало', 'много'
                                    ]:
                                        malo = True
                                    else:
                                        malo = False
                                    if children[0]['FEAT'].strip(
                                    ).split(' ')[0] in [
                                            'S'
                                    ]:  # TODO: check - the link may also need to be changed here
                                        children[0]['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = children[0]['ID']
                                        if malo and children[0][
                                                'LINK'] != 'предик':
                                            word.attrib['LINK'] = 'nummod:gov'
                                            word.attrib['FEAT'] = 'NUM'
                                    elif children[0]['FEAT'].strip().split(
                                            ' ')[0] not in [
                                                'CONJ', 'A', 'ADV', 'V'
                                            ]:
                                        print('Unaccounted entry (FEAT):',
                                              file=sys.stderr)
                                else:
                                    # Only the first 'сравнит' child is
                                    # considered (note the break).
                                    for ch in children:
                                        if ch['LINK'] == 'сравнит':
                                            if ch['FEAT'].strip().split(
                                                    ' ')[0] in ['S']:
                                                ch['DOM'] = word.attrib['DOM']
                                                word.attrib['DOM'] = ch['ID']
                                            break
                            elif word.attrib['LEMMA'] == 'меньше':
                                pass
                            elif word.attrib['LEMMA'] in [
                                    'сколько', 'несколько'
                            ]:
                                # The first '1-компл' child takes over the
                                # word's link and head; the for/else
                                # reports the case where there is none.
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['LINK'] = word.attrib['LINK']
                                        ch['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = ch['ID']
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                                        break
                                else:
                                    print('Unaccounted entry (FEAT):',
                                          file=sys.stderr)

                            elif word.attrib['LEMMA'] in ['более', 'менее']:
                                for ch in children:
                                    if ch['FEAT'].strip().split(' ')[0] in [
                                            'S'
                                    ] and ch['LINK'] != 'атриб':
                                        ch['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = ch['ID']
                                        for chld in children:
                                            if chld['ID'] != ch['ID'] and chld[
                                                    'LINK'] not in ['огранич']:
                                                chld['DOM'] = ch['ID']

                        elif word.attrib.get('LINK', '').endswith(
                                'компл'
                        ):  # re-attach to the noun. сколько -> NUM?
                            if (len(children) == 1 and
                                    children[0]['FEAT'].strip().split(' ')[0]
                                    in ['S', 'A'] and children[0]['LINK']
                                    not in ['атриб', 'предик']):
                                children[0]['DOM'] = word.attrib['DOM']
                                word.attrib['DOM'] = children[0]['ID']
                                if word.attrib['LEMMA'] in [
                                        'сколько', 'несколько'
                                ]:
                                    word.attrib['LINK'] = 'nummod:gov'
                                    word.attrib['FEAT'] = 'NUM'
                            elif (len(children) == 1
                                  and children[0]['FEAT'].strip().split(' ')[0]
                                  in ['CONJ', 'V']):
                                pass

                            elif len(children) == 1:
                                # Remaining single-child cases: only two
                                # specific sentences are patched.
                                if ifname.endswith(
                                        '2003Vyzhivshii_kamikadze.xml'
                                ) and sent.attrib['ID'] == '257':
                                    children[0]['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = children[0]['ID']
                                if ifname.endswith(
                                        '2003Artist_mimansa.xml'
                                ) and sent.attrib['ID'] == '330':
                                    word.attrib['FEAT'] = 'NUM'
                            elif (len(children) > 1):
                                # Scan the children in order: stop at the
                                # first 'предик' or '1-компл' child and
                                # remember the last 'сравнит' child seen
                                # before that point as a fallback.
                                candidate_1 = None
                                candidate_2 = None
                                found_predic = False
                                for chld in children:
                                    if chld['LINK'] == 'предик':
                                        found_predic = True
                                        break
                                    if chld['LINK'] == '1-компл':
                                        candidate_1 = chld
                                        break
                                    if chld['LINK'] == 'сравнит':
                                        candidate_2 = chld

                                if not found_predic:
                                    if candidate_1 is not None:
                                        candidate_1['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = candidate_1['ID']
                                    elif candidate_2 is not None:
                                        candidate_2['DOM'] = word.attrib['DOM']
                                        word.attrib['DOM'] = candidate_2['ID']

                                    if word.attrib['LEMMA'] in [
                                            'сколько', 'несколько'
                                    ]:
                                        word.attrib['LINK'] = 'nummod:gov'
                                        word.attrib['FEAT'] = 'NUM'
                            else:
                                # Not reachable from here: children is
                                # non-empty and both len == 1 and len > 1
                                # are handled above.
                                print('Unaccounted entry (FEAT):',
                                      file=sys.stderr)

                        elif word.attrib.get(
                                'LINK', ''
                        ) == 'вводн':  # there is no 'сколько', 'несколько' here
                            for chld in children:
                                if chld['FEAT'].strip().split(' ')[0] == 'S':
                                    chld['DOM'] = word.attrib['DOM']
                                    word.attrib['DOM'] = chld['ID']

                        elif word.attrib.get('LINK', '') == 'подч-союзн':
                            # FEAT is set to NUM before (and regardless of)
                            # the 'предик' check below.
                            word.attrib['FEAT'] = 'NUM'
                            if any(ch['LINK'] == 'предик' for ch in children):
                                pass
                            else:
                                # Swap with the '1-компл' child: the child
                                # takes the word's head and link, the other
                                # children (except 'огранич' /
                                # 'колич-огран') and the word itself move
                                # under the child. The same pattern recurs
                                # in several branches below.
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'

                        # Links for which the structure is left as it is.
                        elif word.attrib.get('LINK', '') in [
                                'соотнос', 'кратн', 'электив', 'аппоз',
                                'эксплет', 'атриб', 'релят', 'квазиагент',
                                'колич-копред', 'разъяснит', 'сент-соч',
                                'сравнит', 'изъясн', 'компл-аппоз'
                        ]:
                            pass

                        elif word.attrib.get('LINK', '') == 'уточн':
                            # The 'сравнит' child takes the word's head;
                            # neither LINK is changed in this branch.
                            for ch in children:
                                if ch['LINK'] == 'сравнит':
                                    ch['DOM'] = word.attrib['DOM']
                                    for child in children:
                                        if child['ID'] != ch['ID'] and child[
                                                'LINK'] not in {
                                                    'огранич', 'колич-огран'
                                                }:
                                            child['DOM'] = ch['ID']
                                    word.attrib['DOM'] = ch['ID']

                        elif word.attrib.get('LINK', '') in ['длительн']:
                            if any(ch['LINK'] == '1-компл' for ch in children):
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                word.attrib['FEAT'] = 'NUM'
                            else:
                                # Without a '1-компл' child, a 'сравнит'
                                # child is promoted instead and the links
                                # are kept.
                                for ch in children:
                                    if ch['LINK'] == 'сравнит':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']

                        elif word.attrib.get(
                                'LINK', '') in ['примыкат', 'колич-огран']:
                            if any(ch['LINK'] == 'сравнит'
                                   for ch in children) and not any(
                                       ch['LINK'] == '1-компл'
                                       for ch in children):
                                pass
                            else:
                                # NOTE(review): this check has no effect --
                                # its body is `pass` and the loop below
                                # runs either way. Confirm whether the
                                # 'более' case was meant to be skipped.
                                if word.attrib.get(
                                        'LINK',
                                        '') == 'примыкат' and word.attrib.get(
                                            'LEMMA', '') == 'более':
                                    pass  # one exception

                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                # FEAT becomes NUM even if no '1-компл'
                                # child was found by the loop above.
                                word.attrib['FEAT'] = 'NUM'

                        elif word.attrib.get('LINK', '') == 'сравн-союзн':
                            if any(ch['LINK'] == 'предик'
                                   for ch in children) or any(
                                       ch['LINK'] == 'разъяснит'
                                       for ch in children):
                                pass
                            else:

                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                word.attrib['FEAT'] = 'NUM'

                        elif word.attrib.get('LINK', '') == 'сочин':
                            if any(ch['LINK'] == '1-компл' for ch in
                                   children) and 'СРАВ' not in word.attrib.get(
                                       'FEAT', ''):
                                for ch in children:
                                    if ch['LINK'] == '1-компл':
                                        ch['DOM'] = word.attrib['DOM']
                                        for child in children:
                                            if child['ID'] != ch[
                                                    'ID'] and child[
                                                        'LINK'] not in {
                                                            'огранич',
                                                            'колич-огран'
                                                        }:
                                                child['DOM'] = ch['ID']
                                        word.attrib['DOM'] = ch['ID']
                                        ch['LINK'] = word.attrib['LINK']
                                        word.attrib['LINK'] = 'nummod:gov'
                                word.attrib['FEAT'] = 'NUM'

                            else:
                                pass

                        else:
                            # No branch matched this LINK: dump the word,
                            # its children and the whole sentence for
                            # manual inspection.
                            #for_debug_rels[word.attrib['LINK']] += 1
                            # NOTE(review): unlike the prints that follow,
                            # this first line goes to stdout, not stderr.
                            print('Error in numerals.py: missing condition')
                            print(word.attrib.get('ID', ''),
                                  word.attrib.get('LINK', ''),
                                  word.attrib.get('LEMMA', ''),
                                  word.attrib.get('FEAT', ''),
                                  file=sys.stderr)
                            print(children, file=sys.stderr)
                            print(ifname + '_' + sent.attrib['ID'],
                                  file=sys.stderr)
                            print(*[(token.attrib.get('ID',
                                                      'EMPTY'), token.text,
                                     token.attrib.get('DOM', 'EMPTY'),
                                     token.attrib.get('FEAT', 'EMPTY'),
                                     token.attrib.get('LINK',
                                                      'EMPTY'), token.tail)
                                    for token in sent],
                                  file=sys.stderr,
                                  sep='\n')
                            print('***', file=sys.stderr)
                            #count += 1

        #for sent in root[-1].findall('S'):
        #    for word in sent.findall('W'):
        #        if word.attrib.get('LINK', 'EMPTY') in ['nummod', 'nummod:gov']:
        #            print(word.attrib.get('ID', ''), word.attrib.get('LINK', ''), word.attrib.get('FEAT', ''), file=sys.stderr)
        #            print(*[(token.attrib.get('ID', 'EMPTY'), token.text, token.attrib.get('DOM', 'EMPTY'), token.attrib.get('FEAT', 'EMPTY'), token.attrib.get('LINK', 'EMPTY'), token.tail) for token in sent], file=sys.stderr, sep='\n')
        #            print('***', file=sys.stderr)
        #            continue

        # Write the modified tree for this input file to its paired
        # output path.
        tree.write(ofname, encoding="UTF-8")
    #print(count)
    #for elem in for_debug_rels:
    #    print(elem, for_debug_rels[elem])
    return