Beispiel #1
0
def read_treebank(tb,
                  max_size=None,
                  shuffle=False,
                  lowercase=True,
                  skip_multi=True,
                  pos_filter=frozenset({'NOUN', 'VERB', 'ADJ', 'ADV'})):
    """Collect the unique (form, lemma, features) entries of a treebank.

    Every ``*.conllu`` file directly inside the directory ``tb`` is read
    with ``conllu_sentences`` and each node of each sentence is examined.

    Args:
        tb: Path of the treebank directory.  It is also matched against
            the module-level ``lang_re``; group 1 of the match is passed
            to ``to_lower`` as the language (presumably a language code --
            confirm against the definition of ``lang_re``).
        max_size: If truthy, keep at most this many entries.
        shuffle: If true, shuffle the entries with ``random.shuffle``
            before truncating.
        lowercase: If true, form and lemma are passed through
            ``to_lower(text, lang)`` before being stored.
        skip_multi: If true, skip nodes for which ``sent.get_multi(node)``
            is truthy (presumably nodes belonging to a multi-word token --
            confirm against the CoNLL-U reader).
        pos_filter: Container of UPOS tags; nodes whose ``upos`` is not in
            it are skipped.

    Returns:
        A list of ``(form, lemma, feats)`` tuples without duplicates, where
        ``feats`` is a tuple holding the node's UPOS tag followed by the
        ``|``-separated items of ``node.feats``.  The entries come from a
        set, so their order is arbitrary unless ``shuffle`` is used.

    Raises:
        AttributeError: If ``lang_re.match(tb)`` returns ``None``.
    """
    m = lang_re.match(tb)
    lang = m.group(1)

    entries = set()
    for tbf in glob.glob(tb + '/*.conllu'):
        for sent in conllu_sentences(tbf):
            for node in sent.nodes:
                # Nodes without both a form and a lemma carry no usable entry.
                if not (node.form and node.lemma):
                    continue
                # Honour the skip_multi switch: only consult get_multi()
                # when the caller asked for such nodes to be dropped.
                if skip_multi and sent.get_multi(node):
                    continue
                if node.upos not in pos_filter:
                    continue

                feats = [node.upos]
                if node.feats is not None:
                    feats += node.feats.split("|")
                feats = tuple(feats)  # must be hashable to live in a set

                if lowercase:
                    entries.add((to_lower(node.form, lang),
                                 to_lower(node.lemma, lang),
                                 feats))
                else:
                    entries.add((node.form, node.lemma, feats))

    result = list(entries)
    if shuffle:
        random.shuffle(result)
    if max_size:
        result = result[:max_size]
    return result
Beispiel #2
0
#!/usr/bin/env python3

import sys, argparse, re
from udtools.conllu import conllu_sentences

# Command line: a single positional argument naming the input file.
ap = argparse.ArgumentParser()
ap.add_argument('input_file')
args = ap.parse_args()

# Sentences of the input file, as produced by udtools' CoNLL-U reader.
tb = conllu_sentences(args.input_file)

# Add a PronType feature to pronouns and determiners that lack one.
# NOTE(review): `sent_num` and `i` are not used in the visible part of the
# script.
for sent_num, sent in enumerate(tb):
    # The first element of sent.nodes is skipped -- presumably an artificial
    # root node; confirm against udtools.conllu.
    for i, node in enumerate(sent.nodes[1:]):
        # Pronouns without PronType whose lemma is in the set below are
        # marked PronType=Prs.
        # NOTE(review): 'bazılarımız' occurs twice in the set; this is
        # harmless, but the second occurrence may have been meant to be a
        # different lemma.
        if node.upos == 'PRON' \
                and node.get_feat('PronType') is None\
                and node.lemma in {'ben', 'sen', 'biz', 'siz',
                'hepimiz', 'herkes', 'kimse',
                'bizler', 'sizler', 'onlar',
                'hiçbirimiz', 'hiçbiriniz',
                'bazılarınız', 'bazılarımız',
                'kiminiz', 'bazılarımız',
                'kendi'}:
            node.add_feat('PronType', 'Prs')
        # Pronouns without PronType whose lemma is in this second set are
        # marked PronType=Dem.
        elif node.upos == 'PRON' \
                and node.get_feat('PronType') is None\
                and node.lemma in {'o', 'şu', 'bu', 'bura', 'şura', 'ora'}:
            node.add_feat('PronType', 'Dem')
        # Determiners without PronType that have a truthy Definite feature
        # are marked PronType=Art.
        elif node.upos == 'DET' \
                and node.get_feat('PronType') is None\
                and node.get_feat('Definite'):
            node.add_feat('PronType', 'Art')