Example #1
def main():
    cdroot()
    input_file = 'data/parsing/ko_penn-ud-revised/ko_penn-ud-revised.conllu'
    train_file = input_file.replace('.conllu', '.train.conllu')
    dev_file = input_file.replace('.conllu', '.dev.conllu')
    test_file = input_file.replace('.conllu', '.test.conllu')
    with open(input_file) as src, \
            open(train_file, 'w') as train, \
            open(dev_file, 'w') as dev, \
            open(test_file, 'w') as test:
        everything = src.read()
        sents = everything.split('\n\n')
        fileid_sents = {}
        for sent in sents:
            sent = sent.strip()
            if not sent:
                continue
            # the file id comes from the first comment line: its 4th token, part before the '.'
            fileid = sent.split('\n')[0].split()[3].split('.')[0]
            sents_in_file = fileid_sents.get(fileid, None)
            if not sents_in_file:
                sents_in_file = []
                fileid_sents[fileid] = sents_in_file
            sents_in_file.append(sent)
        counts = [0] * 3
        for each_file, sents_in_file in fileid_sents.items():
            total = len(sents_in_file)
            t = math.ceil(total * .79)     # end of the train slice (~79% of each file)
            d = math.floor(total * .9105)  # end of the dev slice (dev ~12%, test ~9%)
            for idx, (f, ss) in enumerate(
                    zip([train, dev, test], [sents_in_file[:t], sents_in_file[t:d], sents_in_file[d:]])):
                if ss:
                    f.write('\n\n'.join(ss))
                    f.write('\n\n')
                    counts[idx] += len(ss)
        print(counts)
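Example #1 depends on two things that are not shown: the standard math module and a cdroot() helper (imported from the iwpt2020 package in Example #16) that switches the working directory to the project root so the relative 'data/...' paths resolve. A minimal sketch of such a helper, assuming the root is marked by a setup.py file (the marker name is an assumption):
import os


def cdroot(marker='setup.py'):
    """Walk up from this file until a directory containing `marker` is found,
    then chdir there so relative 'data/...' paths resolve. A sketch only."""
    path = os.path.abspath(os.path.dirname(__file__))
    while not os.path.isfile(os.path.join(path, marker)):
        parent = os.path.dirname(path)
        if parent == path:  # reached the filesystem root without finding the marker
            raise FileNotFoundError(f'no {marker} found above {__file__}')
        path = parent
    os.chdir(path)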
Example #2
def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-udpipe/??.conllu'))
    pretrained = 'bert-base-multilingual-cased'
    tokenizer = AutoTokenizer.from_pretrained(pretrained)
    config = AutoConfig.from_pretrained(pretrained)
    stat = Counter()
    for idx, f in enumerate(fs):
        langcode = f.split('/')[-2]
        if len(langcode) != 2:
            langcode = os.path.basename(f).split('.')[0]
        # if langcode != 'fi':
        #     continue
        print(f'{idx + 1:02d}/{len(fs)} {langcode}')
        total = 0
        for sent in read_conll(f):
            words = [x[1] for x in sent]
            length = len_of_sent(words, config, tokenizer)
            if length > 256:
                stat[langcode] += 1
            total += 1
        stat[langcode] += 0  # make sure the language shows up in stat even with zero overlong sentences
        # stat[langcode] /= total
        # stat[langcode] *= 100
    for k, v in sorted(stat.items()):
        print(k, end='\t')
    print()
    for k, v in sorted(stat.items()):
        print(v, end='\t')
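Example #2 flags sentences whose mBERT subtoken count exceeds 256. The len_of_sent helper is not shown; below is a hypothetical sketch consistent with how it is called. The handling of special tokens and of unsplittable words is an assumption.
def len_of_sent(words, config, tokenizer):
    # Count the subword pieces mBERT would see, plus [CLS] and [SEP].
    # `config` is accepted only to match the call site; this sketch does not use it.
    num_subtokens = 2
    for word in words:
        pieces = tokenizer.tokenize(word)
        num_subtokens += max(len(pieces), 1)  # an unknown word still occupies one position
    return num_subtokens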
Example #3
def main():
    cdroot()
    graph = [0, 5, 1, 5, 5, 1, 0, 8, 10, 8, 8, 12, 13, 10]
    G = visualize(graph)
    A = to_agraph(G)
    A.layout('dot')
    png = 'data/test/dep.png'
    A.draw(png)
    os.system(f'open {png}')
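In Example #3, graph appears to be a head array (position i holds the head of token i, with 0 acting as the artificial root) and visualize turns it into a graph that pygraphviz can lay out. A sketch under that assumption, using networkx:
import networkx as nx
from networkx.drawing.nx_agraph import to_agraph


def visualize(heads):
    G = nx.DiGraph()
    G.add_node(0, label='ROOT')
    for idx, head in enumerate(heads):
        if idx == 0:
            continue  # position 0 stands for the root itself
        G.add_node(idx, label=str(idx))
        G.add_edge(head, idx)
    return G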
Example #4
def main():
    cdroot()
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(testfiles)} {basename}')
        if idx + 1 != 3:
            continue
        run(langcode)
Example #5
def main():
    cdroot()
    dataset = CoNLLSentence.from_file(
        'data/iwpt2020/train-dev/UD_Arabic-PADT/ar_padt-ud-train.enhanced_collapse_empty_nodes.conllu',
        conllu=True)
    data = Counter()
    for each in dataset:
        for word in each:
            data[len(word.head)] += 1
    plot_bar(data, 'num of heads', 'num of samples')
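Example #5 (and later Example #12) leans on a plot_bar(counter, xlabel, ylabel, ...) helper. A minimal matplotlib sketch that matches the calls seen here; the keyword arguments ann_y, sort_y and save mirror Example #12 but their exact semantics are assumptions.
import matplotlib.pyplot as plt


def plot_bar(counter, xlabel, ylabel, sort_y=False, ann_y=True, save=None):
    items = sorted(counter.items(), key=lambda kv: kv[1]) if sort_y else sorted(counter.items())
    keys = [str(k) for k, _ in items]
    values = [v for _, v in items]
    bars = plt.bar(keys, values)
    if ann_y:  # annotate each bar with its value
        for bar, v in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width() / 2, v, f'{v}', ha='center', va='bottom')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    if save:
        plt.savefig(save)
    else:
        plt.show()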
Example #6
def main():
    print('Collapse empty nodes for all conllu files ...')
    cdroot()
    conllu_files = glob.glob('data/iwpt2020/**/*.conllu', recursive=True)
    conllu_files = [
        x for x in conllu_files if 'enhanced_collapse_empty_nodes' not in x
    ]
    for idx, f in enumerate(conllu_files):
        print(f'\r{idx + 1}/{len(conllu_files)} {f}', end='')
        enhanced_collapse_empty_nodes(
            f,
            f'{f.replace(".conllu", ".enhanced_collapse_empty_nodes.conllu")}')
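Example #6 pre-processes every treebank with enhanced_collapse_empty_nodes. The idea behind collapsing in the IWPT 2020 setup is that a path head --r1--> empty node --r2--> dependent is replaced by a single edge labelled 'r1>r2', so the empty-node lines can be dropped. The following is only a rough per-sentence sketch of that idea; the project's own implementation and the exact label order may differ.
def collapse_sentence(rows):
    # rows: one CoNLL-U sentence as a list of 10-column cell lists
    empty = {}  # empty-node id (e.g. '8.1') -> its own (head, deprel) pairs from DEPS
    for cells in rows:
        if '.' in cells[0]:
            empty[cells[0]] = [d.split(':', 1) for d in cells[8].split('|')]
    collapsed = []
    for cells in rows:
        if '.' in cells[0]:
            continue  # drop the empty-node line itself
        deps = []
        for dep in cells[8].split('|'):
            head, rel = dep.split(':', 1)
            if head in empty:  # reroute the edge through the empty node's own head(s)
                deps.extend(f'{h}:{r}>{rel}' for h, r in empty[head])
            else:
                deps.append(dep)
        collapsed.append(cells[:8] + ['|'.join(deps)] + cells[9:])
    return collapsed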
Example #7
def main():
    print('Preprocess blind test data with UDPipe ...')
    cdroot()
    files = glob.glob('data/iwpt2020/test-blind/*.txt')
    os.makedirs('data/iwpt2020/test-udpipe', exist_ok=True)
    for idx, txt in enumerate(files):
        basename = os.path.basename(txt)
        print(f'{idx + 1}/{len(files)} {basename}')
        lang = basename.split('.')[0]
        with open(txt) as src, open(f'data/iwpt2020/test-udpipe/{lang}.conllu', 'w') as out:
            text = src.read()
            conllu = parse(text, lang)
            out.write(conllu)
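The parse(text, lang) call sends raw text through UDPipe. One plausible implementation uses the public LINDAT UDPipe REST service; how the two-letter language code is mapped to a concrete UDPipe model name is left as an assumption here.
import requests

UDPIPE_API = 'https://lindat.mff.cuni.cz/services/udpipe/api/process'


def parse(text, lang, models=None):
    model = (models or {}).get(lang, lang)  # placeholder: map 'en' etc. to a real model name
    r = requests.post(UDPIPE_API, data={
        'model': model,
        'tokenizer': '',  # presence of the parameter enables the component
        'tagger': '',
        'parser': '',
        'data': text,
    })
    r.raise_for_status()
    return r.json()['result']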
Example #8
def main():
    print('Shortening sentences according to the number of subtokens ...')
    cdroot()
    files = sorted(
        x
        for x in glob.glob('data/iwpt2020/train-dev-combined/**/train.conllu')
        if 'short' not in x)
    shorten(files)
    files = sorted(
        x for x in glob.glob('data/iwpt2020/train-dev-combined/**/dev.conllu')
        if 'short' not in x)
    shorten(files)
    files = sorted(
        x for x in glob.glob('data/iwpt2020/test-udpipe/*.fixed.conllu')
        if 'short' not in x)
    shorten(files)
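shorten itself is not shown. Reusing load_conll and len_of_sent from the other examples, one possible sketch writes a '.short.conllu' copy next to each input and simply drops sentences whose mBERT subtoken count exceeds a limit; whether the real helper drops or splits such sentences is an assumption.
def shorten(files, limit=256):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    config = AutoConfig.from_pretrained('bert-base-multilingual-cased')
    for f in files:
        with open(f.replace('.conllu', '.short.conllu'), 'w') as out:
            for sent in load_conll(f):
                # FORM column of ordinary tokens only (skip comments, ranges and empty nodes)
                words = [line.split('\t')[1] for line in sent.split('\n')
                         if line and not line.startswith('#')
                         and '-' not in line.split('\t')[0]
                         and '.' not in line.split('\t')[0]]
                if len_of_sent(words, config, tokenizer) > limit:
                    continue  # this sketch simply drops overlong sentences
                out.write(sent + '\n\n')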
Example #9
def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    # total = load_json('data/model/iwpt2020/dev.json')
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        # if idx + 1 < 13:
        #     continue
        # if langcode != 'ar':
        #     continue
        # if langcode in total:
        #     continue
        # run(langcode, do_train=True, mbert=False, do_eval=False)
        do_train = len(sys.argv) == 1 or sys.argv[1] != 'predict'
        run(langcode, do_train=do_train, mbert=False, do_eval=True)
Example #10
def main():
    cdroot()
    submission = 'data/model/iwpt2020/emorynlp-submission'
    os.makedirs(submission, exist_ok=True)
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        src = f'data/model/iwpt2020/{langcode}/{langcode}.conllu'
        dst = f'{submission}/{langcode}.conllu'
        tmp = f'/home/hhe43/tmp/{langcode}.conllu'
        copyfile(src, tmp)
        remove_complete_edges(tmp, tmp)
        remove_collapse_edges(tmp, tmp)
        src = tmp
        conllu_quick_fix(src, dst)
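conllu_quick_fix is not shown either; judging from the fix applied by hand in Example #14, a plausible sketch fills an empty DEPS column from the basic HEAD:DEPREL pair so the file passes validation. Treat this as a guess.
def conllu_quick_fix(src_path, dst_path):
    with open(src_path) as src, open(dst_path, 'w') as dst:
        for line in src:
            cells = line.rstrip('\n').split('\t')
            if (len(cells) == 10 and '-' not in cells[0]
                    and '.' not in cells[0] and cells[8] == '_'):
                cells[8] = cells[6] + ':' + cells[7]  # copy the basic edge into DEPS
                line = '\t'.join(cells) + '\n'
            dst.write(line)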
Example #11
def main():
    cdroot()
    path = 'all_stat.pkl'
    try:
        stat = load_pickle(path)
    except FileNotFoundError:
        stat = dict(
            (model, fetch_all_result(url)) for model, url in online.items())
        save_pickle(stat, path)
    stat['bert_sdp']['Czech']['UAS'] = 75.95
    stat['bert_sdp']['Czech']['LAS'] = 72.96
    stat['bert_sdp']['Czech']['CLAS'] = 66.06
    stat['bert_sdp']['Czech']['EULAS'] = 69.79
    stat['bert_sdp']['Czech']['ELAS'] = 68.47

    stat['bert_sdp']['Finnish']['UAS'] = 85.89
    stat['bert_sdp']['Finnish']['LAS'] = 84.29
    stat['bert_sdp']['Finnish']['CLAS'] = 80.89
    stat['bert_sdp']['Finnish']['EULAS'] = 80.10
    stat['bert_sdp']['Finnish']['ELAS'] = 79.38

    stat['bert_ens']['Czech']['UAS'] = 86.83
    stat['bert_ens']['Czech']['LAS'] = 83.58
    stat['bert_ens']['Czech']['CLAS'] = 80.84
    stat['bert_ens']['Czech']['EULAS'] = 53.45
    stat['bert_ens']['Czech']['ELAS'] = 51.05

    print(' & ' + ' & '.join('\\multicolumn{1}{c|}{\\bf AR}'.replace(
        'AR', ISO639_to_code[lang].upper())
                             for lang in list(stat['bert_sdp'].keys())[:-1]),
          end='\\\\\n\\hline\\hline\n')
    for model in stat.keys():
        if model.endswith('dep'):
            name = 'DTP'
        elif model.endswith('sdp'):
            name = 'DGP'
        else:
            name = 'ENS'
        print(name, end=' & ')
        print(' & '.join(f'{s:.2f}' if s else "-"
                         for s in [x['EULAS']
                                   for x in stat[model].values()][:-1]),
              end=' \\\\\n')
        if name == 'ENS':
            print('\\hline')
Example #12
def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    num_multi_heads = Counter()
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        stat = head_stat(langcode)
        num_multi_heads[langcode] = (sum(stat.values()) - stat[1]) / sum(
            stat.values())
        # plot_bar(stat, 'num of heads', 'num of tokens')
    plot_bar(num_multi_heads,
             'language',
             '# tokens with multiple heads',
             ann_y=False,
             sort_y=True,
             save='heads.pdf')
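head_stat(langcode) evidently returns a Counter mapping a number of heads to how many tokens have that many heads, in the spirit of the counting loop in Example #5. A sketch under that assumption; the exact file it reads is a guess.
def head_stat(langcode):
    stat = Counter()
    dataset = CoNLLSentence.from_file(
        f'data/iwpt2020/train-dev-combined/{langcode}/train.conllu', conllu=True)
    for sent in dataset:
        for word in sent:
            stat[len(word.head)] += 1  # number of enhanced heads of this token
    return stat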
Example #13
def main():
    cdroot()
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(testfiles)} {basename}')
        run(langcode)
    # trn, dev and tst are presumably module-level per-split statistics populated elsewhere
    for stat, split in zip([trn, dev, tst], ['trn', 'dev', 'tst']):
        print(split)
        for k, v in stat.items():
            print(f'{k}\t', end='')
            for lang, num in v.items():
                print(lang, end='\t')
            print()
            for lang, num in v.items():
                print(num, end='\t')
            print()
        print()
Example #14
def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-udpipe/*.conllu'))
    fs = [f for f in fs if 'fixed' not in f]
    for idx, f in enumerate(fs):
        langcode = f.split('/')[-2]
        if len(langcode) != 2:
            langcode = os.path.basename(f).split('.')[0]
        if langcode != 'en':
            continue
        print(f'{idx + 1:02d}/{len(fs)} {langcode}')
        with open(f.replace('.conllu', '.fixed.conllu'), 'w') as out:
            for sent in load_conll(f):
                for line in sent.split('\n'):
                    if line.startswith('#') or '-' in line.split('\t')[0]:
                        out.write(line)
                    else:
                        cells = line.split('\t')
                        cells[8] = cells[6] + ':' + cells[7]
                        out.write('\t'.join(cells))
                    out.write('\n')
                out.write('\n')
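The way load_conll is consumed in Examples #14 and #15 (iterating over sentence strings and splitting them on newlines) suggests it simply yields one sentence block at a time. A minimal sketch consistent with that usage:
def load_conll(path):
    with open(path) as src:
        for block in src.read().split('\n\n'):
            block = block.strip()
            if block:
                yield block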
Example #15
def main():
    cdroot()
    input_file = 'data/parsing/ko_penn-ud-revised/ko_penn-ud-revised.conllu'
    train_file = input_file.replace('.conllu', '.train.conllu')
    dev_file = input_file.replace('.conllu', '.dev.conllu')
    test_file = input_file.replace('.conllu', '.test.conllu')
    num_tokens = {}
    num_sents = {}
    for split in 'train', 'dev', 'test':
        num_tokens[split] = 0
        num_sents[split] = 0
        file = input_file.replace('.conllu', f'.{split}.conllu')
        sents = load_conll(file)
        for each in sents:
            if not each.strip():
                continue
            num_sents[split] += 1
            for line in each.split('\n'):
                if line.startswith('#') or len(line.split('\t')) != 10:
                    continue
                num_tokens[split] += 1
    print(f'# of sents: {num_sents}')
    print(f'# of tokens: {num_tokens}')
Example #16
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-26 18:19
from edparser.metrics.parsing.iwpt20_eval import evaluate
from edparser.components.parsers.biaffine_parser import BiaffineTransformerDependencyParser
from iwpt2020 import cdroot

cdroot()
save_dir = 'data/model/iwpt2020/en_bert_large_dep'
parser = BiaffineTransformerDependencyParser()
dataset = 'data/iwpt2020/train-dev/'
trnfile = f'{dataset}UD_English-EWT/en_ewt-ud-train.enhanced_collapse_empty_nodes.conllu'
devfile = f'{dataset}UD_English-EWT/en_ewt-ud-dev.enhanced_collapse_empty_nodes.conllu'
testfile = devfile
# parser.fit(trnfile,
#            devfile,
#            save_dir, 'bert-large-uncased-whole-word-masking',
#            batch_size=128,
#            warmup_steps_ratio=.1,
#            samples_per_batch=150,
#            # max_samples_per_batch=32,
#            transformer_dropout=.33,
#            learning_rate=2e-3,
#            learning_rate_transformer=1e-5,
#            epochs=1
#            )
# parser.load(save_dir, tree='tarjan')
output = f'{testfile.replace(".conllu", ".pred.conllu")}'
# parser.evaluate(devfile, save_dir, warm_up=False, output=output)
score = evaluate(testfile, output)
print(score)
Example #17
def main():
    cdroot()
Example #18
def main():
    print('Combining treebanks of the same language ...')
    cdroot()
    ISO639_1 = {
        'ab': 'Abkhaz',
        'aa': 'Afar',
        'af': 'Afrikaans',
        'ak': 'Akan',
        'sq': 'Albanian',
        'am': 'Amharic',
        'ar': 'Arabic',
        'an': 'Aragonese',
        'hy': 'Armenian',
        'as': 'Assamese',
        'av': 'Avaric',
        'ae': 'Avestan',
        'ay': 'Aymara',
        'az': 'Azerbaijani',
        'bm': 'Bambara',
        'ba': 'Bashkir',
        'eu': 'Basque',
        'be': 'Belarusian',
        'bn': 'Bengali',
        'bh': 'Bihari',
        'bi': 'Bislama',
        'bs': 'Bosnian',
        'br': 'Breton',
        'bg': 'Bulgarian',
        'my': 'Burmese',
        'ca': 'Catalan; Valencian',
        'ch': 'Chamorro',
        'ce': 'Chechen',
        'ny': 'Chichewa; Chewa; Nyanja',
        'zh': 'Chinese',
        'cv': 'Chuvash',
        'kw': 'Cornish',
        'co': 'Corsican',
        'cr': 'Cree',
        'hr': 'Croatian',
        'cs': 'Czech',
        'da': 'Danish',
        'dv': 'Divehi; Maldivian;',
        'nl': 'Dutch',
        'dz': 'Dzongkha',
        'en': 'English',
        'eo': 'Esperanto',
        'et': 'Estonian',
        'ee': 'Ewe',
        'fo': 'Faroese',
        'fj': 'Fijian',
        'fi': 'Finnish',
        'fr': 'French',
        'ff': 'Fula',
        'gl': 'Galician',
        'ka': 'Georgian',
        'de': 'German',
        'el': 'Greek, Modern',
        'gn': 'Guaraní',
        'gu': 'Gujarati',
        'ht': 'Haitian',
        'ha': 'Hausa',
        'he': 'Hebrew (modern)',
        'hz': 'Herero',
        'hi': 'Hindi',
        'ho': 'Hiri Motu',
        'hu': 'Hungarian',
        'ia': 'Interlingua',
        'id': 'Indonesian',
        'ie': 'Interlingue',
        'ga': 'Irish',
        'ig': 'Igbo',
        'ik': 'Inupiaq',
        'io': 'Ido',
        'is': 'Icelandic',
        'it': 'Italian',
        'iu': 'Inuktitut',
        'ja': 'Japanese',
        'jv': 'Javanese',
        'kl': 'Kalaallisut',
        'kn': 'Kannada',
        'kr': 'Kanuri',
        'ks': 'Kashmiri',
        'kk': 'Kazakh',
        'km': 'Khmer',
        'ki': 'Kikuyu, Gikuyu',
        'rw': 'Kinyarwanda',
        'ky': 'Kirghiz, Kyrgyz',
        'kv': 'Komi',
        'kg': 'Kongo',
        'ko': 'Korean',
        'ku': 'Kurdish',
        'kj': 'Kwanyama, Kuanyama',
        'la': 'Latin',
        'lb': 'Luxembourgish',
        'lg': 'Luganda',
        'li': 'Limburgish',
        'ln': 'Lingala',
        'lo': 'Lao',
        'lt': 'Lithuanian',
        'lu': 'Luba-Katanga',
        'lv': 'Latvian',
        'gv': 'Manx',
        'mk': 'Macedonian',
        'mg': 'Malagasy',
        'ms': 'Malay',
        'ml': 'Malayalam',
        'mt': 'Maltese',
        'mi': 'Māori',
        'mr': 'Marathi (Marāṭhī)',
        'mh': 'Marshallese',
        'mn': 'Mongolian',
        'na': 'Nauru',
        'nv': 'Navajo, Navaho',
        'nb': 'Norwegian Bokmål',
        'nd': 'North Ndebele',
        'ne': 'Nepali',
        'ng': 'Ndonga',
        'nn': 'Norwegian Nynorsk',
        'no': 'Norwegian',
        'ii': 'Nuosu',
        'nr': 'South Ndebele',
        'oc': 'Occitan',
        'oj': 'Ojibwe, Ojibwa',
        'cu': 'Old Church Slavonic',
        'om': 'Oromo',
        'or': 'Oriya',
        'os': 'Ossetian, Ossetic',
        'pa': 'Panjabi, Punjabi',
        'pi': 'Pāli',
        'fa': 'Persian',
        'pl': 'Polish',
        'ps': 'Pashto, Pushto',
        'pt': 'Portuguese',
        'qu': 'Quechua',
        'rm': 'Romansh',
        'rn': 'Kirundi',
        'ro': 'Romanian, Moldovan',
        'ru': 'Russian',
        'sa': 'Sanskrit (Saṁskṛta)',
        'sc': 'Sardinian',
        'sd': 'Sindhi',
        'se': 'Northern Sami',
        'sm': 'Samoan',
        'sg': 'Sango',
        'sr': 'Serbian',
        'gd': 'Scottish Gaelic',
        'sn': 'Shona',
        'si': 'Sinhala, Sinhalese',
        'sk': 'Slovak',
        'sl': 'Slovene',
        'so': 'Somali',
        'st': 'Southern Sotho',
        'es': 'Spanish; Castilian',
        'su': 'Sundanese',
        'sw': 'Swahili',
        'ss': 'Swati',
        'sv': 'Swedish',
        'ta': 'Tamil',
        'te': 'Telugu',
        'tg': 'Tajik',
        'th': 'Thai',
        'ti': 'Tigrinya',
        'bo': 'Tibetan',
        'tk': 'Turkmen',
        'tl': 'Tagalog',
        'tn': 'Tswana',
        'to': 'Tonga',
        'tr': 'Turkish',
        'ts': 'Tsonga',
        'tt': 'Tatar',
        'tw': 'Twi',
        'ty': 'Tahitian',
        'ug': 'Uighur, Uyghur',
        'uk': 'Ukrainian',
        'ur': 'Urdu',
        'uz': 'Uzbek',
        've': 'Venda',
        'vi': 'Vietnamese',
        'vo': 'Volapük',
        'wa': 'Walloon',
        'cy': 'Welsh',
        'wo': 'Wolof',
        'fy': 'Western Frisian',
        'xh': 'Xhosa',
        'yi': 'Yiddish',
        'yo': 'Yoruba',
        'za': 'Zhuang, Chuang',
        'zu': 'Zulu',
    }
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    trainfiles = sorted(glob.glob('data/iwpt2020/train-dev/*'))
    trainfiles = [x for x in trainfiles if glob.glob(x + '/*.conllu')]
    print(trainfiles)
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        lang = ISO639_1.get(langcode)
        fs = [x for x in trainfiles if lang.lower() in x.lower()]
        print(
            f'{idx + 1:02d}/{len(testfiles)} {basename} {[os.path.basename(x) for x in fs]}'
        )
        folder = f'data/iwpt2020/train-dev-combined/{langcode}'
        os.makedirs(folder, exist_ok=True)
        for part in ['train', 'dev']:
            combine(fs, f'{part}.enhanced_collapse_empty_nodes.conllu',
                    f'{folder}/{part}.conllu')
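combine presumably concatenates the per-treebank files of one language into a single training (or dev) file. A straightforward sketch; matching the file inside each treebank folder by suffix is an assumption.
def combine(folders, suffix, output):
    with open(output, 'w') as out:
        for folder in folders:
            for f in sorted(glob.glob(f'{folder}/*{suffix}')):
                with open(f) as src:
                    content = src.read().strip()
                    if content:
                        out.write(content + '\n\n')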