import math

from iwpt2020 import cdroot


def main():
    cdroot()
    input_file = 'data/parsing/ko_penn-ud-revised/ko_penn-ud-revised.conllu'
    train_file = input_file.replace('.conllu', '.train.conllu')
    dev_file = input_file.replace('.conllu', '.dev.conllu')
    test_file = input_file.replace('.conllu', '.test.conllu')
    with open(input_file) as src, \
            open(train_file, 'w') as train, \
            open(dev_file, 'w') as dev, \
            open(test_file, 'w') as test:
        sents = src.read().split('\n\n')
        # Group sentences by their source document id, taken from the first
        # comment line of each block (e.g. "# sent_id = fileid.N").
        fileid_sents = {}
        for sent in sents:
            sent = sent.strip()
            if not sent:
                continue
            fileid = sent.split('\n')[0].split()[3].split('.')[0]
            fileid_sents.setdefault(fileid, []).append(sent)
        # Split each document roughly 79/12/9 into train/dev/test, so every
        # document contributes to all three splits.
        counts = [0] * 3
        for sents_in_file in fileid_sents.values():
            total = len(sents_in_file)
            t = math.ceil(total * .79)
            d = math.floor(total * .9105)
            splits = [sents_in_file[:t], sents_in_file[t:d], sents_in_file[d:]]
            for idx, (f, ss) in enumerate(zip([train, dev, test], splits)):
                if ss:
                    f.write('\n\n'.join(ss))
                    f.write('\n\n')
                    counts[idx] += len(ss)
        print(counts)
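# cdroot() comes from the repo's iwpt2020 package and is called at the top of
# every script here. A minimal sketch of what it presumably does (assumption:
# it walks up from the working directory until it finds the project root, then
# chdirs there so the relative data/ paths resolve):
def cdroot_sketch(marker='data'):
    import os
    cur = os.path.abspath(os.getcwd())
    while cur != os.path.dirname(cur):
        if os.path.isdir(os.path.join(cur, marker)):
            os.chdir(cur)
            return
        cur = os.path.dirname(cur)
    raise FileNotFoundError(f'no directory named {marker!r} found above the cwd')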
import glob
import os
from collections import Counter

from transformers import AutoConfig, AutoTokenizer

from iwpt2020 import cdroot


def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-udpipe/??.conllu'))
    pretrained = 'bert-base-multilingual-cased'
    tokenizer = AutoTokenizer.from_pretrained(pretrained)
    config = AutoConfig.from_pretrained(pretrained)
    # Count, per language, how many sentences exceed 256 subtokens once
    # tokenized for mBERT. read_conll and len_of_sent are repo-local helpers.
    stat = Counter()
    for idx, f in enumerate(fs):
        langcode = f.split('/')[-2]
        if len(langcode) != 2:
            langcode = os.path.basename(f).split('.')[0]
        # if langcode != 'fi':
        #     continue
        print(f'{idx + 1:02d}/{len(fs)} {langcode}')
        total = 0
        for sent in read_conll(f):
            words = [x[1] for x in sent]
            length = len_of_sent(words, config, tokenizer)
            if length > 256:
                stat[langcode] += 1
            total += 1
        stat[langcode] += 0  # ensure every language appears in the output
        # stat[langcode] /= total
        # stat[langcode] *= 100
    # Print a tab-separated table: language codes, then counts.
    for k in sorted(stat):
        print(k, end='\t')
    print()
    for k in sorted(stat):
        print(stat[k], end='\t')
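# len_of_sent() is defined elsewhere in the repo; a minimal sketch under the
# assumption that it measures how many subtokens the transformer would see for
# a sentence ([CLS] + wordpieces + [SEP]):
def len_of_sent_sketch(words, config, tokenizer):
    # config could cap this at config.max_position_embeddings, but the census
    # above only compares the raw count against 256.
    return 2 + sum(len(tokenizer.tokenize(w)) for w in words)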
import os

from networkx.drawing.nx_agraph import to_agraph

from iwpt2020 import cdroot


def main():
    cdroot()
    # graph[i] is the head of token i + 1; 0 denotes the root.
    graph = [0, 5, 1, 5, 5, 1, 0, 8, 10, 8, 8, 12, 13, 10]
    G = visualize(graph)  # repo-local helper building a graph from the heads
    A = to_agraph(G)
    A.layout('dot')
    png = 'data/test/dep.png'
    A.draw(png)
    os.system(f'open {png}')  # macOS image preview
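# visualize() is repo-local; a minimal sketch, assuming it turns the head array
# into a networkx DiGraph that graphviz can lay out:
import networkx as nx


def visualize_sketch(heads):
    G = nx.DiGraph()
    for dependent, head in enumerate(heads, start=1):
        G.add_edge(head, dependent)  # arrows point from head to dependent
    return G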
import glob
import os

from iwpt2020 import cdroot


def main():
    cdroot()
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(testfiles)} {basename}')
        if idx + 1 != 3:
            continue  # debug switch: only process the third language
        run(langcode)  # repo-local per-language entry point
from collections import Counter

from iwpt2020 import cdroot


def main():
    cdroot()
    dataset = CoNLLSentence.from_file(
        'data/iwpt2020/train-dev/UD_Arabic-PADT/ar_padt-ud-train.enhanced_collapse_empty_nodes.conllu',
        conllu=True)
    # Histogram: how many tokens have 1, 2, 3, ... heads in the enhanced graph.
    data = Counter()
    for each in dataset:
        for word in each:
            data[len(word.head)] += 1
    plot_bar(data, 'num of heads', 'num of samples')  # repo-local plotting helper
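# plot_bar() is a repo-local helper; a minimal matplotlib sketch of the simple
# variant used here (other call sites pass extra keyword arguments such as
# ann_y, sort_y and save, which this sketch omits):
import matplotlib.pyplot as plt


def plot_bar_sketch(counter, xlabel, ylabel):
    keys = sorted(counter)
    plt.bar([str(k) for k in keys], [counter[k] for k in keys])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()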
import glob

from iwpt2020 import cdroot


def main():
    print('Collapse empty nodes for all conllu files ...')
    cdroot()
    conllu_files = glob.glob('data/iwpt2020/**/*.conllu', recursive=True)
    # Skip files that are already the output of a previous run.
    conllu_files = [
        x for x in conllu_files if 'enhanced_collapse_empty_nodes' not in x
    ]
    for idx, f in enumerate(conllu_files):
        print(f'\r{idx + 1}/{len(conllu_files)} {f}', end='')
        enhanced_collapse_empty_nodes(
            f, f.replace('.conllu', '.enhanced_collapse_empty_nodes.conllu'))
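# Background on the collapsing, as the IWPT 2020 evaluation defines it (stated
# here as context, not verified against this repo's implementation): a path
# through an empty node, e.g. 2 -conj-> 5.1 -obj-> 7, is replaced by a direct
# edge whose label joins the two relations, 2 -conj>obj-> 7, so the parser
# never has to predict empty nodes explicitly.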
import glob
import os

from iwpt2020 import cdroot


def main():
    print('Preprocess blind test data with UDPipe ...')
    cdroot()
    files = glob.glob('data/iwpt2020/test-blind/*.txt')
    os.makedirs('data/iwpt2020/test-udpipe', exist_ok=True)
    for idx, txt in enumerate(files):
        basename = os.path.basename(txt)
        print(f'{idx + 1}/{len(files)} {basename}')
        lang = basename.split('.')[0]
        with open(txt) as src, \
                open(f'data/iwpt2020/test-udpipe/{lang}.conllu', 'w') as out:
            text = src.read()
            conllu = parse(text, lang)  # repo-local UDPipe wrapper
            out.write(conllu)
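# parse() is repo-local; a minimal sketch, assuming it wraps the public UDPipe
# REST API at lindat.mff.cuni.cz (the real helper may instead run a local
# udpipe binary, and may map the two-letter code to a full model name):
import requests


def parse_sketch(text, lang):
    r = requests.post(
        'https://lindat.mff.cuni.cz/services/udpipe/api/process',
        data={'model': lang, 'tokenizer': '', 'tagger': '', 'parser': '',
              'data': text})
    r.raise_for_status()
    return r.json()['result']  # the service returns CoNLL-U in 'result'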
import glob

from iwpt2020 import cdroot


def main():
    print('Shortening sentences according to the number of subtokens ...')
    cdroot()
    # shorten() is a repo-local helper; the 'short' filter skips its own output.
    files = sorted(
        x for x in glob.glob('data/iwpt2020/train-dev-combined/**/train.conllu')
        if 'short' not in x)
    shorten(files)
    files = sorted(
        x for x in glob.glob('data/iwpt2020/train-dev-combined/**/dev.conllu')
        if 'short' not in x)
    shorten(files)
    files = sorted(
        x for x in glob.glob('data/iwpt2020/test-udpipe/*.fixed.conllu')
        if 'short' not in x)
    shorten(files)
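# shorten() is not shown in this file. Judging from the 'short' filter above
# and the 256-subtoken census elsewhere in the repo, it presumably rewrites
# each file into a sibling *.short.conllu in which over-long sentences are
# reduced to fit the transformer's subtoken budget; the exact policy
# (splitting vs. truncating) is an implementation detail of the helper.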
import glob
import os
import sys

from iwpt2020 import cdroot


def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    # total = load_json('data/model/iwpt2020/dev.json')
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        # Debug switches kept for reference:
        # if idx + 1 < 13:
        #     continue
        # if langcode != 'ar':
        #     continue
        # if langcode in total:
        #     continue
        # run(langcode, do_train=True, mbert=False, do_eval=False)
        # Train unless the script is invoked with the single argument "predict".
        do_train = len(sys.argv) == 1 or sys.argv[1] != 'predict'
        run(langcode, do_train=do_train, mbert=False, do_eval=True)
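# Usage, derived from the argv check above (the script name is illustrative):
#   python run_all_languages.py           # train, then evaluate each language
#   python run_all_languages.py predict   # skip training, evaluate only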
import glob
import os
from shutil import copyfile

from iwpt2020 import cdroot


def main():
    cdroot()
    submission = 'data/model/iwpt2020/emorynlp-submission'
    os.makedirs(submission, exist_ok=True)
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        src = f'data/model/iwpt2020/{langcode}/{langcode}.conllu'
        dst = f'{submission}/{langcode}.conllu'
        # Post-process a copy in a scratch directory (path hard-coded to the
        # author's home), then run a final fix-up into the submission folder.
        tmp = f'/home/hhe43/tmp/{langcode}.conllu'
        copyfile(src, tmp)
        remove_complete_edges(tmp, tmp)
        remove_collapse_edges(tmp, tmp)
        src = tmp
        conllu_quick_fix(src, dst)
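# conllu_quick_fix() is repo-local; a guess at its core, modeled on the DEPS
# repair loop used for the UDPipe test files elsewhere in this repo (assume it
# fills an empty DEPS column from HEAD:DEPREL so the validator accepts the file):
def conllu_quick_fix_sketch(src, dst):
    with open(src) as fin, open(dst, 'w') as out:
        for line in fin:
            cells = line.rstrip('\n').split('\t')
            # Only plain token lines (10 columns, no MWT range or empty-node
            # id) with a missing DEPS column are touched.
            if len(cells) == 10 and cells[8] == '_' \
                    and '-' not in cells[0] and '.' not in cells[0]:
                cells[8] = cells[6] + ':' + cells[7]
            out.write('\t'.join(cells) + '\n')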
from iwpt2020 import cdroot


def main():
    cdroot()
    path = 'all_stat.pkl'
    try:
        stat = load_pickle(path)
    except FileNotFoundError:
        # Fetch the official result pages once, then cache them locally.
        stat = dict((model, fetch_all_result(url)) for model, url in online.items())
        save_pickle(stat, path)
    # Hand-patched scores.
    stat['bert_sdp']['Czech']['UAS'] = 75.95
    stat['bert_sdp']['Czech']['LAS'] = 72.96
    stat['bert_sdp']['Czech']['CLAS'] = 66.06
    stat['bert_sdp']['Czech']['EULAS'] = 69.79
    stat['bert_sdp']['Czech']['ELAS'] = 68.47
    stat['bert_sdp']['Finnish']['UAS'] = 85.89
    stat['bert_sdp']['Finnish']['LAS'] = 84.29
    stat['bert_sdp']['Finnish']['CLAS'] = 80.89
    stat['bert_sdp']['Finnish']['EULAS'] = 80.10
    stat['bert_sdp']['Finnish']['ELAS'] = 79.38
    stat['bert_ens']['Czech']['UAS'] = 86.83
    stat['bert_ens']['Czech']['LAS'] = 83.58
    stat['bert_ens']['Czech']['CLAS'] = 80.84
    stat['bert_ens']['Czech']['EULAS'] = 53.45
    stat['bert_ens']['Czech']['ELAS'] = 51.05
    # Emit a LaTeX table: a header row of language codes, then one row of
    # EULAS scores per model.
    print(' & ' + ' & '.join(
        '\\multicolumn{1}{c|}{\\bf ' + ISO639_to_code[lang].upper() + '}'
        for lang in list(stat['bert_sdp'].keys())[:-1]),
        end='\\\\\n\\hline\\hline\n')
    for model in stat:
        if model.endswith('dep'):
            name = 'DTP'
        elif model.endswith('sdp'):
            name = 'DGP'
        else:
            name = 'ENS'
        print(name, end=' & ')
        print(' & '.join(f'{s:.2f}' if s else '-'
                         for s in [x['EULAS'] for x in stat[model].values()][:-1]),
              end=' \\\\\n')
        if name == 'ENS':
            print('\\hline')
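# Illustrative shape of the emitted LaTeX (scores shown as placeholders):
#    & \multicolumn{1}{c|}{\bf AR} & \multicolumn{1}{c|}{\bf BG} & ... \\
#   \hline\hline
#   DTP & xx.xx & xx.xx & ... \\
#   DGP & xx.xx & xx.xx & ... \\
#   ENS & xx.xx & xx.xx & ... \\
#   \hline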
import glob
import os
from collections import Counter

from iwpt2020 import cdroot


def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    # For every language, the fraction of tokens that carry more than one head
    # in the enhanced graph. head_stat is a repo-local helper returning a
    # {num_heads -> num_tokens} counter.
    num_multi_heads = Counter()
    for idx, txt in enumerate(fs):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(fs)} {basename}')
        stat = head_stat(langcode)
        num_multi_heads[langcode] = (sum(stat.values()) - stat[1]) / sum(stat.values())
        # plot_bar(stat, 'num of heads', 'num of tokens')
    plot_bar(num_multi_heads, 'language', '# tokens with multiple heads',
             ann_y=False, sort_y=True, save='heads.pdf')
import glob
import os

from iwpt2020 import cdroot


def main():
    cdroot()
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        print(f'{idx + 1:02d}/{len(testfiles)} {basename}')
        run(langcode)
    # trn, dev and tst are presumably module-level stats that run() fills in
    # ({metric -> {language -> number}}); print them as tab-separated tables.
    for stat, split in zip([trn, dev, tst], ['trn', 'dev', 'tst']):
        print(split)
        for k, v in stat.items():
            print(f'{k}\t', end='')
            for lang, num in v.items():
                print(lang, end='\t')
            print()
            for lang, num in v.items():
                print(num, end='\t')
            print()
        print()
import glob
import os

from iwpt2020 import cdroot


def main():
    cdroot()
    fs = sorted(glob.glob('data/iwpt2020/test-udpipe/*.conllu'))
    fs = [f for f in fs if 'fixed' not in f]  # skip previous output
    for idx, f in enumerate(fs):
        langcode = f.split('/')[-2]
        if len(langcode) != 2:
            langcode = os.path.basename(f).split('.')[0]
        if langcode != 'en':
            continue  # debug switch: restricted to English
        print(f'{idx + 1:02d}/{len(fs)} {langcode}')
        with open(f.replace('.conllu', '.fixed.conllu'), 'w') as out:
            for sent in load_conll(f):
                for line in sent.split('\n'):
                    if line.startswith('#') or '-' in line.split('\t')[0]:
                        # Comments and multi-word token ranges pass through.
                        out.write(line)
                    else:
                        # Copy the basic HEAD:DEPREL into the DEPS column.
                        cells = line.split('\t')
                        cells[8] = cells[6] + ':' + cells[7]
                        out.write('\t'.join(cells))
                    out.write('\n')
                out.write('\n')
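# Example of the DEPS fix on one token line (columns abbreviated):
#   before: 4  dogs  dog  NOUN  _  _  2  obj  _      _
#   after:  4  dogs  dog  NOUN  _  _  2  obj  2:obj  _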
from iwpt2020 import cdroot


def main():
    cdroot()
    input_file = 'data/parsing/ko_penn-ud-revised/ko_penn-ud-revised.conllu'
    # Count sentences and tokens in each split produced by the splitter above.
    num_tokens = {}
    num_sents = {}
    for split in 'train', 'dev', 'test':
        num_tokens[split] = 0
        num_sents[split] = 0
        file = input_file.replace('.conllu', f'.{split}.conllu')
        sents = load_conll(file)
        for each in sents:
            if not each.strip():
                continue
            num_sents[split] += 1
            for line in each.split('\n'):
                # Only plain 10-column token lines count as tokens.
                if line.startswith('#') or len(line.split('\t')) != 10:
                    continue
                num_tokens[split] += 1
    print(f'# of sents: {num_sents}')
    print(f'# of tokens: {num_tokens}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-26 18:19
from edparser.metrics.parsing.iwpt20_eval import evaluate
from edparser.components.parsers.biaffine_parser import BiaffineTransformerDependencyParser
from iwpt2020 import cdroot

cdroot()
save_dir = 'data/model/iwpt2020/en_bert_large_dep'
parser = BiaffineTransformerDependencyParser()
dataset = 'data/iwpt2020/train-dev/'
trnfile = f'{dataset}UD_English-EWT/en_ewt-ud-train.enhanced_collapse_empty_nodes.conllu'
devfile = f'{dataset}UD_English-EWT/en_ewt-ud-dev.enhanced_collapse_empty_nodes.conllu'
testfile = devfile
# parser.fit(trnfile,
#            devfile,
#            save_dir, 'bert-large-uncased-whole-word-masking',
#            batch_size=128,
#            warmup_steps_ratio=.1,
#            samples_per_batch=150,
#            # max_samples_per_batch=32,
#            transformer_dropout=.33,
#            learning_rate=2e-3,
#            learning_rate_transformer=1e-5,
#            epochs=1
#            )
# parser.load(save_dir, tree='tarjan')
output = testfile.replace('.conllu', '.pred.conllu')
# parser.evaluate(devfile, save_dir, warm_up=False, output=output)
score = evaluate(testfile, output)
print(score)
def main():
    cdroot()
import glob
import os

from iwpt2020 import cdroot


def main():
    print('Combining treebanks of the same language ...')
    cdroot()
    # ISO 639-1 code -> language name, used to match a test language code to
    # its UD treebank directories.
    ISO639_1 = {
        'ab': 'Abkhaz', 'aa': 'Afar', 'af': 'Afrikaans', 'ak': 'Akan',
        'sq': 'Albanian', 'am': 'Amharic', 'ar': 'Arabic', 'an': 'Aragonese',
        'hy': 'Armenian', 'as': 'Assamese', 'av': 'Avaric', 'ae': 'Avestan',
        'ay': 'Aymara', 'az': 'Azerbaijani', 'bm': 'Bambara', 'ba': 'Bashkir',
        'eu': 'Basque', 'be': 'Belarusian', 'bn': 'Bengali', 'bh': 'Bihari',
        'bi': 'Bislama', 'bs': 'Bosnian', 'br': 'Breton', 'bg': 'Bulgarian',
        'my': 'Burmese', 'ca': 'Catalan; Valencian', 'ch': 'Chamorro',
        'ce': 'Chechen', 'ny': 'Chichewa; Chewa; Nyanja', 'zh': 'Chinese',
        'cv': 'Chuvash', 'kw': 'Cornish', 'co': 'Corsican', 'cr': 'Cree',
        'hr': 'Croatian', 'cs': 'Czech', 'da': 'Danish',
        'dv': 'Divehi; Maldivian;', 'nl': 'Dutch', 'dz': 'Dzongkha',
        'en': 'English', 'eo': 'Esperanto', 'et': 'Estonian', 'ee': 'Ewe',
        'fo': 'Faroese', 'fj': 'Fijian', 'fi': 'Finnish', 'fr': 'French',
        'ff': 'Fula', 'gl': 'Galician', 'ka': 'Georgian', 'de': 'German',
        'el': 'Greek, Modern', 'gn': 'Guaraní', 'gu': 'Gujarati',
        'ht': 'Haitian', 'ha': 'Hausa', 'he': 'Hebrew (modern)',
        'hz': 'Herero', 'hi': 'Hindi', 'ho': 'Hiri Motu', 'hu': 'Hungarian',
        'ia': 'Interlingua', 'id': 'Indonesian', 'ie': 'Interlingue',
        'ga': 'Irish', 'ig': 'Igbo', 'ik': 'Inupiaq', 'io': 'Ido',
        'is': 'Icelandic', 'it': 'Italian', 'iu': 'Inuktitut',
        'ja': 'Japanese', 'jv': 'Javanese', 'kl': 'Kalaallisut',
        'kn': 'Kannada', 'kr': 'Kanuri', 'ks': 'Kashmiri', 'kk': 'Kazakh',
        'km': 'Khmer', 'ki': 'Kikuyu, Gikuyu', 'rw': 'Kinyarwanda',
        'ky': 'Kirghiz, Kyrgyz', 'kv': 'Komi', 'kg': 'Kongo', 'ko': 'Korean',
        'ku': 'Kurdish', 'kj': 'Kwanyama, Kuanyama', 'la': 'Latin',
        'lb': 'Luxembourgish', 'lg': 'Luganda', 'li': 'Limburgish',
        'ln': 'Lingala', 'lo': 'Lao', 'lt': 'Lithuanian',
        'lu': 'Luba-Katanga', 'lv': 'Latvian', 'gv': 'Manx',
        'mk': 'Macedonian', 'mg': 'Malagasy', 'ms': 'Malay',
        'ml': 'Malayalam', 'mt': 'Maltese', 'mi': 'Māori',
        'mr': 'Marathi (Marāṭhī)', 'mh': 'Marshallese', 'mn': 'Mongolian',
        'na': 'Nauru', 'nv': 'Navajo, Navaho', 'nb': 'Norwegian Bokmål',
        'nd': 'North Ndebele', 'ne': 'Nepali', 'ng': 'Ndonga',
        'nn': 'Norwegian Nynorsk', 'no': 'Norwegian', 'ii': 'Nuosu',
        'nr': 'South Ndebele', 'oc': 'Occitan', 'oj': 'Ojibwe, Ojibwa',
        'cu': 'Old Church Slavonic', 'om': 'Oromo', 'or': 'Oriya',
        'os': 'Ossetian, Ossetic', 'pa': 'Panjabi, Punjabi', 'pi': 'Pāli',
        'fa': 'Persian', 'pl': 'Polish', 'ps': 'Pashto, Pushto',
        'pt': 'Portuguese', 'qu': 'Quechua', 'rm': 'Romansh',
        'rn': 'Kirundi', 'ro': 'Romanian, Moldavan', 'ru': 'Russian',
        'sa': 'Sanskrit (Saṁskṛta)', 'sc': 'Sardinian', 'sd': 'Sindhi',
        'se': 'Northern Sami', 'sm': 'Samoan', 'sg': 'Sango',
        'sr': 'Serbian', 'gd': 'Scottish Gaelic', 'sn': 'Shona',
        'si': 'Sinhala, Sinhalese', 'sk': 'Slovak', 'sl': 'Slovene',
        'so': 'Somali', 'st': 'Southern Sotho', 'es': 'Spanish; Castilian',
        'su': 'Sundanese', 'sw': 'Swahili', 'ss': 'Swati', 'sv': 'Swedish',
        'ta': 'Tamil', 'te': 'Telugu', 'tg': 'Tajik', 'th': 'Thai',
        'ti': 'Tigrinya', 'bo': 'Tibetan', 'tk': 'Turkmen', 'tl': 'Tagalog',
        'tn': 'Tswana', 'to': 'Tonga', 'tr': 'Turkish', 'ts': 'Tsonga',
        'tt': 'Tatar', 'tw': 'Twi', 'ty': 'Tahitian',
        'ug': 'Uighur, Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu',
        'uz': 'Uzbek', 've': 'Venda', 'vi': 'Vietnamese', 'vo': 'Volapük',
        'wa': 'Walloon', 'cy': 'Welsh', 'wo': 'Wolof',
        'fy': 'Western Frisian', 'xh': 'Xhosa', 'yi': 'Yiddish',
        'yo': 'Yoruba', 'za': 'Zhuang, Chuang', 'zu': 'Zulu',
    }
    testfiles = sorted(glob.glob('data/iwpt2020/test-blind/*.txt'))
    trainfiles = sorted(glob.glob('data/iwpt2020/train-dev/*'))
    # Keep only directories that actually contain conllu files.
    trainfiles = [x for x in trainfiles if glob.glob(x + '/*.conllu')]
    print(trainfiles)
    for idx, txt in enumerate(testfiles):
        basename = os.path.basename(txt)
        langcode = basename.split('.')[0]
        lang = ISO639_1.get(langcode)
        # All treebank directories whose name contains the language name,
        # e.g. UD_English-EWT and UD_English-GUM for 'en'.
        fs = [x for x in trainfiles if lang.lower() in x.lower()]
        print(f'{idx + 1:02d}/{len(testfiles)} {basename} '
              f'{[os.path.basename(x) for x in fs]}')
        folder = f'data/iwpt2020/train-dev-combined/{langcode}'
        os.makedirs(folder, exist_ok=True)
        for part in ['train', 'dev']:
            combine(fs, f'{part}.enhanced_collapse_empty_nodes.conllu',
                    f'{folder}/{part}.conllu')
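# combine() is repo-local; a minimal sketch, assuming it concatenates the split
# with the given filename suffix from every matching treebank directory into a
# single file:
import glob


def combine_sketch(folders, name_suffix, out_path):
    with open(out_path, 'w') as out:
        for folder in folders:
            # e.g. matches en_ewt-ud-train.enhanced_collapse_empty_nodes.conllu
            for f in sorted(glob.glob(f'{folder}/*{name_suffix}')):
                with open(f) as fin:
                    content = fin.read().strip()
                    if content:
                        out.write(content + '\n\n')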