# for en, ch, entag, _ in tools.group_n_lines(in_file, n=4):
#     out_file.write(ch)


class dreye_phrases(luigi.ExternalTask):
    """External input task whose output is the dreye phrases text file."""

    def output(self):
        """Return the local target for 'data/dreye/dreye_phrases.txt'."""
        phrases_path = 'data/dreye/dreye_phrases.txt'
        return luigi.LocalTarget(phrases_path)


class dreye_sents(luigi.ExternalTask):
    """External input task whose output is the dreye sentences text file."""

    def output(self):
        """Return the local target for 'data/dreye/dreye_sents.txt'."""
        sents_path = 'data/dreye/dreye_sents.txt'
        return luigi.LocalTarget(sents_path)


# The three tasks below all read from dreye_npvp() (defined outside this
# chunk), each from a different input target key, with the same n=4, s=1
# slicing arguments.
# NOTE(review): judging by the commented-out loop above, index 1 of each
# 4-line group is presumably the Chinese line -- confirm against
# gentask.slice_lines_grouped_by_n.
dreye_purenp_zh = gentask.slice_lines_grouped_by_n(
    'dreye_purenp_zh',
    dreye_npvp(),
    'data/dreye/dreye.pure_np.zh.txt',
    n=4,
    s=1,
    input_target_key='pure_np',
)

dreye_np_zh = gentask.slice_lines_grouped_by_n(
    'dreye_np_zh',
    dreye_npvp(),
    'data/dreye/dreye.np.zh.txt',
    n=4,
    s=1,
    input_target_key='np',
)

dreye_vp_zh = gentask.slice_lines_grouped_by_n(
    'dreye_vp_zh',
    dreye_npvp(),
    'data/dreye/dreye.vp.zh.txt',
    n=4,
    s=1,
    input_target_key='vp',
)

# dreye_purenp_zh_tag = gentask.zhtoktag(
#     'dreye_purenp_zh_tag', dreye_purenp_zh(), 'data/dreye/dreye.pure_np.zh.tag.txt')
# dreye_np_zh_tag = gentask.zhtoktag(
#     'dreye_np_zh_tag', dreye_np_zh(), 'data/dreye/dreye.np.zh.tag.txt')
from pathlib import Path
import sys
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
from collections import Counter, defaultdict
from operator import itemgetter
from itertools import chain
from functools import reduce
import operator
import gentask_pattern

# Every derived file in this pipeline is written below this directory.
target_dir = Path('tgt_data/medal')

# Source corpus file that the rest of the chain is derived from.
orig_ench = gentask.localtarget_task('src_data/medal.ench.txt')

# Chain of tasks: each step takes an instance of the previous task as input.
ench = gentask.transformat_tab2lines(
    'line_sep_ench',
    orig_ench(),
    target_dir / 'ench.txt',
)

# n=3, s=0: slice index 0 out of every group of 3 lines.
en = gentask.slice_lines_grouped_by_n(
    'en',
    ench(),
    target_dir / 'en.txt',
    n=3,
    s=0,
)

en_unidecode = gentask.unidecode(
    'en_unidecode',
    en(),
    target_dir / 'en.unidecode.txt',
)

en_retok = gentask.word_tokenize(
    'en_retok',
    en_unidecode(),
    target_dir / 'en.retok.txt',
)

# NOTE(review): en_retok() is passed twice here -- presumably once as the
# training text and once as the text to truecase; confirm against
# gentask.truecase.
en_truecase = gentask.truecase(
    'medal_en_truecase',
    en_retok(),
    en_retok(),
    target_dir / 'en.truecase.txt',
)

# en_genia = gentask.geniatagger('medal_en_genia', en_truecase(),
#                                target_dir / 'en.genia.txt')
# en_genia_line_iih = gentask.genia_line_IIH(
#     'en_genia_line_iih', en_genia(), target_dir / 'en.genia.hiih.txt'
# )  # horizontal and IIH
def output(self): return luigi.LocalTarget(str(target_dir / 'chtag.chpreprcs.txt')) def run(self): with self.input().open('r') as inf, self.output().open('w') as outf: for zh, en, _ in tools.group_n_lines(inf, n=3): zh, en = zh.strip(), en.strip() zh = tools.zhsent_preprocess(zh) with contextlib.redirect_stdout(outf): print(zh) print(en) print() sbc4_zh = gentask.slice_lines_grouped_by_n('sbc4_zh', sbc4(), target_dir / 'ch.txt', n=3, s=0) sbc4_tag = gentask.slice_lines_grouped_by_n('sbc4_tag', sbc4(), target_dir / 'tag.txt', n=3, s=1) sbc4_tag_lm = gentask.lm( 'sbc4_tag_lm', sbc4_tag, target_dir / 'tag.lm', target_dir / 'tag.blm') sbc4_zhpreprocess_slash = gentask.transformat_line2slash( 'sbc4_zhpreprocess_slash', sbc4_zhpreprocess(), target_dir / 'zhpreprcs.slash.txt') sbc4_tok_tag_tm = gentask.phrasetable( 'sbc4_tok_tag_phrasetable', sbc4_zhpreprocess(),
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gentask
import luigi
from sbc4_tm_lm_tasks import sbc4
from pathlib import Path

# All evaluation artefacts are written below this directory.
base_dir = Path('data/zhtoktag_eval/')

# Train/test split over sbc4(): lines are grouped 30 at a time (3 * 10);
# slice(0, 27) of each group goes to the training file and slice(27, 30)
# to the test file.
# NOTE(review): presumably this is 10 three-line records per group with a
# 9:1 split -- confirm against gentask.slice_lines_grouped_by_n.
sbc4_train = gentask.slice_lines_grouped_by_n(
    'sbc4_train',
    sbc4(),
    base_dir / 'sbc4_train.txt',
    n=3 * 10,
    s=slice(0, 3 * 9),
)

sbc4_test = gentask.slice_lines_grouped_by_n(
    'sbc4_test',
    sbc4(),
    base_dir / 'sbc4_test.txt',
    n=3 * 10,
    s=slice(3 * 9, 3 * 10),
)

# Index 0 of every 3-line group of the test split, then its untok'ed form.
sbc4_test_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_test_zh',
    sbc4_test(),
    base_dir / 'sbc4_test.zh.txt',
    n=3,
    s=0,
)

sbc4_test_zh_untok = gentask.untok(
    'sbc4_test_zh_untok',
    sbc4_test_zh(),
    base_dir / 'sbc4_test.zh.untok.txt',
)

# Index 1 of every 3-line group of the training split, fed to gentask.lm.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1,
)

sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm',
    sbc4_train_tag(),
    base_dir / 'sbc4_train.tag.lm',
    base_dir / 'sbc4_train.tag.blm',
)