zh, en = zh.strip(), en.strip() zh = tools.zhsent_preprocess(zh) with contextlib.redirect_stdout(outf): print(zh) print(en) print() sbc4_zh = gentask.slice_lines_grouped_by_n('sbc4_zh', sbc4(), target_dir / 'ch.txt', n=3, s=0) sbc4_tag = gentask.slice_lines_grouped_by_n('sbc4_tag', sbc4(), target_dir / 'tag.txt', n=3, s=1) sbc4_tag_lm = gentask.lm( 'sbc4_tag_lm', sbc4_tag, target_dir / 'tag.lm', target_dir / 'tag.blm') sbc4_zhpreprocess_slash = gentask.transformat_line2slash( 'sbc4_zhpreprocess_slash', sbc4_zhpreprocess(), target_dir / 'zhpreprcs.slash.txt') sbc4_tok_tag_tm = gentask.phrasetable( 'sbc4_tok_tag_phrasetable', sbc4_zhpreprocess(), target_dir / 'toktag.phrasetable.h5') if __name__ == "__main__": luigi.run(local_scheduler=True)
# --- Test-side inputs ---------------------------------------------------------
# Slice of sbc4_test() taken with n=3, s=0 (presumably the Chinese line of each
# 3-line group -- confirm against gentask.slice_lines_grouped_by_n).
sbc4_test_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_test_zh',
    sbc4_test(),
    base_dir / 'sbc4_test.zh.txt',
    n=3,
    s=0,
)

# gentask.untok applied to the test slice above; this is the text that is
# later fed to the tagger.
sbc4_test_zh_untok = gentask.untok(
    'sbc4_test_zh_untok',
    sbc4_test_zh(),
    base_dir / 'sbc4_test.zh.untok.txt',
)

# --- Models built from the training split -------------------------------------
# Slice of sbc4_train() taken with n=3, s=1 (presumably the tag line of each
# 3-line group -- confirm against gentask.slice_lines_grouped_by_n).
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1,
)

# Language model over the training tag slice; two output paths (.lm and .blm).
sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm',
    sbc4_train_tag(),
    base_dir / 'sbc4_train.tag.lm',
    base_dir / 'sbc4_train.tag.blm',
)

# Phrase table built directly from sbc4_train(), stored as HDF5.
sbc4_train_zh_to_tok_tag_phrasetable = gentask.phrasetable(
    'sbc4_train_zh_to_tok_tag_phrasetable',
    sbc4_train(),
    base_dir / 'sbc4_train.zh2toktag.phrasetable.h5',
)

# --- Tag the test set and compare with the reference --------------------------
# gentask.zhtoktag over the untokenized test text, with the training phrase
# table passed as tm and the training tag language model passed as lm.
sbc4_train_toktag_sbc4_test = gentask.zhtoktag(
    'sbc4_train_toktag_sbc4_test',
    sbc4_test_zh_untok(),
    base_dir / 'sbc4_test.zh.untok.tok.txt',
    tm=sbc4_train_zh_to_tok_tag_phrasetable(),
    lm=sbc4_train_tag_lm(),
)

# Slash-format rendering of sbc4_test().
sbc4_test_slash = gentask.transformat_line2slash(
    'sbc4_test_slash',
    sbc4_test(),
    base_dir / 'sbc4_test.slash.txt',
)

# Word-level diff between the slash-format test data and the tagger output
# (presumably reference first, hypothesis second -- confirm against
# gentask.word_diff).
wdiff_sbc4_test = gentask.word_diff(
    'wdiff_sbc4_test',
    sbc4_test_slash(),
    sbc4_train_toktag_sbc4_test(),
    base_dir / 'sbc4-test.wdiff',
)

# gentask.word_diff_errors applied to the diff produced above.
wdiff_errors_sbc4_test = gentask.word_diff_errors(
    'wdiff_errors_sbc4_test',
    wdiff_sbc4_test(),
    base_dir / 'sbc4-test.wdiff.errors',
)