Beispiel #1
0
                zh, en = zh.strip(), en.strip()
                zh = tools.zhsent_preprocess(zh)
                with contextlib.redirect_stdout(outf):
                    print(zh)
                    print(en)
                    print()


# Two tasks built from the sbc4() task, both with n=3; they differ only in
# the offset s and in the destination file.
# NOTE(review): presumably the input is organised in groups of n lines and s
# selects one line per group -- confirm against gentask.
sbc4_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_zh',
    sbc4(),
    target_dir / 'ch.txt',
    n=3,
    s=0,
)
sbc4_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_tag',
    sbc4(),
    target_dir / 'tag.txt',
    n=3,
    s=1,
)

# Language-model task built from the sbc4_tag task, with target_dir / 'tag.lm'
# and target_dir / 'tag.blm' as its two path arguments.
# The upstream task is passed as an instance (sbc4_tag()), matching every
# other dependency handed to a gentask factory in this file; previously the
# uninstantiated factory result was passed here.
# NOTE(review): confirm gentask.lm expects a task instance, as its other call
# sites suggest.
sbc4_tag_lm = gentask.lm(
    'sbc4_tag_lm',
    sbc4_tag(),
    target_dir / 'tag.lm',
    target_dir / 'tag.blm',
)

# Both tasks below take the sbc4_zhpreprocess() task as their input.

# Format conversion via gentask.transformat_line2slash, written to
# 'zhpreprcs.slash.txt' under target_dir.
sbc4_zhpreprocess_slash = gentask.transformat_line2slash(
    'sbc4_zhpreprocess_slash',
    sbc4_zhpreprocess(),
    target_dir / 'zhpreprcs.slash.txt',
)

# Phrase table via gentask.phrasetable, written to
# 'toktag.phrasetable.h5' under target_dir. Note that the task name string
# ('sbc4_tok_tag_phrasetable') differs from the variable name.
sbc4_tok_tag_tm = gentask.phrasetable(
    'sbc4_tok_tag_phrasetable',
    sbc4_zhpreprocess(),
    target_dir / 'toktag.phrasetable.h5',
)

# Script entry point: when executed directly, hand control to luigi using the
# in-process local scheduler.
# NOTE(review): further task definitions and a second identical guard follow
# this one in the file; names assigned below are not yet bound when this call
# runs -- confirm whether this guard is intentional.
if __name__ == "__main__":
    luigi.run(local_scheduler=True)
# Training-side tasks; every output path lives under base_dir.

# Slice of the sbc4_train() output with n=3, s=1.
# NOTE(review): presumably picks one line out of every group of three --
# confirm against gentask.slice_lines_grouped_by_n.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1,
)

# Language model built from the sliced tag task above.
sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm',
    sbc4_train_tag(),
    base_dir / 'sbc4_train.tag.lm',
    base_dir / 'sbc4_train.tag.blm',
)

# Phrase table built directly from sbc4_train().
sbc4_train_zh_to_tok_tag_phrasetable = gentask.phrasetable(
    'sbc4_train_zh_to_tok_tag_phrasetable',
    sbc4_train(),
    base_dir / 'sbc4_train.zh2toktag.phrasetable.h5',
)


# Applies gentask.zhtoktag to sbc4_test_zh_untok(), passing the phrase table
# as tm and the language model as lm.
sbc4_train_toktag_sbc4_test = gentask.zhtoktag(
    'sbc4_train_toktag_sbc4_test',
    sbc4_test_zh_untok(),
    base_dir / 'sbc4_test.zh.untok.tok.txt',
    tm=sbc4_train_zh_to_tok_tag_phrasetable(),
    lm=sbc4_train_tag_lm(),
)


# Chain of tasks: each one below takes the previous task's instance as input.

# sbc4_test() converted via gentask.transformat_line2slash.
sbc4_test_slash = gentask.transformat_line2slash(
    'sbc4_test_slash',
    sbc4_test(),
    base_dir / 'sbc4_test.slash.txt',
)

# Word diff between the slash-formatted test data and the output of
# sbc4_train_toktag_sbc4_test.
wdiff_sbc4_test = gentask.word_diff(
    'wdiff_sbc4_test',
    sbc4_test_slash(),
    sbc4_train_toktag_sbc4_test(),
    base_dir / 'sbc4-test.wdiff',
)


# Derived from the word diff via gentask.word_diff_errors.
wdiff_errors_sbc4_test = gentask.word_diff_errors(
    'wdiff_errors_sbc4_test',
    wdiff_sbc4_test(),
    base_dir / 'sbc4-test.wdiff.errors',
)

# Derived from the errors task via gentask.word_diff_src_error_words.
wdiff_src_error_words_sbc4_test = gentask.word_diff_src_error_words(
    'wdiff_src_error_words_sbc4_test',
    wdiff_errors_sbc4_test(),
    base_dir / 'sbc4-test.wdiff.src_error_words',
)


# Script entry point: when executed directly, hand control to luigi using the
# in-process local scheduler (local_scheduler=True).
if __name__ == "__main__":
    luigi.run(local_scheduler=True)