Beispiel #1
0
                zh, en = zh.strip(), en.strip()
                zh = tools.zhsent_preprocess(zh)
                with contextlib.redirect_stdout(outf):
                    print(zh)
                    print(en)
                    print()


# Two slicing tasks over the sbc4() output. Both use n=3 and differ only in
# the s argument (0 -> ch.txt, 1 -> tag.txt).
# NOTE(review): presumably n is the group size and s the position inside each
# group -- confirm against gentask.slice_lines_grouped_by_n.
sbc4_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_zh',
    sbc4(),
    target_dir / 'ch.txt',
    n=3,
    s=0,
)
sbc4_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_tag',
    sbc4(),
    target_dir / 'tag.txt',
    n=3,
    s=1,
)

# Language-model task built from the tag file produced by sbc4_tag.
# The upstream task is passed as an instance (sbc4_tag()), the same way every
# other gentask factory call in this file receives its input task; handing
# over the un-called sbc4_tag object would give the factory a different kind
# of value than it gets everywhere else.
# NOTE(review): presumably 'tag.lm' and 'tag.blm' are the text and binary
# model outputs -- confirm against gentask.lm.
sbc4_tag_lm = gentask.lm(
    'sbc4_tag_lm',
    sbc4_tag(),
    target_dir / 'tag.lm',
    target_dir / 'tag.blm',
)

# Runs gentask.transformat_line2slash on the sbc4_zhpreprocess() output and
# writes the result to zhpreprcs.slash.txt under target_dir.
sbc4_zhpreprocess_slash = gentask.transformat_line2slash(
    'sbc4_zhpreprocess_slash',
    sbc4_zhpreprocess(),
    target_dir / 'zhpreprcs.slash.txt',
)

# Phrase-table task fed by sbc4_zhpreprocess(); output goes to
# toktag.phrasetable.h5 under target_dir.
# Note that the task name string ('sbc4_tok_tag_phrasetable') differs from
# the variable it is bound to (sbc4_tok_tag_tm).
sbc4_tok_tag_tm = gentask.phrasetable(
    'sbc4_tok_tag_phrasetable',
    sbc4_zhpreprocess(),
    target_dir / 'toktag.phrasetable.h5',
)

# Script entry point: when this module is executed directly, hand control to
# luigi.run with local_scheduler=True.
# NOTE(review): the statements that follow this guard in the file are only
# executed after luigi.run() has returned when the module runs as a script,
# so tasks bound there do not exist yet while luigi is running -- confirm
# whether the guard should sit at the very end of the module.
if __name__ == "__main__":
    luigi.run(local_scheduler=True)
# Test split taken from the sbc4() output with n=3 * 10 and
# s=slice(3 * 9, 3 * 10).
# NOTE(review): presumably this keeps the last 3 lines out of every 30, i.e.
# one 3-line group in ten -- confirm against
# gentask.slice_lines_grouped_by_n.
sbc4_test = gentask.slice_lines_grouped_by_n(
    'sbc4_test',
    sbc4(),
    base_dir / 'sbc4_test.txt',
    n=3 * 10,
    s=slice(3 * 9, 3 * 10),
)

# Slice of the test split with n=3, s=0, written to sbc4_test.zh.txt.
sbc4_test_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_test_zh',
    sbc4_test(),
    base_dir / 'sbc4_test.zh.txt',
    n=3,
    s=0,
)


# gentask.untok applied to the sbc4_test_zh() output.
sbc4_test_zh_untok = gentask.untok(
    'sbc4_test_zh_untok',
    sbc4_test_zh(),
    base_dir / 'sbc4_test.zh.untok.txt',
)


# Tasks derived from sbc4_train (defined outside this part of the file).

# Slice of the training split with n=3, s=1, written to sbc4_train.tag.txt.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1,
)

# Language-model task over the training tag file; two output paths are given
# (.lm and .blm).
sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm',
    sbc4_train_tag(),
    base_dir / 'sbc4_train.tag.lm',
    base_dir / 'sbc4_train.tag.blm',
)

# Phrase-table task fed directly by the training split.
sbc4_train_zh_to_tok_tag_phrasetable = gentask.phrasetable(
    'sbc4_train_zh_to_tok_tag_phrasetable',
    sbc4_train(),
    base_dir / 'sbc4_train.zh2toktag.phrasetable.h5',
)


# gentask.zhtoktag run on the untokenized test text, given the phrase table
# (tm) and the tag language model (lm) that were built from the training
# split. Output: sbc4_test.zh.untok.tok.txt under base_dir.
sbc4_train_toktag_sbc4_test = gentask.zhtoktag(
    'sbc4_train_toktag_sbc4_test',
    sbc4_test_zh_untok(),
    base_dir / 'sbc4_test.zh.untok.tok.txt',
    tm=sbc4_train_zh_to_tok_tag_phrasetable(),
    lm=sbc4_train_tag_lm(),
)


# Test split passed through gentask.transformat_line2slash.
sbc4_test_slash = gentask.transformat_line2slash(
    'sbc4_test_slash',
    sbc4_test(),
    base_dir / 'sbc4_test.slash.txt',
)

# gentask.word_diff between the slash-formatted test split and the zhtoktag
# output for the same test text; the result is written to sbc4-test.wdiff.
wdiff_sbc4_test = gentask.word_diff(
    'wdiff_sbc4_test',
    sbc4_test_slash(),
    sbc4_train_toktag_sbc4_test(),
    base_dir / 'sbc4-test.wdiff',
)