Exemple #1
0
#             for en, ch, entag, _ in tools.group_n_lines(in_file, n=4):
#                 out_file.write(ch)


class dreye_phrases(luigi.ExternalTask):
    """External task whose output is the existing dreye phrases file.

    As a ``luigi.ExternalTask`` it defines no ``run``; it only declares
    where the file is expected to be.
    """

    def output(self):
        """Return the local target for ``data/dreye/dreye_phrases.txt``."""
        phrases_path = 'data/dreye/dreye_phrases.txt'
        return luigi.LocalTarget(phrases_path)


class dreye_sents(luigi.ExternalTask):
    """External task whose output is the existing dreye sentences file.

    As a ``luigi.ExternalTask`` it defines no ``run``; it only declares
    where the file is expected to be.
    """

    def output(self):
        """Return the local target for ``data/dreye/dreye_sents.txt``."""
        sents_target = luigi.LocalTarget('data/dreye/dreye_sents.txt')
        return sents_target

# Three slicing tasks built from dreye_npvp(), one per input_target_key
# ('pure_np', 'np', 'vp'). Every call passes n=4 and s=1.
# NOTE(review): presumably this keeps line index 1 of each 4-line group
# (the zh line, going by the output file names) -- confirm against
# gentask.slice_lines_grouped_by_n.
dreye_purenp_zh = gentask.slice_lines_grouped_by_n(
    'dreye_purenp_zh', dreye_npvp(), 'data/dreye/dreye.pure_np.zh.txt', n=4, s=1, input_target_key='pure_np')

dreye_np_zh = gentask.slice_lines_grouped_by_n(
    'dreye_np_zh', dreye_npvp(), 'data/dreye/dreye.np.zh.txt', n=4, s=1, input_target_key='np')


dreye_vp_zh = gentask.slice_lines_grouped_by_n(
    'dreye_vp_zh', dreye_npvp(), 'data/dreye/dreye.vp.zh.txt', n=4, s=1, input_target_key='vp')


# dreye_purenp_zh_tag = gentask.zhtoktag(
#     'dreye_purenp_zh_tag', dreye_purenp_zh(), 'data/dreye/dreye.pure_np.zh.tag.txt')

# dreye_np_zh_tag = gentask.zhtoktag(
#     'dreye_np_zh_tag', dreye_np_zh(), 'data/dreye/dreye.np.zh.tag.txt')
Exemple #2
0
from pathlib import Path
import sys
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
from collections import Counter, defaultdict
from operator import itemgetter
from itertools import chain
from functools import reduce
import operator
import gentask_pattern
# Source corpus file, wrapped as a task by gentask.localtarget_task.
orig_ench = gentask.localtarget_task('src_data/medal.ench.txt')

# Every derived file in the chain below is written under this directory.
target_dir = Path('tgt_data/medal')
# Each step takes an instance of the previous step as its upstream task:
# orig_ench -> ench -> en -> en_unidecode -> en_retok -> en_truecase.
ench = gentask.transformat_tab2lines('line_sep_ench', orig_ench(),
                                     target_dir / 'ench.txt')
# n=3, s=0: presumably keeps line index 0 of each 3-line group -- confirm
# against gentask.slice_lines_grouped_by_n.
en = gentask.slice_lines_grouped_by_n('en', ench(), target_dir / 'en.txt',
                                      n=3,
                                      s=0)
en_unidecode = gentask.unidecode('en_unidecode', en(),
                                 target_dir / 'en.unidecode.txt')
en_retok = gentask.word_tokenize('en_retok', en_unidecode(),
                                 target_dir / 'en.retok.txt')
# NOTE(review): en_retok() is passed twice here; confirm that both positional
# task arguments of gentask.truecase are meant to be the same task.
en_truecase = gentask.truecase('medal_en_truecase', en_retok(), en_retok(),
                               target_dir / 'en.truecase.txt')

# en_genia = gentask.geniatagger('medal_en_genia', en_truecase(),
#                                target_dir / 'en.genia.txt')

# en_genia_line_iih = gentask.genia_line_IIH(
#     'en_genia_line_iih', en_genia(), target_dir / 'en.genia.hiih.txt'
# )  # horizontal and IIH
    def output(self):
        """Return the local target for ``chtag.chpreprcs.txt`` under ``target_dir``."""
        out_path = target_dir / 'chtag.chpreprcs.txt'
        return luigi.LocalTarget(str(out_path))

    def run(self):
        """Rewrite the input in 3-line groups with the first line preprocessed.

        For every group of three input lines, the first line is stripped and
        passed through ``tools.zhsent_preprocess``, the second line is
        stripped and copied through, and the third line is discarded. The
        output gets the two resulting lines followed by an empty line.
        """
        with self.input().open('r') as src, self.output().open('w') as dst:
            for zh_line, second_line, _ in tools.group_n_lines(src, n=3):
                # NOTE(review): the second line is presumably the tag line,
                # judging by the output file name -- confirm.
                second_line = second_line.strip()
                zh_line = tools.zhsent_preprocess(zh_line.strip())
                # One record per group: two lines plus a blank separator.
                print(zh_line, second_line, sep='\n', end='\n\n', file=dst)


# Two slices of the same upstream task sbc4(), both grouped with n=3:
# s=0 is written to ch.txt and s=1 to tag.txt.
# NOTE(review): presumably s selects the line index within each 3-line
# group -- confirm against gentask.slice_lines_grouped_by_n.
sbc4_zh = gentask.slice_lines_grouped_by_n('sbc4_zh', sbc4(),
                                           target_dir / 'ch.txt',
                                           n=3,
                                           s=0)
sbc4_tag = gentask.slice_lines_grouped_by_n('sbc4_tag', sbc4(),
                                            target_dir / 'tag.txt',
                                            n=3,
                                            s=1)

# Task built by gentask.lm on top of the tag slice, writing tag.lm and
# tag.blm under target_dir (presumably a language model and its binary
# form -- confirm against gentask.lm).
# The upstream task is passed as an instance, sbc4_tag(), matching every
# other gentask call here; passing the bare class would hand luigi a class
# object where it expects a task instance.
sbc4_tag_lm = gentask.lm(
    'sbc4_tag_lm', sbc4_tag(), target_dir / 'tag.lm', target_dir / 'tag.blm')

# Task built by gentask.transformat_line2slash from sbc4_zhpreprocess(),
# writing zhpreprcs.slash.txt under target_dir.
# NOTE(review): the exact line-to-slash format is defined in gentask and is
# not visible here.
sbc4_zhpreprocess_slash = gentask.transformat_line2slash(
    'sbc4_zhpreprocess_slash', sbc4_zhpreprocess(),
    target_dir / 'zhpreprcs.slash.txt')

sbc4_tok_tag_tm = gentask.phrasetable(
    'sbc4_tok_tag_phrasetable', sbc4_zhpreprocess(),
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gentask
import luigi
from sbc4_tm_lm_tasks import sbc4

from pathlib import Path
# Every file produced by the tasks below is written under this directory.
base_dir = Path('data/zhtoktag_eval/')


# Train/test split of sbc4(): lines are grouped 30 at a time (n=3 * 10);
# the train task takes slice(0, 27) of each group and the test task takes
# slice(27, 30).
# NOTE(review): presumably records are 3 lines each, making this a 9:1
# record split -- confirm against gentask.slice_lines_grouped_by_n.
sbc4_train = gentask.slice_lines_grouped_by_n(
    'sbc4_train', sbc4(), base_dir / 'sbc4_train.txt', n=3 * 10, s=slice(0, 3 * 9))

sbc4_test = gentask.slice_lines_grouped_by_n(
    'sbc4_test', sbc4(), base_dir / 'sbc4_test.txt', n=3 * 10, s=slice(3 * 9, 3 * 10))

# From the test split, regrouped with n=3: s=0 goes to sbc4_test.zh.txt.
sbc4_test_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_test_zh', sbc4_test(), base_dir / 'sbc4_test.zh.txt', n=3, s=0)


# gentask.untok applied to the test zh lines (presumably removes the
# tokenization -- confirm against gentask.untok).
sbc4_test_zh_untok = gentask.untok(
    'sbc4_test_zh_untok', sbc4_test_zh(), base_dir / 'sbc4_test.zh.untok.txt')


# From the train split, regrouped with n=3: s=1 goes to sbc4_train.tag.txt.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag', sbc4_train(), base_dir / 'sbc4_train.tag.txt', n=3, s=1)

# gentask.lm over the train tag lines, writing the .lm and .blm files.
sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm', sbc4_train_tag(), base_dir / 'sbc4_train.tag.lm', base_dir / 'sbc4_train.tag.blm')