コード例 #1
0
ファイル: fbis_tasks.py プロジェクト: d2207197/smttoktag
    def output(self):
        """Return the pruned-corpus targets keyed by language code.

        Returns:
            dict: ``'en'`` and ``'ch'`` mapped to the ``luigi.LocalTarget``
            of the pruned English and Chinese files respectively.
        """
        en_target = luigi.LocalTarget('data/fbis/fbis.en.pruned')
        ch_target = luigi.LocalTarget('data/fbis/fbis.ch.pruned')
        return {'en': en_target, 'ch': ch_target}

    def run(self):
        """Copy the parallel corpus, dropping pairs with a long Chinese line.

        The 'en' and 'ch' inputs are read in lockstep. A pair is written to
        the 'en' and 'ch' outputs only when the Chinese line, measured
        exactly as read from the file (line terminator included), is at
        most 120 characters long. Iteration ends as soon as either input
        is exhausted, so surplus lines in the longer file are ignored.
        """
        sources = self.input()
        targets = self.output()
        with sources['en'].open('r') as en_src, \
                sources['ch'].open('r') as ch_src, \
                targets['en'].open('w') as en_dst, \
                targets['ch'].open('w') as ch_dst:
            for en_text, ch_text in zip(en_src, ch_src):
                # Keep the pair only when the Chinese side is short enough;
                # both sides are dropped together so the files stay aligned.
                if len(ch_text) <= 120:
                    en_dst.write(en_text)
                    ch_dst.write(ch_text)


# Task generated by gentask.untok. Its upstream is the pruning task and
# input_target_key='ch' picks that task's 'ch' target; the result goes to
# data/fbis/fbis.ch.untok.
# NOTE(review): presumably this strips the tokenization from the Chinese
# side -- confirm against gentask.untok.
fbis_ch_untok = gentask.untok(
    'fbis_ch_untok',
    fbis_en_ch_prune_long(),
    'data/fbis/fbis.ch.untok',
    input_target_key='ch',
)

# Task generated by gentask.zhtoktag on top of the untokenized text, using
# the sbc4 phrase table as `tm` and the sbc4 tag language model as `lm`.
fbis_ch_untok_toktag = gentask.zhtoktag(
    'fbis_ch_untok_toktag',
    fbis_ch_untok(),
    'data/fbis/fbis.ch.untok.tok.txt',
    tm=sbc4_zh_to_tok_tag_phrasetable(),
    lm=sbc4_tag_lm(),
)

if __name__ == '__main__':
    # Hand control to luigi's command-line runner with the local scheduler.
    luigi.run(local_scheduler=True)
コード例 #2
0
ファイル: movie_sub_task.py プロジェクト: d2207197/smttoktag
# en_patterns_allline = gentask.pattern_allline(
#     'en_patterns_allline', en_genia_line_iih(), target_dir / 'en.patterns.d')

# en_patterns_pretty = gentask.patterns_pretty(
# 'en_patterns_pretty', en_patterns(), target_dir / 'en.patterns.json')

# patterns_allline_task = gentask_pattern.pipeline_allline_task(
# 'moviesub_en_patterns_allline', en_truecase())

# Pattern task built from the English sentences task `en`.
filtered_patterns = gentask_pattern.filtered_patterns_from_sentences(
    'moviesub_en_filtered_patterns',
    en(),
)

# Disabled definition of `ch`; the `ch` used below must come from elsewhere
# in the module.
# ch = gentask.slice_lines_grouped_by_n('ch', ench(), target_dir / 'ch.txt',
# n=3,
# s=1)

# Chinese side: untok -> zhtoktag -> remove_slashtag, each step writing its
# own file under target_dir.
ch_untok = gentask.untok(
    'ch_untok',
    ch(),
    target_dir / 'ch.untok.txt',
)
ch_toktag = gentask.zhtoktag(
    'ch_toktag',
    ch_untok(),
    target_dir / 'ch.toktag.txt',
    tm=sbc4_tok_tag_tm(),
    lm=sbc4_tag_lm(),
)

# NOTE(review): presumably drops the "/TAG" suffixes and keeps only the
# tokens -- confirm against gentask.remove_slashtag.
ch_tok = gentask.remove_slashtag(
    'ch_tok',
    ch_toktag(),
    target_dir / 'ch.tok.txt',
)

# Combine the English task with the tokenized Chinese task into one file.
en_chtok = gentask.parallel_lines_merge(
    'en_chtok',
    en(),
    ch_tok(),
    target_dir / 'en_chtok.txt',
)

# giza_task = gentask_giza.giza(inputf=str(target_dir / 'en_chtok.txt'),
#                               outputd=str(target_dir / 'giza/'))

unpack_singleline_patterns = gentask.localtarget_task(
コード例 #3
0
from pathlib import Path
# Directory that receives every artifact of this evaluation pipeline.
base_dir = Path('data/zhtoktag_eval/')

# Train/test split of sbc4. Both tasks use groups of n=3*10 lines; the
# training task takes slice(0, 27) and the test task slice(27, 30).
# NOTE(review): presumably one record spans 3 lines, giving a 9:1 split per
# 10 records -- confirm against gentask.slice_lines_grouped_by_n.
sbc4_train = gentask.slice_lines_grouped_by_n(
    'sbc4_train',
    sbc4(),
    base_dir / 'sbc4_train.txt',
    n=3 * 10,
    s=slice(0, 3 * 9),
)

sbc4_test = gentask.slice_lines_grouped_by_n(
    'sbc4_test',
    sbc4(),
    base_dir / 'sbc4_test.txt',
    n=3 * 10,
    s=slice(3 * 9, 3 * 10),
)

# Test side: take index 0 of every 3-line group (the "zh" line, going by the
# output name), then pass it through untok.
sbc4_test_zh = gentask.slice_lines_grouped_by_n(
    'sbc4_test_zh',
    sbc4_test(),
    base_dir / 'sbc4_test.zh.txt',
    n=3,
    s=0,
)

sbc4_test_zh_untok = gentask.untok(
    'sbc4_test_zh_untok',
    sbc4_test_zh(),
    base_dir / 'sbc4_test.zh.untok.txt',
)

# Training side: index 1 of every 3-line group (the "tag" line, going by the
# output name) feeds the language model task.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1,
)

sbc4_train_tag_lm = gentask.lm(
    'sbc4_train_tag_lm',
    sbc4_train_tag(),
    base_dir / 'sbc4_train.tag.lm',
    base_dir / 'sbc4_train.tag.blm',
)

# Phrase table task built from the full training split.
sbc4_train_zh_to_tok_tag_phrasetable = gentask.phrasetable(
    'sbc4_train_zh_to_tok_tag_phrasetable',
    sbc4_train(),
    base_dir / 'sbc4_train.zh2toktag.phrasetable.h5',
)

# Run zhtoktag on the untokenized test text with the phrase table and tag
# language model derived from the training split.
sbc4_train_toktag_sbc4_test = gentask.zhtoktag(
    'sbc4_train_toktag_sbc4_test',
    sbc4_test_zh_untok(),
    base_dir / 'sbc4_test.zh.untok.tok.txt',
    tm=sbc4_train_zh_to_tok_tag_phrasetable(),
    lm=sbc4_train_tag_lm(),
)