Esempio n. 1
0
# -*- coding: utf-8 -*-

import luigi
import tools
import gentask
from sbc4_tm_lm_tasks import sbc4_zh_to_tok_tag_phrasetable, sbc4_tag_lm


class test_zh_data(luigi.ExternalTask):
    """External input located at ``data/testzh.txt``.

    Being an ``ExternalTask`` subclass, this class only names the file; it
    defines no ``run`` of its own.
    """

    def output(self):
        """Return the local file target for ``data/testzh.txt``."""
        target = luigi.LocalTarget('data/testzh.txt')
        return target


# Call gentask.zhtoktag for the test data. The return value is discarded
# here. NOTE(review): 'tt' sits in the position where the other zhtoktag call
# in this file passes an output path -- confirm it is intentional.
gentask.zhtoktag(
    'test_zh_tok',
    test_zh_data(),
    'tt',
    tm=sbc4_zh_to_tok_tag_phrasetable(),
    lm=sbc4_tag_lm())


class oxford_np_ench(luigi.ExternalTask):
    """External input located at ``data/oxford.np.ench.txt``.

    Being an ``ExternalTask`` subclass, this class only names the file; it
    defines no ``run`` of its own.
    """

    def output(self):
        """Return the local file target for ``data/oxford.np.ench.txt``."""
        target = luigi.LocalTarget('data/oxford.np.ench.txt')
        return target


class oxford_np_ch(luigi.Task):
    """Task that requires ``oxford_np_ench`` and targets
    ``data/oxford.np.ch.txt``.

    NOTE(review): no ``run`` method is visible for this class here --
    confirm whether the definition continues elsewhere.
    """

    def requires(self):
        """Return the single upstream dependency, an ``oxford_np_ench``."""
        upstream = oxford_np_ench()
        return upstream

    def output(self):
        """Return the local file target for ``data/oxford.np.ch.txt``."""
        target = luigi.LocalTarget('data/oxford.np.ch.txt')
        return target
Esempio n. 2
0
    def output(self):
        """Return the two pruned output targets, keyed by ``'en'`` and ``'ch'``.

        Returns:
            dict: ``'en'`` maps to ``data/fbis/fbis.en.pruned`` and ``'ch'``
            maps to ``data/fbis/fbis.ch.pruned``, both as local targets.
        """
        pruned = {}
        pruned['en'] = luigi.LocalTarget('data/fbis/fbis.en.pruned')
        pruned['ch'] = luigi.LocalTarget('data/fbis/fbis.ch.pruned')
        return pruned

    def run(self):
        """Copy paired 'en'/'ch' lines, dropping pairs whose 'ch' line is long.

        The 'en' and 'ch' inputs are read in lockstep. A pair is written to
        the matching outputs only when ``len()`` of the 'ch' line, exactly as
        yielded by the input, is at most 120; otherwise both lines of the
        pair are skipped so the outputs stay aligned. Iteration ends with
        the shorter input, because ``zip`` stops at the first exhausted
        iterator.
        """
        with self.input()['en'].open('r') as src_en, \
                self.input()['ch'].open('r') as src_ch, \
                self.output()['en'].open('w') as dst_en, \
                self.output()['ch'].open('w') as dst_ch:
            for en_text, ch_text in zip(src_en, src_ch):
                # Keep the pair only when the 'ch' side is within the limit.
                if len(ch_text) <= 120:
                    dst_en.write(en_text)
                    dst_ch.write(ch_text)


# Build the "untok" step on top of a fbis_en_ch_prune_long instance, with
# 'data/fbis/fbis.ch.untok' as its third argument. input_target_key='ch'
# presumably selects the 'ch' entry of the upstream output -- confirm against
# gentask.untok. The result is callable: it is instantiated further down.
fbis_ch_untok = gentask.untok(
    'fbis_ch_untok',
    fbis_en_ch_prune_long(),
    'data/fbis/fbis.ch.untok',
    input_target_key='ch')

# Pass an instance of the untok step to gentask.zhtoktag, along with
# instances of the sbc4 phrase-table and tag-LM tasks as tm/lm. The third
# argument looks like the output path -- confirm against gentask.zhtoktag.
fbis_ch_untok_toktag = gentask.zhtoktag(
    'fbis_ch_untok_toktag',
    fbis_ch_untok(),
    'data/fbis/fbis.ch.untok.tok.txt',
    tm=sbc4_zh_to_tok_tag_phrasetable(),
    lm=sbc4_tag_lm())

if __name__ == '__main__':
    # Script entry point: hand control to luigi, using its local scheduler.
    luigi.run(local_scheduler=True)