Beispiel #1
0
import luigi

import gentask
import gentask_giza
from pathlib import Path
import sys
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
from collections import Counter, defaultdict
from operator import itemgetter
from itertools import chain
from functools import reduce
import gentask_pattern
import gentask_spg

# Wrap the two pre-existing movie-subtitle files with gentask.localtarget_task.
# The calls run in the same order as the path literals are listed, so `ch` is
# created before `en`.
# NOTE(review): `ch` / `en` presumably hold the Chinese and English sides of
# the corpus -- confirm against the data files.
ch, en = (
    gentask.localtarget_task(corpus_path)
    for corpus_path in (
        'tgt_data/moviesub/ch.txt',
        'tgt_data/moviesub/en.txt',
    )
)

# Directory holding the movie-subtitle files above; kept as a Path so that
# output locations can be derived from it with path operators.
target_dir = Path('tgt_data/moviesub')

# ench = gentask.transformat_tab2lines('line_sep_ench', orig_ench(),
# target_dir / 'ench.txt')
# en = gentask.slice_lines_grouped_by_n('en', ench(), target_dir / 'en.txt',
#                                       n=3,
#                                       s=0)
# en_unidecode = gentask.unidecode('en_unidecode', en(),
#                                  target_dir / 'en.unidecode.txt')
# en_retok = gentask.word_tokenize('en_retok', en_unidecode(),
#                                  target_dir / 'en.retok.txt')
# en_truecase = gentask.truecase('moviesub_en_truecase', en_retok(), en_retok(),
#                                target_dir / 'en.truecase.txt')
Beispiel #2
0
 def requires(self):
     """Return the dependency built from ``self.inputf``.

     ``localtarget_task`` is called with ``self.inputf`` and the object it
     returns is then called with no arguments; that result is returned.
     NOTE(review): presumably this is Luigi's ``Task.requires`` hook and
     ``self.inputf`` is a file path -- confirm against the enclosing class.
     """
     upstream_factory = localtarget_task(self.inputf)
     return upstream_factory()
Beispiel #3
0
# -*- coding: utf-8 -*-

import luigi

import gentask
import gentask_giza
from pathlib import Path
import sys
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
from collections import Counter, defaultdict
from operator import itemgetter
from itertools import chain
from functools import reduce
import operator
import gentask_pattern
# Entry point of the pipeline: wraps the source corpus file.
orig_ench = gentask.localtarget_task('src_data/medal.ench.txt')

# Every file produced by the steps below is placed in this directory.
target_dir = Path('tgt_data/medal')

# Each step is given a name, the result of calling the previous step's
# object, and the path of the file it should produce.
ench = gentask.transformat_tab2lines(
    'line_sep_ench',
    orig_ench(),
    target_dir.joinpath('ench.txt'),
)

# NOTE(review): n=3, s=0 presumably selects the first line of every group of
# three lines (the English one) -- confirm against gentask.
en = gentask.slice_lines_grouped_by_n(
    'en',
    ench(),
    target_dir.joinpath('en.txt'),
    n=3,
    s=0,
)

en_unidecode = gentask.unidecode(
    'en_unidecode',
    en(),
    target_dir.joinpath('en.unidecode.txt'),
)

en_retok = gentask.word_tokenize(
    'en_retok',
    en_unidecode(),
    target_dir.joinpath('en.retok.txt'),
)

# The re-tokenized text is passed twice on purpose to mirror the original
# call. NOTE(review): presumably once as training data and once as the text
# to truecase -- confirm against gentask.truecase.
en_truecase = gentask.truecase(
    'medal_en_truecase',
    en_retok(),
    en_retok(),
    target_dir.joinpath('en.truecase.txt'),
)

# en_genia = gentask.geniatagger('medal_en_genia', en_truecase(),