Example #1
 import tempfile

 def tokenize_files(self, file_list, lang_tokenizer):
     """Tokenize each input file into its own temporary file."""
     outfile_group = []
     for input_file in file_list:
         # delete=False keeps the file on disk; the caller must clean it up
         outfile = tempfile.NamedTemporaryFile(delete=False)
         outfile.close()
         tokenizer.tokenize_file(lang_tokenizer, input_file, outfile.name)
         outfile_group.append(outfile.name)
     return outfile_group
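A minimal calling sketch, assuming the module-level `tokenizer` helper exposes the same `build_tokenizer` seen in Example #2; the option dict, file names, and `preprocessor` instance are hypothetical, not taken from the original project:

 # hypothetical tokenizer options; real keys depend on the project config
 lang_tokenizer = tokenizer.build_tokenizer({"mode": "conservative"})
 temp_paths = preprocessor.tokenize_files(["train.en", "dev.en"], lang_tokenizer)
 # temp_paths holds one tokenized temporary-file path per input file;
 # because delete=False, the caller should os.remove() them when done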
Example #2
 def _preprocess_file(self, config, input_path):
     """Tokenize input_path when the config enables tokenization."""
     if 'tokenization' in config:
         tok_config = config['tokenization']
         # build a source-language tokenizer from its options
         src_tokenizer = tokenizer.build_tokenizer(tok_config['source'])
         output = "%s.tok" % input_path
         tokenizer.tokenize_file(src_tokenizer, input_path, output)
         return output
     # no tokenization requested: pass the file through unchanged
     return input_path
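A small sketch of the pass-through behaviour, with a hypothetical config and file name; only the 'tokenization' and 'source' keys come from the example above, and the nested options are illustrative:

 config = {"tokenization": {"source": {"mode": "aggressive"}}}
 tokenized = preprocessor._preprocess_file(config, "corpus.en")  # -> "corpus.en.tok"
 untouched = preprocessor._preprocess_file({}, "corpus.en")      # -> "corpus.en"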