import tempfile

# `tokenizer` is the project's tokenization helper module (assumed importable
# from this package); it provides build_tokenizer() and tokenize_file().


def tokenize_files(self, file_list, lang_tokenizer):
    """Tokenize each file in file_list into a temporary file.

    Returns the temporary file paths in the same order as file_list.
    The caller is responsible for deleting these files.
    """
    outfile_group = []
    for input_file in file_list:
        # delete=False so the file persists after the handle is closed.
        outfile = tempfile.NamedTemporaryFile(delete=False)
        outfile.close()  # release the handle; tokenize_file writes by path
        tokenizer.tokenize_file(lang_tokenizer, input_file, outfile.name)
        outfile_group.append(outfile.name)
    return outfile_group
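# Example usage, as a sketch only: the option dict passed to build_tokenizer
# is hypothetical and depends on the project's tokenizer configuration.
#
#   lang_tok = tokenizer.build_tokenizer({'mode': 'conservative'})
#   tok_paths = self.tokenize_files(['train.en', 'valid.en'], lang_tok)
#   # tok_paths -> e.g. ['/tmp/tmpab12cd', '/tmp/tmpef34gh'], tokenized copies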
def _preprocess_file(self, config, input):
    """Tokenize the input file if the configuration enables tokenization.

    Returns the path of the tokenized file, or the original path when no
    tokenization is configured.
    """
    if 'tokenization' in config:
        tok_config = config['tokenization']
        src_tokenizer = tokenizer.build_tokenizer(tok_config['source'])
        output = "%s.tok" % input
        tokenizer.tokenize_file(src_tokenizer, input, output)
        return output
    return input
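# A minimal sketch of the configuration shape _preprocess_file expects.
# Only the 'tokenization' / 'source' nesting is implied by the code above;
# the keys inside 'source' are assumptions, not a confirmed schema.
#
#   config = {
#       'tokenization': {
#           'source': {'mode': 'aggressive'},  # hypothetical tokenizer options
#           'target': {'mode': 'aggressive'}
#       }
#   }
#   preprocessed = self._preprocess_file(config, 'data/train.en')
#   # -> 'data/train.en.tok' when tokenization is configured,
#   #    otherwise the input path is returned unchanged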