def generate_tokenizer(self):
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS('en')]
    if self.tokenization == 'bpe':
        tokz = BPETokenizer(self.code_file,
                            vocab_file=self.vocab_file,
                            num_symbols=self.num_symbols,
                            additional_tokens=additional_tokens,
                            pre_tokenize=self.pre_tokenize)
        if not hasattr(tokz, 'bpe'):
            sentences = (self.captions[i] for i in self.indexes)
            tokz.learn_bpe(sentences, from_filenames=False)
    else:
        tokz = self.__tokenizers[self.tokenization](
            vocab_file=self.vocab_file,
            additional_tokens=additional_tokens,
            pre_tokenize=self.pre_tokenize)
    if not hasattr(tokz, 'vocab'):
        assert self.split == 'train', "better generate vocab for training split"
        sentences = (self.captions[i] for i in self.indexes)
        logging.info('generating vocabulary. saving to %s' % self.vocab_file)
        tokz.get_vocab(sentences, from_filenames=False)
        tokz.save_vocab(self.vocab_file)
    tokz.load_vocab(self.vocab_file, limit=self.vocab_limit,
                    min_count=self.vocab_min_count)
    self.tokenizer = tokz

def generate_tokenizers(self):
    self.tokenizers = OrderedDict()
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
    for l in self.languages:
        if self.shared_vocab:
            files = [self.input_files[t] for t in self.languages]
        else:
            files = [self.input_files[l]]
        if self.tokenization == 'bpe':
            tokz = BPETokenizer(self.code_files[l],
                                vocab_file=self.vocab_files[l],
                                num_symbols=self.num_symbols,
                                additional_tokens=additional_tokens)
            if not hasattr(tokz, 'bpe'):
                tokz.learn_bpe(files)
        else:
            tokz = self.__tokenizers[self.tokenization](
                vocab_file=self.vocab_files[l],
                vocab_limit=self.vocab_limit,
                additional_tokens=additional_tokens)
        if not hasattr(tokz, 'vocab'):
            logging.info('generating vocabulary. saving to %s' %
                         self.vocab_files[l])
            tokz.get_vocab(files)
            tokz.save_vocab(self.vocab_files[l])
        tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
        self.tokenizers[l] = tokz

def generate_tokenizers(self):
    self.tokenizers = OrderedDict()
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
    for l in self.languages:
        if self.shared_vocab:
            files = [self.input_files[t] for t in self.languages]
        else:
            files = [self.input_files[l]]
        if self.tokenization == 'bpe':
            tokz = BPETokenizer(self.code_files[l],
                                vocab_file=self.vocab_files[l],
                                num_symbols=self.num_symbols,
                                additional_tokens=additional_tokens)
            if not hasattr(tokz, 'bpe'):
                tokz.learn_bpe(files)
        else:
            tokz = self.__tokenizers[self.tokenization](
                vocab_file=self.vocab_files[l],
                additional_tokens=additional_tokens)
        if not hasattr(tokz, 'vocab'):
            logging.info('generating vocabulary. saving to %s' %
                         self.vocab_files[l])
            tokz.get_vocab(files)
            tokz.save_vocab(self.vocab_files[l])
        tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
        self.tokenizers[l] = tokz

def generate_tokenizer(self):
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS('en')]
    if self.tokenization == 'bpe':
        tokz = BPETokenizer(self.code_file,
                            vocab_file=self.vocab_file,
                            num_symbols=self.num_symbols,
                            additional_tokens=additional_tokens)
        if not hasattr(tokz, 'bpe'):
            sentences = (d['caption'] for d in self.data.coco.anns.values())
            tokz.learn_bpe(sentences, from_filenames=False)
    else:
        tokz = self.__tokenizers[self.tokenization](
            vocab_file=self.vocab_file,
            additional_tokens=additional_tokens)
    if not hasattr(tokz, 'vocab'):
        sentences = (d['caption'] for d in self.data.coco.anns.values())
        logging.info('generating vocabulary. saving to %s' % self.vocab_file)
        tokz.get_vocab(sentences, from_filenames=False)
        tokz.save_vocab(self.vocab_file)
    self.tokenizer = tokz

def generate_tokenizer(self):
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS('en')]
    if self.tokenization == 'bpe':
        tokz = BPETokenizer(self.code_file,
                            vocab_file=self.vocab_file,
                            num_symbols=self.num_symbols,
                            additional_tokens=additional_tokens,
                            pre_tokenize=self.pre_tokenize)
        if not hasattr(tokz, 'bpe'):
            sentences = (d['caption'] for d in self.data.coco.anns.values())
            tokz.learn_bpe(sentences, from_filenames=False)
    else:
        tokz = self.__tokenizers[self.tokenization](
            vocab_file=self.vocab_file,
            additional_tokens=additional_tokens,
            pre_tokenize=self.pre_tokenize)
    if not hasattr(tokz, 'vocab'):
        sentences = (d['caption'] for d in self.data.coco.anns.values())
        logging.info('generating vocabulary. saving to %s' % self.vocab_file)
        tokz.get_vocab(sentences, from_filenames=False)
        tokz.save_vocab(self.vocab_file)
    self.tokenizer = tokz

def generate_tokenizers(self):
    self.tokenizers = OrderedDict()
    additional_tokens = None
    if self.mark_language:
        additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
    for l in self.languages:
        if self.shared_vocab:
            files = [self.input_files[t] for t in self.languages]
        else:
            files = [self.input_files[l]]
        if self.tokenization == 'sentencepiece':
            tokz = SentencePiece(self.model_prefixes[l],
                                 num_symbols=self.num_symbols,
                                 additional_tokens=additional_tokens)
            if getattr(tokz, 'model', None) is None:
                tokz.learn_model(files)
        elif self.tokenization == 'word+char':
            word_vocab_file = self.vocab_files[l] + '.word'
            char_vocab_file = self.vocab_files[l] + '.char'
            tokz = WordCharTokenizer(
                word_vocab_file=word_vocab_file,
                char_vocab_file=char_vocab_file,
                word_additional_tokens=additional_tokens,
                char_additional_tokens=additional_tokens)
            if not hasattr(tokz.word_tokenizer, 'vocab'):
                logging.info('generating vocabulary. saving to %s' %
                             self.vocab_files[l])
                tokz.get_vocab(files)
                tokz.save_vocab(word_vocab_file, char_vocab_file)
            tokz.load_vocab(word_vocab_file, char_vocab_file,
                            limit=self.vocab_limit)
        else:
            if self.tokenization == 'bpe':
                tokz = BPETokenizer(
                    self.code_files[l],
                    vocab_file=self.vocab_files[l],
                    num_symbols=self.num_symbols,
                    additional_tokens=additional_tokens,
                    use_moses=l if self.use_moses else None)
                if not hasattr(tokz, 'bpe'):
                    tokz.learn_bpe(files)
            else:
                tokz = self.__tokenizers[self.tokenization](
                    vocab_file=self.vocab_files[l],
                    additional_tokens=additional_tokens,
                    use_moses=l if self.use_moses else None)
            if not hasattr(tokz, 'vocab'):
                logging.info('generating vocabulary. saving to %s' %
                             self.vocab_files[l])
                tokz.get_vocab(files)
                tokz.save_vocab(self.vocab_files[l])
            tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
        self.tokenizers[l] = tokz

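# A minimal sketch of the kind of registry the methods above index with
# self.__tokenizers[self.tokenization]. The name _TOKENIZERS and the exact
# keys are illustrative assumptions, not the repository's definition; 'bpe'
# (and 'sentencepiece' / 'word+char') are special-cased above, so the registry
# only needs the constructors that share the vocab_file / additional_tokens
# interface.
from seq2seq.tools.tokenizer import Tokenizer, CharTokenizer

_TOKENIZERS = {
    'word': Tokenizer,      # word-level vocabulary
    'char': CharTokenizer,  # character-level vocabulary
}
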
from seq2seq.tools.tokenizer import Tokenizer, BPETokenizer, CharTokenizer

test_file = '../../README.md'
text = 'machine learning - hello world'

tokenizer = Tokenizer(vocab_file='test.vocab')
tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = tokenizer.tokenize(text)
print(tokenized, tokenizer.detokenize(tokenized))

char_tokenizer = CharTokenizer(vocab_file='test_char.vocab')
char_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = char_tokenizer.tokenize(text)
print(tokenized, char_tokenizer.detokenize(tokenized))

bpe_tokenizer = BPETokenizer('test_bpe.codes', 'test_bpe.vocab',
                             num_symbols=100)
bpe_tokenizer.learn_bpe([test_file], from_filenames=True)
bpe_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = bpe_tokenizer.tokenize(text)
print(tokenized, bpe_tokenizer.detokenize(tokenized))

from seq2seq.tools.tokenizer import Tokenizer, BPETokenizer, CharTokenizer

test_file = '../../README.md'
text = 'machine learning - hello world'

tokenizer = Tokenizer(vocab_file='test.vocab')
tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = tokenizer.tokenize(text)
print(tokenized, tokenizer.detokenize(tokenized))

char_tokenizer = CharTokenizer(vocab_file='test_char.vocab')
char_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = char_tokenizer.tokenize(text)
print(tokenized, char_tokenizer.detokenize(tokenized))

bpe_tokenizer = BPETokenizer('test_bpe.codes', 'test_bpe.vocab',
                             num_symbols=100, use_moses=True)
bpe_tokenizer.learn_bpe([test_file], from_filenames=True)
bpe_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = bpe_tokenizer.tokenize(text)
print(tokenized, bpe_tokenizer.detokenize(tokenized))

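# Hedged follow-up sketch: the dataset methods above reload their vocabularies
# with a size cap via load_vocab(path, limit=...). Reusing the 'test.vocab'
# file built in the example and assuming the same signature seen in those
# methods (the limit value of 50 is an arbitrary illustration).
capped_tokenizer = Tokenizer(vocab_file='test.vocab')
capped_tokenizer.load_vocab('test.vocab', limit=50)
tokenized = capped_tokenizer.tokenize(text)
print(tokenized, capped_tokenizer.detokenize(tokenized))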