def generate_tokenizer(self):
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS('en')]

        if self.tokenization == 'bpe':
            tokz = BPETokenizer(self.code_file,
                                vocab_file=self.vocab_file,
                                num_symbols=self.num_symbols,
                                additional_tokens=additional_tokens,
                                pre_tokenize=self.pre_tokenize)
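            # If no BPE codes were loaded (no 'bpe' attribute), learn them from this split's captions.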
            if not hasattr(tokz, 'bpe'):
                sentences = (self.captions[i] for i in self.indexes)
                tokz.learn_bpe(sentences, from_filenames=False)
        else:
            tokz = self.__tokenizers[self.tokenization](
                vocab_file=self.vocab_file,
                additional_tokens=additional_tokens,
                pre_tokenize=self.pre_tokenize)

        if not hasattr(tokz, 'vocab'):
            assert self.split == 'train', "better generate vocab for training split"
            sentences = (self.captions[i] for i in self.indexes)
            logging.info('generating vocabulary. saving to %s' %
                         self.vocab_file)
            tokz.get_vocab(sentences, from_filenames=False)
            tokz.save_vocab(self.vocab_file)
        tokz.load_vocab(self.vocab_file,
                        limit=self.vocab_limit,
                        min_count=self.vocab_min_count)
        self.tokenizer = tokz
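
The snippet above reads everything it needs from attributes on `self`. For context, here is a minimal sketch of the owner class; the attribute names come from the snippet itself, while the class name, the `__tokenizers` mapping, and the default values are assumptions, not the library's actual definition:

from seq2seq.tools.tokenizer import Tokenizer, CharTokenizer

class CaptionDataset(object):
    # Hypothetical skeleton: only what generate_tokenizer() reads from self is listed here.
    __tokenizers = {'word': Tokenizer, 'char': CharTokenizer}  # assumed mapping, used via self.__tokenizers

    def __init__(self, code_file, vocab_file, split='train', tokenization='bpe',
                 num_symbols=32000, vocab_limit=None, vocab_min_count=1,
                 pre_tokenize=None, mark_language=False):
        self.code_file = code_file            # BPE merge codes (learned above if missing)
        self.vocab_file = vocab_file          # vocabulary file, generated on the train split
        self.split = split
        self.tokenization = tokenization      # 'bpe' or a key of __tokenizers
        self.num_symbols = num_symbols        # number of BPE merge operations (default is a guess)
        self.vocab_limit = vocab_limit        # cap on vocabulary size when loading
        self.vocab_min_count = vocab_min_count
        self.pre_tokenize = pre_tokenize
        self.mark_language = mark_language    # prepend LANGUAGE_TOKENS('en') when True
        self.captions = []                    # caption strings, filled by the dataset loader
        self.indexes = []                     # caption indices belonging to this split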
Example #2
    def generate_tokenizers(self):
        self.tokenizers = OrderedDict()
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
        for l in self.languages:
            if self.shared_vocab:
                files = [self.input_files[t] for t in self.languages]
            else:
                files = [self.input_files[l]]

            if self.tokenization == 'bpe':
                tokz = BPETokenizer(self.code_files[l],
                                    vocab_file=self.vocab_files[l],
                                    num_symbols=self.num_symbols,
                                    additional_tokens=additional_tokens)
                if not hasattr(tokz, 'bpe'):
                    tokz.learn_bpe(files)
            else:
                tokz = self.__tokenizers[self.tokenization](
                    vocab_file=self.vocab_files[l],
                    vocab_limit=self.vocab_limit,
                    additional_tokens=additional_tokens)

            if not hasattr(tokz, 'vocab'):
                logging.info('generating vocabulary. saving to %s' %
                             self.vocab_files[l])
                tokz.get_vocab(files)
                tokz.save_vocab(self.vocab_files[l])
            tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
            self.tokenizers[l] = tokz
Example #3
    def generate_tokenizers(self):
        self.tokenizers = OrderedDict()
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
        for l in self.languages:
            if self.shared_vocab:
                files = [self.input_files[t] for t in self.languages]
            else:
                files = [self.input_files[l]]

            if self.tokenization == 'bpe':
                tokz = BPETokenizer(self.code_files[l],
                                    vocab_file=self.vocab_files[l],
                                    num_symbols=self.num_symbols,
                                    additional_tokens=additional_tokens)
                if not hasattr(tokz, 'bpe'):
                    tokz.learn_bpe(files)
            else:
                tokz = self.__tokenizers[self.tokenization](
                    vocab_file=self.vocab_files[l],
                    additional_tokens=additional_tokens)

            if not hasattr(tokz, 'vocab'):
                logging.info('generating vocabulary. saving to %s' %
                             self.vocab_files[l])
                tokz.get_vocab(files)
                tokz.save_vocab(self.vocab_files[l])
            tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
            self.tokenizers[l] = tokz
Example #4
    def generate_tokenizer(self):
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS('en')]

        if self.tokenization == 'bpe':
            tokz = BPETokenizer(self.code_file,
                                vocab_file=self.vocab_file,
                                num_symbols=self.num_symbols,
                                additional_tokens=additional_tokens)
            if not hasattr(tokz, 'bpe'):
                sentences = (d['caption']
                             for d in self.data.coco.anns.values())
                tokz.learn_bpe(sentences, from_filenames=False)
        else:
            tokz = self.__tokenizers[self.tokenization](
                vocab_file=self.vocab_file,
                additional_tokens=additional_tokens)

        if not hasattr(tokz, 'vocab'):
            sentences = (d['caption'] for d in self.data.coco.anns.values())
            logging.info('generating vocabulary. saving to %s' %
                         self.vocab_file)
            tokz.get_vocab(sentences, from_filenames=False)
            tokz.save_vocab(self.vocab_file)
        self.tokenizer = tokz
Example #5
    def generate_tokenizer(self):
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS('en')]

        if self.tokenization == 'bpe':
            tokz = BPETokenizer(self.code_file,
                                vocab_file=self.vocab_file,
                                num_symbols=self.num_symbols,
                                additional_tokens=additional_tokens,
                                pre_tokenize=self.pre_tokenize)
            if not hasattr(tokz, 'bpe'):
                sentences = (d['caption']
                             for d in self.data.coco.anns.values())
                tokz.learn_bpe(sentences, from_filenames=False)
        else:
            tokz = self.__tokenizers[self.tokenization](
                vocab_file=self.vocab_file,
                additional_tokens=additional_tokens,
                pre_tokenize=self.pre_tokenize)

        if not hasattr(tokz, 'vocab'):
            sentences = (d['caption'] for d in self.data.coco.anns.values())
            logging.info('generating vocabulary. saving to %s' %
                         self.vocab_file)
            tokz.get_vocab(sentences, from_filenames=False)
            tokz.save_vocab(self.vocab_file)
        self.tokenizer = tokz
Example #6
    def generate_tokenizers(self):
        self.tokenizers = OrderedDict()
        additional_tokens = None
        if self.mark_language:
            additional_tokens = [LANGUAGE_TOKENS(l) for l in self.languages]
        for l in self.languages:
            if self.shared_vocab:
                files = [self.input_files[t] for t in self.languages]
            else:
                files = [self.input_files[l]]
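            # Dispatch on the tokenization scheme; sentencepiece and word+char need dedicated setup.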
            if self.tokenization == 'sentencepiece':
                tokz = SentencePiece(self.model_prefixes[l],
                                     num_symbols=self.num_symbols,
                                     additional_tokens=additional_tokens)
                if getattr(tokz, 'model', None) is None:
                    tokz.learn_model(files)
            elif self.tokenization == 'word+char':
                word_vocab_file = self.vocab_files[l] + '.word'
                char_vocab_file = self.vocab_files[l] + '.char'
                tokz = WordCharTokenizer(
                    word_vocab_file=word_vocab_file,
                    char_vocab_file=char_vocab_file,
                    word_additional_tokens=additional_tokens,
                    char_additional_tokens=additional_tokens)

                if not hasattr(tokz.word_tokenizer, 'vocab'):
                    logging.info('generating vocabulary. saving to %s' %
                                 self.vocab_files[l])
                    tokz.get_vocab(files)
                    tokz.save_vocab(word_vocab_file, char_vocab_file)
                tokz.load_vocab(word_vocab_file,
                                char_vocab_file,
                                limit=self.vocab_limit)
            else:
                if self.tokenization == 'bpe':
                    tokz = BPETokenizer(
                        self.code_files[l],
                        vocab_file=self.vocab_files[l],
                        num_symbols=self.num_symbols,
                        additional_tokens=additional_tokens,
                        use_moses=l if self.use_moses else None)
                    if not hasattr(tokz, 'bpe'):
                        tokz.learn_bpe(files)
                else:
                    tokz = self.__tokenizers[self.tokenization](
                        vocab_file=self.vocab_files[l],
                        additional_tokens=additional_tokens,
                        use_moses=l if self.use_moses else None)

                if not hasattr(tokz, 'vocab'):
                    logging.info('generating vocabulary. saving to %s' %
                                 self.vocab_files[l])
                    tokz.get_vocab(files)
                    tokz.save_vocab(self.vocab_files[l])
                tokz.load_vocab(self.vocab_files[l], limit=self.vocab_limit)
            self.tokenizers[l] = tokz
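
Once generate_tokenizers() has run, self.tokenizers maps each language to a ready tokenizer. A minimal, hypothetical usage sketch follows; `dataset` stands for an instance of whichever class owns the method, and tokenize/detokenize follow the interface shown in the standalone scripts below:

dataset.generate_tokenizers()
for lang, tokz in dataset.tokenizers.items():
    ids = tokz.tokenize('machine learning - hello world')
    print(lang, ids, tokz.detokenize(ids))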
Example #7
from seq2seq.tools.tokenizer import Tokenizer, BPETokenizer, CharTokenizer

test_file = '../../README.md'
text = 'machine learning - hello world'

tokenizer = Tokenizer(vocab_file='test.vocab')
tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = tokenizer.tokenize(text)
print(tokenized, tokenizer.detokenize(tokenized))

char_tokenizer = CharTokenizer(vocab_file='test_char.vocab')
char_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = char_tokenizer.tokenize(text)
print(tokenized, char_tokenizer.detokenize(tokenized))

bpe_tokenizer = BPETokenizer('test_bpe.codes', 'test_bpe.vocab', num_symbols=100)
bpe_tokenizer.learn_bpe([test_file], from_filenames=True)
bpe_tokenizer.get_vocab([test_file], from_filenames=True)

tokenized = bpe_tokenizer.tokenize(text)
print(tokenized, bpe_tokenizer.detokenize(tokenized))
Example #8
from seq2seq.tools.tokenizer import Tokenizer, BPETokenizer, CharTokenizer

test_file = '../../README.md'
text = 'machine learning - hello world'

tokenizer = Tokenizer(vocab_file='test.vocab')
tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = tokenizer.tokenize(text)
print(tokenized, tokenizer.detokenize(tokenized))

char_tokenizer = CharTokenizer(vocab_file='test_char.vocab')
char_tokenizer.get_vocab([test_file], from_filenames=True)
tokenized = char_tokenizer.tokenize(text)
print(tokenized, char_tokenizer.detokenize(tokenized))

bpe_tokenizer = BPETokenizer('test_bpe.codes', 'test_bpe.vocab', num_symbols=100, use_moses=True)
bpe_tokenizer.learn_bpe([test_file], from_filenames=True)
bpe_tokenizer.get_vocab([test_file], from_filenames=True)

tokenized = bpe_tokenizer.tokenize(text)
print(tokenized, bpe_tokenizer.detokenize(tokenized))
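
The two scripts above only exercise the word, character, and BPE tokenizers. Below is a rough sketch of the same pattern for the SentencePiece and word+char tokenizers used in Example #6; the import location, constructor arguments, and the tokenize/detokenize interface are assumptions inferred from the snippets above, not verified against the library:

from seq2seq.tools.tokenizer import SentencePiece, WordCharTokenizer  # assumed module path

test_file = '../../README.md'
text = 'machine learning - hello world'

# SentencePiece: learn a subword model from the file, then round-trip some text.
sp_tokenizer = SentencePiece('test_sp', num_symbols=100)
sp_tokenizer.learn_model([test_file])
tokenized = sp_tokenizer.tokenize(text)
print(tokenized, sp_tokenizer.detokenize(tokenized))

# Word+char: build and save both vocabularies, reload them, then round-trip.
wc_tokenizer = WordCharTokenizer(word_vocab_file='test.vocab.word',
                                 char_vocab_file='test.vocab.char')
wc_tokenizer.get_vocab([test_file])
wc_tokenizer.save_vocab('test.vocab.word', 'test.vocab.char')
wc_tokenizer.load_vocab('test.vocab.word', 'test.vocab.char')
tokenized = wc_tokenizer.tokenize(text)
print(tokenized, wc_tokenizer.detokenize(tokenized))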