# Assumed imports: the snippets below follow the NVIDIA NeMo toolkit layout.
import os
import tempfile

import tokenizers
import youtokentome as yttm

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model
from nemo.utils import logging


def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    lower_case: bool,
):
    """
    Trains a tokenizer (SentencePiece or WordPiece) on the given text corpus.

    Args:
        text_path: source file with text lines
        dst_folder: folder where the tokenizer model will be stored
        vocab_size: vocabulary size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set to 1.0
        lower_case: whether to tokenize with a lower-case character set only (for English)

    Returns:
        The path to the directory containing the trained tokenizer.
    """
    if tokenizer_type == 'spe':
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(tokenizer_type, spe_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file!")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=-1,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
        )
    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir

def __process_data(text_path: str, dst_folder: str, vocab_size: int, tokenizer_type: str, spe_type: str):
    """
    Trains a tokenizer (SentencePiece or WordPiece) on the given text corpus.

    Args:
        text_path: source file with text lines
        dst_folder: folder where the tokenizer model will be stored
        vocab_size: vocabulary size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe

    Returns:
        The path to the directory containing the trained tokenizer.
    """
    tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)

    if not os.path.exists(tokenizer_dir):
        os.makedirs(tokenizer_dir)

    if tokenizer_type == 'spe':
        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file!")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=-1,
            do_lower_case=True,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
        )
    else:
        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=True)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir

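# A minimal usage sketch (not part of the original source): how the function above could be
# called to train a WordPiece ('wpe') tokenizer. All paths and sizes here are hypothetical.
def _example_train_wordpiece_tokenizer():
    tokenizer_dir = __process_data(
        text_path='train_text.txt',   # hypothetical corpus, one sentence per line
        dst_folder='tokenizers',      # hypothetical output folder
        vocab_size=1024,
        tokenizer_type='wpe',         # selects the BertWordPieceTokenizer branch
        spe_type='bpe',               # unused on the 'wpe' path, but required by the signature
    )
    print('WordPiece tokenizer saved to', tokenizer_dir)
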
def train_tokenizers(
    out_dir,
    src_fname,
    tgt_fname,
    shared_tokenizer,
    encoder_tokenizer_name,
    encoder_tokenizer_vocab_size,
    encoder_tokenizer_coverage,
    decoder_tokenizer_name,
    decoder_tokenizer_vocab_size,
    decoder_tokenizer_coverage,
    global_rank,
    encoder_training_sample_size=-1,
    decoder_training_sample_size=-1,
    encoder_special_tokens=None,
    decoder_special_tokens=None,
    spt_symbols=None,
    multilingual=False,
):
    encoder_tokenizer_model = None
    decoder_tokenizer_model = None
    os.makedirs(out_dir, exist_ok=True)

    supported_train_tokenizers = ['yttm', 'sentencepiece']

    if encoder_special_tokens:
        if isinstance(encoder_special_tokens, dict):
            encoder_special_tokens = list(encoder_special_tokens.values())
            print(encoder_special_tokens)

    if decoder_special_tokens:
        if isinstance(decoder_special_tokens, dict):
            decoder_special_tokens = list(decoder_special_tokens.values())

    if multilingual and encoder_tokenizer_name != 'sentencepiece':
        raise NotImplementedError(
            f"Currently we only support training a sentencepiece tokenizer for the multilingual model."
        )

    if shared_tokenizer:
        if (
            encoder_tokenizer_name not in supported_train_tokenizers
            or decoder_tokenizer_name not in supported_train_tokenizers
        ):
            raise NotImplementedError(
                f"Currently we only support tokenizers in {supported_train_tokenizers} for shared tokenizer."
            )

        encoder_tokenizer_model = os.path.join(
            out_dir, 'shared_tokenizer.%d.BPE.model' % (encoder_tokenizer_vocab_size)
        )
        decoder_tokenizer_model = encoder_tokenizer_model

        if global_rank == 0:
            if os.path.isfile(encoder_tokenizer_model):
                logging.info(
                    f'Shared tokenizer model {encoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                )
            else:
                logging.info(
                    f'Shared tokenizer model {encoder_tokenizer_model} not found. Training tokenizer model.'
                )
                with tempfile.TemporaryDirectory() as tmp:
                    concat_data_path = os.path.join(tmp, 'concat_dataset.txt')
                    os.system('cat %s %s > %s' % (src_fname, tgt_fname, concat_data_path))

                    if encoder_tokenizer_name == "yttm":
                        yttm.BPE.train(
                            data=concat_data_path,
                            vocab_size=encoder_tokenizer_vocab_size,
                            model=os.path.join(out_dir, encoder_tokenizer_model),
                            coverage=encoder_tokenizer_coverage,
                            n_threads=-1,
                        )
                    else:
                        create_spt_model(
                            data_file=concat_data_path,
                            vocab_size=encoder_tokenizer_vocab_size,
                            sample_size=encoder_training_sample_size,
                            do_lower_case=False,
                            tokenizer_type='bpe',
                            character_coverage=encoder_tokenizer_coverage,
                            output_dir=out_dir,
                            bos=True,
                            eos=True,
                            pad=True,
                            control_symbols=spt_symbols,
                            user_defined_symbols=encoder_special_tokens,
                        )
                        os.rename(
                            os.path.join(out_dir, 'tokenizer.model'),
                            os.path.join(out_dir, encoder_tokenizer_model),
                        )
    else:
        if encoder_tokenizer_name in supported_train_tokenizers:
            encoder_tokenizer_model = os.path.join(
                out_dir, 'tokenizer.encoder.%d.BPE.model' % (encoder_tokenizer_vocab_size)
            )

            if global_rank == 0:
                if os.path.isfile(encoder_tokenizer_model):
                    logging.info(
                        f'Encoder tokenizer model {encoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                    )
                else:
                    logging.info(
                        f'Encoder tokenizer model {encoder_tokenizer_model} not found. Training tokenizer model.'
                    )

                    if encoder_tokenizer_name == "yttm":
                        yttm.BPE.train(
                            data=src_fname,
                            vocab_size=encoder_tokenizer_vocab_size,
                            model=encoder_tokenizer_model,
                            coverage=encoder_tokenizer_coverage,
                            n_threads=-1,
                        )
                    else:
                        dir_name = os.path.dirname(encoder_tokenizer_model)

                        create_spt_model(
                            data_file=src_fname,
                            vocab_size=encoder_tokenizer_vocab_size,
                            sample_size=encoder_training_sample_size,
                            do_lower_case=False,
                            tokenizer_type='bpe',
                            character_coverage=encoder_tokenizer_coverage,
                            output_dir=dir_name,
                            bos=True,
                            eos=True,
                            pad=True,
                            control_symbols=spt_symbols,
                            user_defined_symbols=encoder_special_tokens,
                        )
                        os.rename(os.path.join(dir_name, 'tokenizer.model'), encoder_tokenizer_model)

        if decoder_tokenizer_name in supported_train_tokenizers:
            decoder_tokenizer_model = os.path.join(
                out_dir, 'tokenizer.decoder.%d.BPE.model' % (decoder_tokenizer_vocab_size)
            )

            if global_rank == 0:
                if os.path.isfile(decoder_tokenizer_model):
                    logging.info(
                        f'Decoder tokenizer model {decoder_tokenizer_model} already exists. Remove file if training a new tokenizer model.'
                    )
                else:
                    logging.info(
                        f'Decoder tokenizer model {decoder_tokenizer_model} not found. Training tokenizer model.'
                    )

                    if decoder_tokenizer_name == "yttm":
                        yttm.BPE.train(
                            data=tgt_fname,
                            vocab_size=decoder_tokenizer_vocab_size,
                            model=decoder_tokenizer_model,
                            coverage=decoder_tokenizer_coverage,
                            n_threads=-1,
                        )
                    else:
                        dir_name = os.path.dirname(decoder_tokenizer_model)

                        create_spt_model(
                            data_file=tgt_fname,
                            vocab_size=decoder_tokenizer_vocab_size,
                            sample_size=decoder_training_sample_size,
                            do_lower_case=False,
                            tokenizer_type='bpe',
                            character_coverage=decoder_tokenizer_coverage,
                            output_dir=dir_name,
                            bos=True,
                            eos=True,
                            pad=True,
                            control_symbols=spt_symbols,
                            user_defined_symbols=decoder_special_tokens,
                        )
                        os.rename(os.path.join(dir_name, 'tokenizer.model'), decoder_tokenizer_model)

    return encoder_tokenizer_model, decoder_tokenizer_model

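# A minimal usage sketch (not part of the original source): training a single BPE tokenizer
# shared by an NMT encoder/decoder pair. File names, vocab size, and coverage are hypothetical.
def _example_train_shared_tokenizer():
    enc_model, dec_model = train_tokenizers(
        out_dir='nmt_tokenizers',          # hypothetical output directory
        src_fname='train.src.txt',         # hypothetical source-language corpus
        tgt_fname='train.tgt.txt',         # hypothetical target-language corpus
        shared_tokenizer=True,             # one tokenizer shared by encoder and decoder
        encoder_tokenizer_name='sentencepiece',
        encoder_tokenizer_vocab_size=32000,
        encoder_tokenizer_coverage=0.999,
        decoder_tokenizer_name='sentencepiece',
        decoder_tokenizer_vocab_size=32000,
        decoder_tokenizer_coverage=0.999,
        global_rank=0,                     # only rank 0 actually trains the model
    )
    # With shared_tokenizer=True both returned paths point at the same .model file.
    print(enc_model, dec_model)
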
def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    lower_case: bool,
):
    """
    Trains a tokenizer (SentencePiece or WordPiece) on the given text corpus.

    Args:
        text_path: source file with text lines
        dst_folder: folder where the tokenizer model will be stored
        vocab_size: vocabulary size used in encoding the text
        tokenizer_type: type of tokenization to perform - wpe or spe
        spe_type: type of tokenization model used for spe
        spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset,
            can be < 1.0, but for all other languages, it should be set to 1.0
        spe_train_extremely_large_corpus: bool. If the dataset is too large and the user has sufficient RAM,
            this flag can be set to attempt to train the tokenizer. Will silently fail if it runs out of RAM.
        spe_sample_size: int, default of -1. If a positive integer is used, the dataset is subsampled to the
            given sample size.
        spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be
            constructed. By default, no limit is placed.
        lower_case: whether to tokenize with a lower-case character set only (for English)

    Returns:
        The path to the directory containing the trained tokenizer.
    """
    if tokenizer_type == 'spe':
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_{}_v{}').format(tokenizer_type, spe_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overriding old model file!")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
        )
    else:
        tokenizer_dir = os.path.join(dst_folder, 'tokenizer_{}_v{}').format(tokenizer_type, vocab_size)

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir

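# A minimal usage sketch (not part of the original source): training a SentencePiece unigram
# tokenizer with the extended signature above. Paths and hyperparameters are hypothetical.
def _example_train_sentencepiece_tokenizer():
    tokenizer_dir = __process_data(
        text_path='train_text.txt',             # hypothetical corpus, one sentence per line
        dst_folder='tokenizers',                # hypothetical output folder
        vocab_size=1024,
        tokenizer_type='spe',                   # selects the SentencePiece branch
        spe_type='unigram',                     # SentencePiece model type (e.g. 'bpe' or 'unigram')
        spe_character_coverage=1.0,             # 1.0 for languages with a small character set
        spe_train_extremely_large_corpus=False,
        spe_sample_size=-1,                     # -1 disables subsampling of the corpus
        spe_max_sentencepiece_length=-1,        # -1 places no limit on subword length
        lower_case=True,
    )
    print('SentencePiece tokenizer saved to', tokenizer_dir)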