def __init__(self,
             encoder_model_dimension: int,
             decoder_model_dimension: int,
             dropout_rate: float,
             decoder_num_heads: list,
             encoder_num_heads: list,
             encoder_maximum_position_encoding: int,
             decoder_maximum_position_encoding: int,
             postnet_conv_filters: int,
             postnet_conv_layers: int,
             postnet_kernel_size: int,
             encoder_dense_blocks: int,
             decoder_dense_blocks: int,
             mel_channels: int,
             phoneme_language: str,
             with_stress: bool,
             encoder_attention_conv_filters: int = None,
             decoder_attention_conv_filters: int = None,
             encoder_attention_conv_kernel: int = None,
             decoder_attention_conv_kernel: int = None,
             encoder_feed_forward_dimension: int = None,
             decoder_feed_forward_dimension: int = None,
             debug=False,
             decoder_prenet_dropout=0.,
             **kwargs):
    """Create the sub-layers and input signatures of the forward model.

    Args:
        encoder_model_dimension: output size of the token embedding; also
            the ``model_dim`` of the encoder blocks, the duration
            predictor and the expand layer.
        decoder_model_dimension: ``model_dim`` of the decoder prenet and
            of the decoder blocks.
        dropout_rate: ``dropout_rate`` of the encoder and decoder blocks.
        decoder_num_heads: ``num_heads`` of the decoder blocks.
        encoder_num_heads: ``num_heads`` of the encoder blocks.
        encoder_maximum_position_encoding: ``maximum_position_encoding``
            of the encoder blocks.
        decoder_maximum_position_encoding: ``maximum_position_encoding``
            of the decoder blocks.
        postnet_conv_filters: ``hidden_size`` of the postnet.
        postnet_conv_layers: ``n_layers`` of the postnet.
        postnet_kernel_size: ``kernel_size`` of the postnet.
        encoder_dense_blocks: ``dense_blocks`` of the encoder blocks.
        decoder_dense_blocks: ``dense_blocks`` of the decoder blocks.
        mel_channels: units of the output dense layer, ``out_size`` of
            the postnet and last dimension of the mel input signature.
        phoneme_language: forwarded to ``Pipeline.default_pipeline``.
        with_stress: forwarded to ``Pipeline.default_pipeline``.
        encoder_attention_conv_filters: ``conv_filters`` of the encoder.
        decoder_attention_conv_filters: ``conv_filters`` of the decoder.
        encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
        decoder_attention_conv_kernel: ``kernel_size`` of the decoder.
        encoder_feed_forward_dimension: ``feed_forward_dimension`` of the
            encoder blocks.
        decoder_feed_forward_dimension: ``feed_forward_dimension`` of the
            decoder blocks and ``dense_hidden_units`` of the decoder
            prenet.
        debug: stored as ``self.debug``.
        decoder_prenet_dropout: ``dropout_rate`` of the decoder prenet.
        **kwargs: forwarded to the parent class constructor.
    """
    super(ForwardTransformer, self).__init__(**kwargs)
    self.text_pipeline = Pipeline.default_pipeline(phoneme_language,
                                                   add_start_end=False,
                                                   with_stress=with_stress)
    self.drop_n_heads = 0
    self.mel_channels = mel_channels

    # NOTE(review): the sub-layers below are created in the same order as
    # before; presumably weight ordering depends on it - confirm before
    # reordering.
    vocab_size = self.text_pipeline.tokenizer.vocab_size
    self.encoder_prenet = tf.keras.layers.Embedding(vocab_size,
                                                    encoder_model_dimension,
                                                    name='Embedding')
    self.encoder = SelfAttentionBlocks(
        model_dim=encoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=encoder_num_heads,
        feed_forward_dimension=encoder_feed_forward_dimension,
        maximum_position_encoding=encoder_maximum_position_encoding,
        dense_blocks=encoder_dense_blocks,
        conv_filters=encoder_attention_conv_filters,
        kernel_size=encoder_attention_conv_kernel,
        conv_activation='relu',
        name='Encoder')
    self.dur_pred = DurationPredictor(model_dim=encoder_model_dimension,
                                      kernel_size=3,
                                      conv_padding='same',
                                      conv_activation='relu',
                                      conv_block_n=2,
                                      dense_activation='relu',
                                      name='dur_pred')
    self.expand = Expand(name='expand', model_dim=encoder_model_dimension)
    self.decoder_prenet = DecoderPrenet(
        model_dim=decoder_model_dimension,
        dense_hidden_units=decoder_feed_forward_dimension,
        dropout_rate=decoder_prenet_dropout,
        name='DecoderPrenet')
    self.decoder = SelfAttentionBlocks(
        model_dim=decoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=decoder_num_heads,
        feed_forward_dimension=decoder_feed_forward_dimension,
        maximum_position_encoding=decoder_maximum_position_encoding,
        dense_blocks=decoder_dense_blocks,
        conv_filters=decoder_attention_conv_filters,
        kernel_size=decoder_attention_conv_kernel,
        conv_activation='relu',
        name='Decoder')
    self.out = tf.keras.layers.Dense(mel_channels)
    self.decoder_postnet = CNNResNorm(out_size=mel_channels,
                                      kernel_size=postnet_kernel_size,
                                      padding='same',
                                      inner_activation='tanh',
                                      last_activation='linear',
                                      hidden_size=postnet_conv_filters,
                                      n_layers=postnet_conv_layers,
                                      normalization='batch',
                                      name='Postnet')

    # Specs that appear in the signatures below.
    int_matrix_spec = tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    mel_spec = tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32)
    float_scalar_spec = tf.TensorSpec(shape=(), dtype=tf.float32)

    self.training_input_signature = [int_matrix_spec, mel_spec, int_matrix_spec]
    self.forward_input_signature = [int_matrix_spec, float_scalar_spec]

    # debug is set before the signatures are applied, as in the original.
    self.debug = debug
    self._apply_all_signatures()
def __init__(self,
             encoder_model_dimension: int,
             decoder_model_dimension: int,
             encoder_num_heads: list,
             decoder_num_heads: list,
             encoder_maximum_position_encoding: int,
             decoder_maximum_position_encoding: int,
             encoder_dense_blocks: int,
             decoder_dense_blocks: int,
             encoder_prenet_dimension: int,
             decoder_prenet_dimension: int,
             postnet_conv_filters: int,
             postnet_conv_layers: int,
             postnet_kernel_size: int,
             dropout_rate: float,
             mel_start_value: int,
             mel_end_value: int,
             mel_channels: int,
             xvec_channels: int,
             phoneme_language: str,
             with_stress: bool,
             encoder_attention_conv_filters: int = None,
             decoder_attention_conv_filters: int = None,
             encoder_attention_conv_kernel: int = None,
             decoder_attention_conv_kernel: int = None,
             encoder_feed_forward_dimension: int = None,
             decoder_feed_forward_dimension: int = None,
             decoder_prenet_dropout=0.5,
             max_r: int = 10,
             debug=False,
             **kwargs):
    """Create the sub-layers and input signatures of the autoregressive model.

    Args:
        encoder_model_dimension: ``model_dim`` of the encoder blocks and
            last dimension of the first decoder-signature tensor.
        decoder_model_dimension: ``model_dim`` of the decoder prenet and
            of the decoder blocks.
        encoder_num_heads: ``num_heads`` of the encoder blocks.
        decoder_num_heads: ``num_heads`` of the decoder blocks.
        encoder_maximum_position_encoding: ``maximum_position_encoding``
            of the encoder blocks.
        decoder_maximum_position_encoding: ``maximum_position_encoding``
            of the decoder blocks.
        encoder_dense_blocks: ``dense_blocks`` of the encoder blocks.
        decoder_dense_blocks: ``dense_blocks`` of the decoder blocks.
        encoder_prenet_dimension: output size of the token embedding.
        decoder_prenet_dimension: ``dense_hidden_units`` of the decoder
            prenet.
        postnet_conv_filters: ``conv_filters`` of the postnet.
        postnet_conv_layers: ``conv_layers`` of the postnet.
        postnet_kernel_size: ``kernel_size`` of the postnet.
        dropout_rate: ``dropout_rate`` of the encoder and decoder blocks.
        mel_start_value: fill value of ``self.start_vec``.
        mel_end_value: fill value of ``self.end_vec``.
        mel_channels: width of the start/end vectors, factor of the final
            projection size and last dimension of the mel signatures.
        xvec_channels: last dimension of the x-vector signatures.
        phoneme_language: forwarded to ``Pipeline.default_pipeline``.
        with_stress: forwarded to ``Pipeline.default_pipeline``.
        encoder_attention_conv_filters: ``conv_filters`` of the encoder.
        decoder_attention_conv_filters: ``conv_filters`` of the decoder.
        encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
        decoder_attention_conv_kernel: ``conv_kernel`` of the decoder.
        encoder_feed_forward_dimension: ``feed_forward_dimension`` of the
            encoder blocks.
        decoder_feed_forward_dimension: ``feed_forward_dimension`` of the
            decoder blocks.
        decoder_prenet_dropout: ``dropout_rate`` of the decoder prenet.
        max_r: stored as ``self.max_r`` and as the initial ``self.r``;
            the final projection has ``mel_channels * max_r`` units.
        debug: stored as ``self.debug``.
        **kwargs: forwarded to the parent class constructor.
    """
    super(AutoregressiveTransformer, self).__init__(**kwargs)

    # Constant (1, mel_channels) rows filled with the start / end values.
    unit_row = (1, mel_channels)
    self.start_vec = tf.ones(unit_row, dtype=tf.float32) * mel_start_value
    self.end_vec = tf.ones(unit_row, dtype=tf.float32) * mel_end_value
    self.stop_prob_index = 2
    self.max_r = max_r
    self.r = max_r
    self.mel_channels = mel_channels
    self.drop_n_heads = 0
    self.text_pipeline = Pipeline.default_pipeline(phoneme_language,
                                                   add_start_end=True,
                                                   with_stress=with_stress)

    # NOTE(review): the sub-layers below are created in the same order as
    # before; presumably weight ordering depends on it - confirm before
    # reordering.
    vocab_size = self.text_pipeline.tokenizer.vocab_size
    self.encoder_prenet = tf.keras.layers.Embedding(vocab_size,
                                                    encoder_prenet_dimension,
                                                    name='Embedding')
    # NOTE(review): the speaker-module dims are hard-coded and not derived
    # from the model dimensions - confirm they match the configuration.
    self.enc_speaker_mod = enc_Speaker_module(dim=512)
    self.dec_speaker_mod = dec_Speaker_module(dim=256)
    self.encoder = SelfAttentionBlocks(
        model_dim=encoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=encoder_num_heads,
        feed_forward_dimension=encoder_feed_forward_dimension,
        maximum_position_encoding=encoder_maximum_position_encoding,
        dense_blocks=encoder_dense_blocks,
        conv_filters=encoder_attention_conv_filters,
        kernel_size=encoder_attention_conv_kernel,
        conv_activation='relu',
        name='Encoder')
    self.decoder_prenet = DecoderPrenet(
        model_dim=decoder_model_dimension,
        dense_hidden_units=decoder_prenet_dimension,
        dropout_rate=decoder_prenet_dropout,
        name='DecoderPrenet')
    self.decoder = CrossAttentionBlocks(
        model_dim=decoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=decoder_num_heads,
        feed_forward_dimension=decoder_feed_forward_dimension,
        maximum_position_encoding=decoder_maximum_position_encoding,
        dense_blocks=decoder_dense_blocks,
        conv_filters=decoder_attention_conv_filters,
        conv_kernel=decoder_attention_conv_kernel,
        conv_activation='relu',
        conv_padding='causal',
        name='Decoder')
    self.final_proj_mel = tf.keras.layers.Dense(self.mel_channels * self.max_r,
                                                name='FinalProj')
    self.decoder_postnet = Postnet(mel_channels=mel_channels,
                                   conv_filters=postnet_conv_filters,
                                   conv_layers=postnet_conv_layers,
                                   kernel_size=postnet_kernel_size,
                                   name='Postnet')

    # Specs that appear in the signatures below.
    int_matrix_spec = tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    mel_spec = tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32)
    xvec_spec = tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
    encoder_out_spec = tf.TensorSpec(shape=(None, None, encoder_model_dimension),
                                     dtype=tf.float32)
    rank4_float_spec = tf.TensorSpec(shape=(None, None, None, None),
                                     dtype=tf.float32)

    self.training_input_signature = [int_matrix_spec, mel_spec,
                                     int_matrix_spec, xvec_spec]
    self.forward_input_signature = [int_matrix_spec, mel_spec, xvec_spec]
    self.encoder_signature = [int_matrix_spec, xvec_spec]
    self.decoder_signature = [encoder_out_spec, mel_spec,
                              rank4_float_spec, xvec_spec]

    # debug is set before the signatures are applied, as in the original.
    self.debug = debug
    self._apply_all_signatures()
# Lines left for training once the held-out lines are subtracted.
train_len = metadata_len - test_len

# Report what was read and where the outputs will be written.
print(f'\nReading metadata from {metadatareader.metadata_path}')
print(f'\nRemoving {len(remove_files)} lines out of {metadata_len_tot}.')
print(f'\nMetadata contains {metadata_len} lines.')
print(f'Files will be stored under {cm.data_dir}')
print(f' - all: {phonemized_metadata_path}')
print(f' - {train_len} training lines: {train_metadata_path}')
print(f' - {test_len} validation lines: {test_metadata_path}')

# Show the raw text of a few sample items.
print('\nMetadata samples:')
for i in sample_items:
    print(f'{i}:{metadatareader.text_dict[i]}')

# run cleaner on raw text
text_proc = Pipeline.default_training_pipeline(
    cm.config['phoneme_language'], add_start_end=False, with_stress=cm.config['with_stress'])
# Texts are collected in the order of metadatareader.filenames so they can
# be zipped back to their file names after cleaning.
texts = [metadatareader.text_dict[k] for k in metadatareader.filenames]
clean_texts = text_proc.cleaner(list(texts))
clean_texts = dict(zip(metadatareader.filenames, clean_texts))
key_list = list(clean_texts.keys())

# Show the same sample items after cleaning.
print('\nCleaned metadata samples:')
for i in sample_items:
    print(f'{i}:{clean_texts[i]}')

print('\nPHONEMIZING')
batch_size = args.phonemizer_batch_size
failed_files = []
phonemized_data = {}
# NOTE(review): the upper bound len(key_list) + batch_size makes the range
# always end with at least one start index >= len(key_list), so the last
# batch_keys slice is empty - confirm the rest of the loop tolerates that.
for i in tqdm.tqdm(range(0, len(key_list) + batch_size, batch_size)):
    batch_keys = key_list[i:i + batch_size]
# Load the data configuration and derive the input/output locations from it.
yaml = ruamel.yaml.YAML()
with open(str(Path(args.CONFIG) / 'data_config.yaml'), 'rb') as conf_yaml:
    config = yaml.load(conf_yaml)
args.DATA_DIR = config['data_directory']
args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
args.TARGET_DIR = config['train_data_directory']
# Fall back to the data directory when no training directory is configured.
if args.TARGET_DIR is None:
    args.TARGET_DIR = args.DATA_DIR

mel_dir = os.path.join(args.TARGET_DIR, 'mels')
os.makedirs(mel_dir, exist_ok=True)

phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
text_proc = Pipeline.default_pipeline(config['phoneme_language'], add_start_end=True)
if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
    print('Using cached phonemes.')
    audio_data = np.load(phon_path)
else:
    print('\nLoading and cleaning text')
    audio_data = []
    wav_suffix = '.wav'
    with open(args.META_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            # First column is the file name, last column is the text.
            fields = line.split(args.COLUMN_SEP)
            filename, text = fields[0], fields[-1]
            if filename.endswith(wav_suffix):
                # Drop only the trailing extension. The previous
                # split('.')[-1] kept the extension itself ('wav') and
                # discarded the actual file name.
                filename = filename[:-len(wav_suffix)]
            text = text_proc.cleaner(text)
            audio_data.append((filename, text))
# Derive the input/output locations from the loaded data configuration.
args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
args.TARGET_DIR = config['train_data_directory']
# Fall back to the data directory when no training directory is configured.
if args.TARGET_DIR is None:
    args.TARGET_DIR = args.DATA_DIR

mel_dir = os.path.join(args.TARGET_DIR, 'mels')
print(mel_dir)
print(args.TARGET_DIR)
os.makedirs(mel_dir, exist_ok=True)

phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
print('phon_path:', phon_path)
# with_stress must be a real boolean: the previous string 'False' is
# non-empty and therefore truthy, which enabled stress by accident.
text_proc = Pipeline.default_training_pipeline(config['phoneme_language'],
                                               add_start_end=True,
                                               with_stress=False)
if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
    print('Using cached phonemes.')
    audio_data = np.load(phon_path)
else:
    print('\nLoading and cleaning text')
    audio_data = []
    # newline='' is what the csv module asks for, so that it handles line
    # endings itself.
    with open(args.META_FILE, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            if not row:
                continue
            # First column is the file name, second column is the text.
            filename, text = row[0], row[1]
            text = text_proc.cleaner(text)
            audio_data.append((filename, text))
    audio_data = np.array(audio_data)