コード例 #1
0
 def __init__(self,
              encoder_model_dimension: int,
              decoder_model_dimension: int,
              dropout_rate: float,
              decoder_num_heads: list,
              encoder_num_heads: list,
              encoder_maximum_position_encoding: int,
              decoder_maximum_position_encoding: int,
              postnet_conv_filters: int,
              postnet_conv_layers: int,
              postnet_kernel_size: int,
              encoder_dense_blocks: int,
              decoder_dense_blocks: int,
              mel_channels: int,
              phoneme_language: str,
              with_stress: bool,
              encoder_attention_conv_filters: int = None,
              decoder_attention_conv_filters: int = None,
              encoder_attention_conv_kernel: int = None,
              decoder_attention_conv_kernel: int = None,
              encoder_feed_forward_dimension: int = None,
              decoder_feed_forward_dimension: int = None,
              debug: bool = False,
              decoder_prenet_dropout: float = 0.,
              **kwargs):
     """Create the sub-layers and input signatures of the forward model.

     The constructor builds, in this order: a text pipeline, a token
     embedding, an encoder stack, a duration predictor, an expansion layer,
     a decoder prenet, a decoder stack, a dense projection to
     ``mel_channels`` and a convolutional postnet. It then stores the
     ``tf.TensorSpec`` lists for the training and forward calls and invokes
     ``self._apply_all_signatures()``.

     Args:
         encoder_model_dimension: ``model_dim`` of the encoder stack, the
             duration predictor and the expand layer; also the output size
             of the token embedding.
         decoder_model_dimension: ``model_dim`` of the decoder prenet and
             the decoder stack.
         dropout_rate: ``dropout_rate`` of both attention stacks.
         decoder_num_heads: ``num_heads`` of the decoder stack.
         encoder_num_heads: ``num_heads`` of the encoder stack.
         encoder_maximum_position_encoding: ``maximum_position_encoding``
             of the encoder stack.
         decoder_maximum_position_encoding: ``maximum_position_encoding``
             of the decoder stack.
         postnet_conv_filters: ``hidden_size`` of the postnet.
         postnet_conv_layers: ``n_layers`` of the postnet.
         postnet_kernel_size: ``kernel_size`` of the postnet.
         encoder_dense_blocks: ``dense_blocks`` of the encoder stack.
         decoder_dense_blocks: ``dense_blocks`` of the decoder stack.
         mel_channels: Unit count of the output dense layer, ``out_size``
             of the postnet and last dimension of the mel ``TensorSpec``.
         phoneme_language: Language forwarded to
             ``Pipeline.default_pipeline``.
         with_stress: Stress flag forwarded to
             ``Pipeline.default_pipeline``.
         encoder_attention_conv_filters: ``conv_filters`` of the encoder.
         decoder_attention_conv_filters: ``conv_filters`` of the decoder.
         encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
         decoder_attention_conv_kernel: ``kernel_size`` of the decoder.
         encoder_feed_forward_dimension: ``feed_forward_dimension`` of the
             encoder stack.
         decoder_feed_forward_dimension: ``feed_forward_dimension`` of the
             decoder stack; also reused as ``dense_hidden_units`` of the
             decoder prenet.
         debug: Stored as ``self.debug``; its consumer is outside this
             method.
         decoder_prenet_dropout: ``dropout_rate`` of the decoder prenet.
         **kwargs: Forwarded unchanged to the parent constructor.
     """
     super(ForwardTransformer, self).__init__(**kwargs)
     # Text front end; this model asks for no start/end tokens.
     self.text_pipeline = Pipeline.default_pipeline(phoneme_language,
                                                    add_start_end=False,
                                                    with_stress=with_stress)
     # Initialised to 0 here; read elsewhere in the class (not visible).
     self.drop_n_heads = 0
     self.mel_channels = mel_channels
     # Embedding table sized by the tokenizer vocabulary.
     self.encoder_prenet = tf.keras.layers.Embedding(
         self.text_pipeline.tokenizer.vocab_size,
         encoder_model_dimension,
         name='Embedding')
     self.encoder = SelfAttentionBlocks(
         model_dim=encoder_model_dimension,
         dropout_rate=dropout_rate,
         num_heads=encoder_num_heads,
         feed_forward_dimension=encoder_feed_forward_dimension,
         maximum_position_encoding=encoder_maximum_position_encoding,
         dense_blocks=encoder_dense_blocks,
         conv_filters=encoder_attention_conv_filters,
         kernel_size=encoder_attention_conv_kernel,
         conv_activation='relu',
         name='Encoder')
     # Duration predictor hyper-parameters are fixed here rather than
     # exposed as constructor arguments.
     self.dur_pred = DurationPredictor(model_dim=encoder_model_dimension,
                                       kernel_size=3,
                                       conv_padding='same',
                                       conv_activation='relu',
                                       conv_block_n=2,
                                       dense_activation='relu',
                                       name='dur_pred')
     self.expand = Expand(name='expand', model_dim=encoder_model_dimension)
     # NOTE(review): the prenet hidden size reuses
     # decoder_feed_forward_dimension (default None) -- confirm that
     # DecoderPrenet accepts None.
     self.decoder_prenet = DecoderPrenet(
         model_dim=decoder_model_dimension,
         dense_hidden_units=decoder_feed_forward_dimension,
         dropout_rate=decoder_prenet_dropout,
         name='DecoderPrenet')
     self.decoder = SelfAttentionBlocks(
         model_dim=decoder_model_dimension,
         dropout_rate=dropout_rate,
         num_heads=decoder_num_heads,
         feed_forward_dimension=decoder_feed_forward_dimension,
         maximum_position_encoding=decoder_maximum_position_encoding,
         dense_blocks=decoder_dense_blocks,
         conv_filters=decoder_attention_conv_filters,
         kernel_size=decoder_attention_conv_kernel,
         conv_activation='relu',
         name='Decoder')
     # Dense projection to mel_channels outputs.
     self.out = tf.keras.layers.Dense(mel_channels)
     self.decoder_postnet = CNNResNorm(out_size=mel_channels,
                                       kernel_size=postnet_kernel_size,
                                       padding='same',
                                       inner_activation='tanh',
                                       last_activation='linear',
                                       hidden_size=postnet_conv_filters,
                                       n_layers=postnet_conv_layers,
                                       normalization='batch',
                                       name='Postnet')
     # Training signature: int32 rank-2, float32 rank-3 with mel_channels as
     # the last dimension, int32 rank-2. Presumably token ids, target mels
     # and durations -- confirm against the training step.
     self.training_input_signature = [
         tf.TensorSpec(shape=(None, None), dtype=tf.int32),
         tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
         tf.TensorSpec(shape=(None, None), dtype=tf.int32)
     ]
     # Forward signature: int32 rank-2 plus one float32 scalar (presumably
     # a duration/speed scale -- TODO confirm).
     self.forward_input_signature = [
         tf.TensorSpec(shape=(None, None), dtype=tf.int32),
         tf.TensorSpec(shape=(), dtype=tf.float32),
     ]
     self.debug = debug
     # Called last, after the signature lists and self.debug are set.
     self._apply_all_signatures()
コード例 #2
0
    def __init__(self,
                 encoder_model_dimension: int,
                 decoder_model_dimension: int,
                 encoder_num_heads: list,
                 decoder_num_heads: list,
                 encoder_maximum_position_encoding: int,
                 decoder_maximum_position_encoding: int,
                 encoder_dense_blocks: int,
                 decoder_dense_blocks: int,
                 encoder_prenet_dimension: int,
                 decoder_prenet_dimension: int,
                 postnet_conv_filters: int,
                 postnet_conv_layers: int,
                 postnet_kernel_size: int,
                 dropout_rate: float,
                 mel_start_value: int,
                 mel_end_value: int,
                 mel_channels: int,
                 xvec_channels: int,
                 phoneme_language: str,
                 with_stress: bool,
                 encoder_attention_conv_filters: int = None,
                 decoder_attention_conv_filters: int = None,
                 encoder_attention_conv_kernel: int = None,
                 decoder_attention_conv_kernel: int = None,
                 encoder_feed_forward_dimension: int = None,
                 decoder_feed_forward_dimension: int = None,
                 decoder_prenet_dropout: float = 0.5,
                 max_r: int = 10,
                 debug: bool = False,
                 **kwargs):
        """Create the sub-layers and input signatures of the autoregressive model.

        The constructor builds, in this order: constant start/end mel
        vectors, a text pipeline, a token embedding, two speaker modules,
        an encoder stack, a decoder prenet, a cross-attention decoder
        stack, a dense projection of ``mel_channels * max_r`` units and a
        postnet. It then stores the ``tf.TensorSpec`` lists for the
        training, forward, encoder and decoder calls and invokes
        ``self._apply_all_signatures()``.

        Args:
            encoder_model_dimension: ``model_dim`` of the encoder stack and
                last dimension of the first decoder ``TensorSpec``.
            decoder_model_dimension: ``model_dim`` of the decoder prenet
                and the decoder stack.
            encoder_num_heads: ``num_heads`` of the encoder stack.
            decoder_num_heads: ``num_heads`` of the decoder stack.
            encoder_maximum_position_encoding: ``maximum_position_encoding``
                of the encoder stack.
            decoder_maximum_position_encoding: ``maximum_position_encoding``
                of the decoder stack.
            encoder_dense_blocks: ``dense_blocks`` of the encoder stack.
            decoder_dense_blocks: ``dense_blocks`` of the decoder stack.
            encoder_prenet_dimension: Output size of the token embedding.
            decoder_prenet_dimension: ``dense_hidden_units`` of the decoder
                prenet.
            postnet_conv_filters: ``conv_filters`` of the postnet.
            postnet_conv_layers: ``conv_layers`` of the postnet.
            postnet_kernel_size: ``kernel_size`` of the postnet.
            dropout_rate: ``dropout_rate`` of both attention stacks.
            mel_start_value: Fill value of ``self.start_vec``.
            mel_end_value: Fill value of ``self.end_vec``.
            mel_channels: Width of the start/end vectors, factor of the
                final projection size, and last dimension of the mel
                ``TensorSpec`` entries.
            xvec_channels: Last dimension of the fourth/third/second
                ``TensorSpec`` in the training/forward/encoder signatures
                and of the last decoder ``TensorSpec``; only used for the
                signatures in this method.
            phoneme_language: Language forwarded to
                ``Pipeline.default_pipeline``.
            with_stress: Stress flag forwarded to
                ``Pipeline.default_pipeline``.
            encoder_attention_conv_filters: ``conv_filters`` of the encoder.
            decoder_attention_conv_filters: ``conv_filters`` of the decoder.
            encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
            decoder_attention_conv_kernel: ``conv_kernel`` of the decoder.
            encoder_feed_forward_dimension: ``feed_forward_dimension`` of
                the encoder stack.
            decoder_feed_forward_dimension: ``feed_forward_dimension`` of
                the decoder stack.
            decoder_prenet_dropout: ``dropout_rate`` of the decoder prenet.
            max_r: Stored as ``self.max_r`` and as the initial ``self.r``;
                multiplies ``mel_channels`` in the final projection size.
            debug: Stored as ``self.debug``; its consumer is outside this
                method.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super(AutoregressiveTransformer, self).__init__(**kwargs)
        # Constant (1, mel_channels) float32 tensors filled with the
        # start / end values.
        self.start_vec = tf.ones(
            (1, mel_channels), dtype=tf.float32) * mel_start_value
        self.end_vec = tf.ones(
            (1, mel_channels), dtype=tf.float32) * mel_end_value
        # Fixed index; presumably selects the stop class in a stop-token
        # output -- TODO confirm where it is read.
        self.stop_prob_index = 2
        self.max_r = max_r
        # self.r starts at max_r; any later change happens outside this method.
        self.r = max_r
        self.mel_channels = mel_channels
        # Initialised to 0 here; read elsewhere in the class (not visible).
        self.drop_n_heads = 0
        # Text front end; this model asks for start/end tokens.
        self.text_pipeline = Pipeline.default_pipeline(phoneme_language,
                                                       add_start_end=True,
                                                       with_stress=with_stress)
        # Embedding table sized by the tokenizer vocabulary.
        self.encoder_prenet = tf.keras.layers.Embedding(
            self.text_pipeline.tokenizer.vocab_size,
            encoder_prenet_dimension,
            name='Embedding')
        # NOTE(review): speaker-module dims are hard-coded (512 / 256) and
        # not derived from the encoder/decoder model dimensions -- confirm
        # they match the configured sizes.
        self.enc_speaker_mod = enc_Speaker_module(dim=512)
        self.dec_speaker_mod = dec_Speaker_module(dim=256)
        self.encoder = SelfAttentionBlocks(
            model_dim=encoder_model_dimension,
            dropout_rate=dropout_rate,
            num_heads=encoder_num_heads,
            feed_forward_dimension=encoder_feed_forward_dimension,
            maximum_position_encoding=encoder_maximum_position_encoding,
            dense_blocks=encoder_dense_blocks,
            conv_filters=encoder_attention_conv_filters,
            kernel_size=encoder_attention_conv_kernel,
            conv_activation='relu',
            name='Encoder')
        self.decoder_prenet = DecoderPrenet(
            model_dim=decoder_model_dimension,
            dense_hidden_units=decoder_prenet_dimension,
            dropout_rate=decoder_prenet_dropout,
            name='DecoderPrenet')
        # The decoder takes the kernel size as `conv_kernel`, whereas the
        # encoder takes it as `kernel_size`; it also requests causal padding.
        self.decoder = CrossAttentionBlocks(
            model_dim=decoder_model_dimension,
            dropout_rate=dropout_rate,
            num_heads=decoder_num_heads,
            feed_forward_dimension=decoder_feed_forward_dimension,
            maximum_position_encoding=decoder_maximum_position_encoding,
            dense_blocks=decoder_dense_blocks,
            conv_filters=decoder_attention_conv_filters,
            conv_kernel=decoder_attention_conv_kernel,
            conv_activation='relu',
            conv_padding='causal',
            name='Decoder')
        # The projection has mel_channels * max_r units.
        self.final_proj_mel = tf.keras.layers.Dense(self.mel_channels *
                                                    self.max_r,
                                                    name='FinalProj')
        self.decoder_postnet = Postnet(mel_channels=mel_channels,
                                       conv_filters=postnet_conv_filters,
                                       conv_layers=postnet_conv_layers,
                                       kernel_size=postnet_kernel_size,
                                       name='Postnet')

        # Training signature: int32 rank-2, float32 mel (rank-3), int32
        # rank-2, float32 rank-3 with xvec_channels last. Presumably token
        # ids, target mels, stop labels and speaker vectors -- TODO confirm.
        self.training_input_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
        ]
        # Forward signature: same as training minus the third (int32) entry.
        self.forward_input_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
        ]
        # Encoder-only signature: int32 rank-2 plus the xvec tensor.
        self.encoder_signature = [
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
        ]
        # Decoder-only signature: a tensor with encoder_model_dimension last,
        # the mel tensor, a rank-4 float tensor (presumably an attention
        # mask -- TODO confirm) and the xvec tensor.
        self.decoder_signature = [
            tf.TensorSpec(shape=(None, None, encoder_model_dimension),
                          dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
            tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
        ]
        self.debug = debug
        # Called last, after the signature lists and self.debug are set.
        self._apply_all_signatures()
コード例 #3
0
    # Lines left for training once the validation lines are set aside.
    train_len = metadata_len - test_len
    # Summary of what will be read, dropped and written.
    print(f'\nReading metadata from {metadatareader.metadata_path}')
    print(f'\nRemoving {len(remove_files)} lines out of {metadata_len_tot}.')
    print(f'\nMetadata contains {metadata_len} lines.')
    print(f'Files will be stored under {cm.data_dir}')
    print(f' - all: {phonemized_metadata_path}')
    print(f' - {train_len} training lines: {train_metadata_path}')
    print(f' - {test_len} validation lines: {test_metadata_path}')

    # Show the raw text of the sampled entries.
    print('\nMetadata samples:')
    for i in sample_items:
        print(f'{i}:{metadatareader.text_dict[i]}')

    # run cleaner on raw text
    text_proc = Pipeline.default_training_pipeline(
        cm.config['phoneme_language'],
        add_start_end=False,
        with_stress=cm.config['with_stress'])
    # Texts are collected in metadatareader.filenames order so that the
    # zip below pairs each filename with its own cleaned text.
    texts = [metadatareader.text_dict[k] for k in metadatareader.filenames]
    clean_texts = text_proc.cleaner(list(texts))
    clean_texts = dict(zip(metadatareader.filenames, clean_texts))
    key_list = list(clean_texts.keys())
    # Show the same sampled entries after cleaning.
    print('\nCleaned metadata samples:')
    for i in sample_items:
        print(f'{i}:{clean_texts[i]}')

    print('\nPHONEMIZING')
    batch_size = args.phonemizer_batch_size
    failed_files = []
    phonemized_data = {}
    # NOTE(review): the upper bound is len(key_list) + batch_size, so the
    # last iteration always slices an empty batch; confirm that the rest of
    # the loop body (not visible here) tolerates an empty batch_keys.
    for i in tqdm.tqdm(range(0, len(key_list) + batch_size, batch_size)):
        batch_keys = key_list[i:i + batch_size]
コード例 #4
0
# Load the data configuration that lives next to the other config files.
yaml = ruamel.yaml.YAML()
with open(str(Path(args.CONFIG) / 'data_config.yaml'), 'rb') as conf_yaml:
    config = yaml.load(conf_yaml)

# Derive every path from the configuration; fall back to the data directory
# when no separate training-data directory is configured.
args.DATA_DIR = config['data_directory']
args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
args.TARGET_DIR = config['train_data_directory']
if args.TARGET_DIR is None:
    args.TARGET_DIR = args.DATA_DIR

# exist_ok avoids the check-then-create race of a separate exists() test.
mel_dir = os.path.join(args.TARGET_DIR, 'mels')
os.makedirs(mel_dir, exist_ok=True)

phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
text_proc = Pipeline.default_pipeline(config['phoneme_language'], add_start_end=True)
if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
    print('Using cached phonemes.')
    audio_data = np.load(phon_path)
else:
    print('\nLoading and cleaning text')

    # One (filename, cleaned text) pair per metadata line: the first column
    # is the file name, the last column is the transcript.
    audio_data = []
    with open(args.META_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no entry; skip them instead of recording
            # a bogus ('\n', ...) pair.
            if not line.strip():
                continue
            fields = line.split(args.COLUMN_SEP)
            filename, text = fields[0], fields[-1]
            if filename.endswith('.wav'):
                # Drop only the trailing extension. The previous
                # split('.')[-1] kept the extension ('wav') and threw the
                # actual file name away.
                filename = filename[:-len('.wav')]
            text = text_proc.cleaner(text)
            audio_data.append((filename, text))
コード例 #5
0
# Derive the working paths from the configuration; fall back to the data
# directory when no separate training-data directory is configured.
args.META_FILE = os.path.join(args.DATA_DIR, config['metadata_filename'])
args.WAV_DIR = os.path.join(args.DATA_DIR, config['wav_subdir_name'])
args.TARGET_DIR = config['train_data_directory']
if args.TARGET_DIR is None:
    args.TARGET_DIR = args.DATA_DIR

mel_dir = os.path.join(args.TARGET_DIR, 'mels')
print(mel_dir)
print(args.TARGET_DIR)

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(mel_dir, exist_ok=True)

phon_path = os.path.join(args.TARGET_DIR, 'phonemes.npy')
print('phon_path:', phon_path)
# with_stress must be a real boolean: the string 'False' used previously is
# truthy, so any truthiness check on it would have treated stress as enabled.
text_proc = Pipeline.default_training_pipeline(config['phoneme_language'],
                                               add_start_end=True,
                                               with_stress=False)
if os.path.exists(phon_path) and not args.RECOMPUTE_PHON:
    print('Using cached phonemes.')
    audio_data = np.load(phon_path)
else:
    print('\nLoading and cleaning text')

    # One (filename, cleaned text) pair per CSV row: column 0 is the file
    # name, column 1 is the transcript.
    audio_data = []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(args.META_FILE, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            # csv.reader yields [] for blank lines; indexing such a row
            # would raise IndexError.
            if not row:
                continue
            filename, text = row[0], row[1]
            text = text_proc.cleaner(text)
            audio_data.append((filename, text))
    audio_data = np.array(audio_data)