def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
             n_mel_channels, symbol_set='english_basic', n_speakers=1,
             load_mel_from_disk=True, max_wav_value=None, sampling_rate=None,
             filter_length=None, hop_length=None, win_length=None,
             mel_fmin=None, mel_fmax=None, **ignored):
    """Resolve the audio/text file list and, when mels are not precomputed
    on disk, build the Tacotron STFT used to derive them from raw audio.

    Extra keyword arguments are accepted and ignored so a shared config
    dict can be splatted into several dataset constructors.
    """
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text,
        has_speakers=(n_speakers > 1))
    self.load_mel_from_disk = load_mel_from_disk
    # Audio-processing state is only needed when mels must be computed
    # on the fly; when loading precomputed mels these attributes are unset.
    if not load_mel_from_disk:
        self.max_wav_value = max_wav_value
        self.sampling_rate = sampling_rate
        self.stft = layers.TacotronSTFT(
            filter_length, hop_length, win_length,
            n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
def __init__(self, dataset_path, audiopaths_and_text, args):
    """Audio-segment dataset: load the file list, build the STFT used to
    compute mels from audio, and shuffle the paths deterministically.

    Args:
        dataset_path: Root directory the listed paths are relative to.
        audiopaths_and_text: Filelist of audio paths (and text).
        args: Namespace carrying audio/STFT hyper-parameters.
    """
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate,
        args.mel_fmin, args.mel_fmax)
    self.segment_length = args.segment_length
    # Fixed seed so the shuffled ordering is reproducible across runs.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, dataset_path, audiopaths_and_text, args,
             load_mel_from_disk=True):
    """Text/mel dataset: load the filelist and text cleaners; when mels
    are not precomputed on disk, also set up audio-to-mel conversion.

    Args:
        dataset_path: Root directory the listed paths are relative to.
        audiopaths_and_text: Filelist of audio paths and transcripts.
        args: Namespace carrying text/audio hyper-parameters.
        load_mel_from_disk: If True, mels are read from files and no
            STFT machinery is constructed.
    """
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)
    self.text_cleaners = args.text_cleaners
    self.load_mel_from_disk = load_mel_from_disk
    # Only build audio-processing state when mels must be computed.
    if not load_mel_from_disk:
        self.max_wav_value = args.max_wav_value
        self.sampling_rate = args.sampling_rate
        self.stft = layers.TacotronSTFT(
            args.filter_length, args.hop_length, args.win_length,
            args.n_mel_channels, args.sampling_rate,
            args.mel_fmin, args.mel_fmax)
def __init__(self, audiopaths_and_text, args):
    """Text/mel dataset (no dataset-root variant): load the filelist,
    build the STFT, and shuffle the entries deterministically.

    Args:
        audiopaths_and_text: Filelist of audio paths and transcripts.
        args: Namespace carrying text/audio hyper-parameters.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = args.text_cleaners
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.load_mel_from_disk = args.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate,
        args.mel_fmin, args.mel_fmax)
    # Fixed seed so the shuffled ordering is reproducible across runs.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, dataset_path, audiopaths_and_text, segment_length,
             n_mel_channels, max_wav_value, sampling_rate, filter_length,
             hop_length, win_length, mel_fmin, mel_fmax, args):
    """Audio-segment dataset with explicit audio parameters.

    Args:
        dataset_path: Root directory the listed paths are relative to.
        audiopaths_and_text: Filelist of audio paths (and text).
        segment_length: Length of the audio segment served per item.
        n_mel_channels, max_wav_value, sampling_rate, filter_length,
        hop_length, win_length, mel_fmin, mel_fmax: STFT/mel parameters.
        args: Unused here; kept for call-site compatibility.
    """
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)
    self.max_wav_value = max_wav_value
    self.sampling_rate = sampling_rate
    self.stft = layers.TacotronSTFT(
        filter_length, hop_length, win_length,
        n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
    self.segment_length = segment_length
    # Fixed seed so the shuffled ordering is reproducible across runs.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, data_utterance_paths, hparams):
    """Data loader for the PPG->Mel task (with speaker d-vectors).

    Either loads previously cached (ppg, acoustic, dvec) features from a
    pickle file, or extracts them from every utterance and optionally
    writes the cache back to disk.

    Args:
        data_utterance_paths: A text file containing a list of file paths.
        hparams: The hyper-parameters.
    """
    self.data_utterance_paths = load_filepaths(data_utterance_paths)
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.is_full_ppg = hparams.is_full_ppg
    self.is_append_f0 = hparams.is_append_f0
    self.is_cache_feats = hparams.is_cache_feats
    self.load_feats_from_disk = hparams.load_feats_from_disk
    self.feats_cache_path = hparams.feats_cache_path
    self.ppg_subsampling_factor = hparams.ppg_subsampling_factor
    self.ppg_deps = DependenciesPPG()
    self.encoder_model_fpath = hparams.encoder_model_fpath
    # Loading and re-writing the same cache in one run would clobber it.
    if self.is_cache_feats and self.load_feats_from_disk:
        raise ValueError('If you are loading feats from the disk, do not '
                         'rewrite them back!')
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_acoustic_feat_dims, hparams.sampling_rate,
        hparams.mel_fmin, hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.data_utterance_paths)
    self.ppg_sequences = []
    self.acoustic_sequences = []
    self.dvec_sequences = []
    if self.load_feats_from_disk:
        print('Loading data from %s.' % self.feats_cache_path)
        with open(self.feats_cache_path, 'rb') as f:
            cached = pickle.load(f)
        self.ppg_sequences = cached[0]
        self.acoustic_sequences = cached[1]
        self.dvec_sequences = cached[2]
    else:
        # Extract a (ppg, acoustic, dvec) triple from each utterance.
        for utt_path in self.data_utterance_paths:
            triple = self.extract_utterance_feats_spkr(
                utt_path, self.is_full_ppg)
            self.ppg_sequences.append(triple[0].astype(np.float32))
            self.acoustic_sequences.append(triple[1])
            self.dvec_sequences.append(triple[2])
        if self.is_cache_feats:
            print('Caching data to %s.' % self.feats_cache_path)
            with open(self.feats_cache_path, 'wb') as f:
                pickle.dump([self.ppg_sequences,
                             self.acoustic_sequences,
                             self.dvec_sequences], f)
def __init__(self, data_utterance_paths, cache_path, hparams, bs, loop):
    """Data loader for the PPG->Mel task with incremental feature caching.

    Extraction is split into batches: each (loop, bs) invocation extracts
    a 5-utterance slice, appending to the features already accumulated in
    the cache by earlier batches, then writes the cache back.

    Args:
        data_utterance_paths: A text file containing a list of file paths.
        cache_path: Pickle file used to accumulate extracted features.
        hparams: The hyper-parameters.
        bs: 1-based batch index within the current loop.
        loop: 1-based loop index; each loop covers 20 utterances.
    """
    self.data_utterance_paths = load_filepaths(data_utterance_paths)
    self.max_wav_value = 32768.0
    self.sampling_rate = hparams.sampling_rate
    self.is_full_ppg = True
    self.is_append_f0 = False
    self.is_cache_feats = True
    self.feats_cache_path = cache_path
    self.ppg_subsampling_factor = 1
    self.ppg_deps = DependenciesPPG()
    # 20 utterances per loop = n(4) batches * b(5) utterances each.
    self.n = int(bs) - 1
    self.b = 5
    self.l = int(loop) - 1
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_acoustic_feat_dims, self.sampling_rate,
        hparams.mel_fmin, hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.data_utterance_paths)
    if self.n > 0:
        # Later batches resume from features cached by earlier batches.
        with open(self.feats_cache_path, 'rb') as f:
            cached = pickle.load(f)
        self.ppg_sequences = cached[0]
        self.acoustic_sequences = cached[1]
    else:
        self.ppg_sequences = []
        self.acoustic_sequences = []
    # Slice of the shuffled path list this invocation is responsible for.
    start = self.n * self.b + self.l * 20
    stop = (self.n + 1) * self.b + self.l * 20
    for utt_path in self.data_utterance_paths[start:stop]:
        pair = self.extract_utterance_feats(utt_path, self.is_full_ppg)
        self.ppg_sequences.append(pair[0].astype(np.float32))
        self.acoustic_sequences.append(pair[1])
    if self.is_cache_feats:
        print('Caching data to %s.' % self.feats_cache_path)
        with open(self.feats_cache_path, 'wb+') as f:
            pickle.dump([self.ppg_sequences, self.acoustic_sequences], f)
def __init__(self, dataset_path, audiopaths_and_text, args, speaker_ids=None):
    """Multi-speaker text/mel dataset: load the filelist, build the STFT,
    shuffle entries, and resolve the speaker-id lookup table.

    Args:
        dataset_path: Root directory the listed paths are relative to.
        audiopaths_and_text: Filelist of audio paths and transcripts.
        args: Namespace carrying text/audio hyper-parameters.
        speaker_ids: Optional precomputed speaker lookup table; derived
            from the filelist when None.
    """
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text)
    self.text_cleaners = args.text_cleaners
    self.max_wav_value = args.max_wav_value
    self.sampling_rate = args.sampling_rate
    self.load_mel_from_disk = args.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        args.filter_length, args.hop_length, args.win_length,
        args.n_mel_channels, args.sampling_rate,
        args.mel_fmin, args.mel_fmax)
    # Fixed seed so the shuffled ordering is reproducible across runs.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    self.speaker_ids = speaker_ids
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
def __init__(self, data_utterance_paths, hparams):
    """Data loader for the PPG->Mel task.

    Only supports loading pre-extracted (ppg, acoustic) pairs from a
    pickle file; prints a warning when loading from disk is disabled.

    Args:
        data_utterance_paths: Path to the pickled feature pairs.
        hparams: The hyper-parameters.
    """
    self.data_utterance_paths = data_utterance_paths
    self.load_feats_from_disk = hparams.load_feats_from_disk
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.is_full_ppg = hparams.is_full_ppg
    self.is_append_f0 = hparams.is_append_f0
    self.ppg_subsampling_factor = hparams.ppg_subsampling_factor
    self.ppg_deps = DependenciesPPG()
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_acoustic_feat_dims, hparams.sampling_rate,
        hparams.mel_fmin, hparams.mel_fmax)
    random.seed(hparams.seed)
    # Load the (ppg, acoustic) feature pairs from disk.
    self.ppg_sequences = []
    self.acoustic_sequences = []
    if self.load_feats_from_disk:
        fpath = self.data_utterance_paths
        with open(fpath, 'rb') as f:
            data = pickle.load(f)
        self.ppg_sequences = data[0]
        self.acoustic_sequences = data[1]
        print("Number of data:", len(self.ppg_sequences))
    else:
        print("No data!")
def __init__(self,
             dataset_path,
             audiopaths_and_text,
             text_cleaners,
             n_mel_channels,
             symbol_set='english_basic',
             p_arpabet=1.0,
             n_speakers=1,
             load_mel_from_disk=True,
             load_pitch_from_disk=True,
             pitch_mean=214.72203,  # LJSpeech defaults
             pitch_std=65.72038,
             max_wav_value=None,
             sampling_rate=None,
             filter_length=None,
             hop_length=None,
             win_length=None,
             mel_fmin=None,
             mel_fmax=None,
             prepend_space_to_text=False,
             append_space_to_text=False,
             pitch_online_dir=None,
             betabinomial_online_dir=None,
             use_betabinomial_interpolator=True,
             pitch_online_method='pyin',
             **ignored):
    """FastPitch-style dataset: text, mel, pitch, and alignment priors.

    Validates the filelist column layout against the configured inputs
    and sets up text processing, pitch normalization, and (optionally)
    on-the-fly mel/pitch/betabinomial computation.

    Fix: the multi-line assert message below was a broken string literal
    (raw newline inside single quotes); repaired via implicit string
    concatenation. Also replaced `type(x) is T` checks with isinstance.
    """
    # Expect a list of filenames; accept a single path for convenience.
    if isinstance(audiopaths_and_text, str):
        audiopaths_and_text = [audiopaths_and_text]

    self.dataset_path = dataset_path
    self.audiopaths_and_text = load_filepaths_and_text(
        dataset_path, audiopaths_and_text,
        has_speakers=(n_speakers > 1))
    self.load_mel_from_disk = load_mel_from_disk
    # STFT machinery is only needed when mels are computed from audio.
    if not load_mel_from_disk:
        self.max_wav_value = max_wav_value
        self.sampling_rate = sampling_rate
        self.stft = layers.TacotronSTFT(
            filter_length, hop_length, win_length,
            n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
    self.load_pitch_from_disk = load_pitch_from_disk

    self.prepend_space_to_text = prepend_space_to_text
    self.append_space_to_text = append_space_to_text

    assert p_arpabet == 0.0 or p_arpabet == 1.0, (
        'Only 0.0 and 1.0 p_arpabet is currently supported. '
        'Variable probability breaks caching of betabinomial matrices.')

    self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet)
    self.n_speakers = n_speakers
    self.pitch_tmp_dir = pitch_online_dir
    self.f0_method = pitch_online_method
    self.betabinomial_tmp_dir = betabinomial_online_dir
    self.use_betabinomial_interpolator = use_betabinomial_interpolator

    if use_betabinomial_interpolator:
        self.betabinomial_interpolator = BetaBinomialInterpolator()

    # <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]
    expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))

    # Pitch comes either from disk or is computed online — never both.
    assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)

    if len(self.audiopaths_and_text[0]) < expected_columns:
        raise ValueError(
            f'Expected {expected_columns} columns in audiopaths file. '
            'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')

    if len(self.audiopaths_and_text[0]) > expected_columns:
        print('WARNING: Audiopaths file has more columns than expected')

    # Normalization stats may arrive as plain floats or as tensors.
    to_tensor = lambda x: torch.Tensor([x]) if isinstance(x, float) else x
    self.pitch_mean = to_tensor(pitch_mean)
    self.pitch_std = to_tensor(pitch_std)