# NOTE(review): whitespace-mangled chunk — a module top collapsed onto one physical
# line (original newlines stripped). Code below is kept byte-identical. Visible
# content, in order:
#   * imports: utility, autovc.model_vc.Generator, config.Config,
#     data_converter.Converter, parallel_wavegan.utils.read_hdf5,
#     autovc.synthesis.{build_model_melgan, melgan}
#   * module-level side effects: select CUDA when available (printing the GPU
#     name), then instantiate a Converter on that device
#   * two late imports (yaml, parallel_wavegan download/load helpers) — PEP 8
#     wants all imports grouped at the top of the file
#   * the start of `def logmelfilterbank(...)`; the signature is truncated here
#     (no closing paren or body visible), so it cannot be reviewed from this chunk
# NOTE(review): `torch` is used but no `import torch` appears in this chunk —
# presumably imported elsewhere in the file; verify.
import utility from autovc.model_vc import Generator from config import Config from data_converter import Converter from parallel_wavegan.utils import read_hdf5 from autovc.synthesis import build_model_melgan, melgan device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device.type == "cuda": print(torch.cuda.get_device_name(0)) converter = Converter(device) import yaml from parallel_wavegan.utils import download_pretrained_model, load_model def logmelfilterbank(audio, sampling_rate, fft_size=1024, hop_size=256, win_length=None, window="hann", num_mels=80, fmin=None, fmax=None,
# NOTE(review): whitespace-mangled chunk collapsed onto one physical line; code
# kept byte-identical. It begins mid-expression — the tail of an argparse help
# f-string — so the opening of that statement is outside this view. Visible flow:
#   * parse CLI args, build a "%Y%m%d_%H;%M" timestamp, select CUDA/CPU device
#   * pick a converter by args.spectrogram_type: Converter for "standard",
#     MelganConverter (melgan config/stats paths from Config) for "melgan"
#   * generate training metadata only if it does not already exist; otherwise
#     log a warning (the message says it will exit, but the exit call itself is
#     not visible here, and the final log.warning(...) call is truncated —
#     no closing paren in this chunk)
# NOTE(review): the embedded `#====...====` section markers were standalone
# comment lines pre-collapse; in this collapsed form everything after the first
# `#` on the line is dead text.
f" |-embedding.npy (single (speaker) embedding per speaker)\n" f" |-...\n") args = parser.parse_args() cur_time = datetime.datetime.now().strftime("%Y%m%d_%H;%M") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #==============================Dir settings========================== input_dir = args.input_dir output_dir = args.output_dir #===========================Converter (+output dir addition)================================ if args.spectrogram_type == "standard": log.info("Using the default AutoVC spectrogram creator") converter = Converter(device) elif args.spectrogram_type == "melgan": log.info("Using a melgan spectrogram-converter for dataset generator") converter = MelganConverter(device, Config.dir_paths["melgan_config_path"], Config.dir_paths["melgan_stats_path"]) #===============================create metadata (if it does not exist already)==================== if not os.path.exists(os.path.join( output_dir, Config.train_metadata_name)): #if metadata doesnt already exist _ = converter.generate_train_data(input_dir, output_dir, Config.train_metadata_name) else: log.warning( f" ATTENTION: metadata already exists at: {os.path.join(output_dir, Config.train_metadata_name)}, now exiting..."
# NOTE(review): whitespace-mangled chunk — a module top collapsed onto one
# physical line; code kept byte-identical. Near-duplicate of another chunk in
# this file, with extra imports. Visible content, in order:
#   * imports: librosa mel filterbank, numpy RandomState, sklearn StandardScaler,
#     utility, autovc.model_vc.Generator, config.Config, data_converter.Converter,
#     parallel_wavegan.utils.read_hdf5
#   * module-level side effects: select CUDA when available (printing the GPU
#     name), then instantiate a Converter on that device
#   * late imports (yaml, parallel_wavegan download/load helpers) — PEP 8 wants
#     these grouped at the top
#   * the start of `def logmelfilterbank(...)` with an inline TODO about scaling
#     fft_size with sampling rate; the signature is truncated after `fmin=None,`
#     so the function cannot be reviewed from this chunk
# NOTE(review): `torch` is used but no `import torch` appears in this chunk —
# presumably imported elsewhere in the file; verify.
from librosa.filters import mel as librosa_mel_fn from numpy.random import RandomState from sklearn.preprocessing import StandardScaler import utility from autovc.model_vc import Generator from config import Config from data_converter import Converter from parallel_wavegan.utils import read_hdf5 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device.type == "cuda": print(torch.cuda.get_device_name(0)) converter = Converter(device) import yaml from parallel_wavegan.utils import download_pretrained_model, load_model def logmelfilterbank( audio, sampling_rate, fft_size=1024, #TODO: scale this based on sampling rate as well? Otherwise each fft-frame (and spectrogram entry) is less time (e.g. at 22khz, 1024 samples is 0.05sec, while at 44khz its 0.1 sec) hop_size=256, win_length=None, window="hann", num_mels=80, fmin=None,
# Smoke test: round-trip a single WAV through the spectrogram pipeline
# (wav -> mel spectrogram -> wav) for one of Wouter's recordings, writing the
# result back out via the converter. Restored to conventional one-statement-per-
# line formatting; behavior is unchanged.
source_list = ["p225_001"]
target_speaker = "Wouter"
target_list = ["1", "2", "3", "4", "5", "6", "7"]

# directories
input_dir = Config.dir_paths["input"]
converted_data_dir = Config.dir_paths["metadata"]
output_file_dir = Config.dir_paths["output"]
metadata_name = Config.metadata_name

# Prefer the GPU when one is available; report which device was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))

converter = Converter(device)

# Build spectrograms for every wav under input_dir, cached in spec_dir.
# NOTE(review): _wav_to_spec is a private Converter method — consider a public API.
spec_dir = Config.dir_paths["spectrograms"]
specs = converter._wav_to_spec(input_dir, spec_dir)

# Recording "6" = "This is a test sentence"; convert just that one back to audio.
spect_convert_list = [('Wouter_test_wav_to_spect_to_wav', specs["Wouter"]["6"])]

# input_data = converter.wav_to_input(input_dir, source_speaker, target_speaker, source_list, target_list, converted_data_dir, metadata_name)
converter.output_to_wav(spect_convert_list)
print("Done")

# input_data = converter.wav_to_input(input_dir, source_speaker, target_speaker, source_list, target_list, converted_data_dir, metadata_name)
# output_data = inference(output_file_dir, device, input_data=input_data)
# NOTE(review): whitespace-mangled chunk collapsed onto one physical line; code
# kept byte-identical. It begins inside an if/elif chain over args.vocoder — the
# opening `if` branch (presumably "griffin", given the first output-dir suffix)
# is outside this view. Visible flow:
#   * per-vocoder setup: "wavenet" loads a pretrained WaveNet from Config paths;
#     "melgan" sets spectrogram_type = "melgan" and builds a MelGan; each branch
#     appends the vocoder name to output_file_dir
#   * sample rate defaults to 16000; the melgan spectrogram path bumps it to
#     24000 and swaps Converter for MelganConverter (melgan config/stats paths)
#   * preprocessing of existing files is skipped unless args.force_preprocess;
#     finally builds conversion input via converter.wav_to_convert_input(...)
#     with the caller-supplied len_crop
# NOTE(review): `spectrogram_type` is only assigned in the melgan vocoder branch
# here, yet is compared against "standard" below — its default must be set above
# this chunk; verify.
output_file_dir = os.path.join(output_file_dir, "griffin") elif args.vocoder == "wavenet": from vocoders import WaveNet vocoder_path = os.path.join(Config.dir_paths["networks"], Config.pretrained_names["wavenet"]) vocoder = WaveNet(device, vocoder_path) output_file_dir = os.path.join(output_file_dir, "wavenet") elif args.vocoder == "melgan": from vocoders import MelGan spectrogram_type = "melgan" vocoder = MelGan(device) output_file_dir = os.path.join(output_file_dir, "melgan") sr = 16000 if spectrogram_type == "standard": converter = Converter(device) elif spectrogram_type == "melgan": sr = 24000 converter = MelganConverter(device, Config.dir_paths["melgan_config_path"], Config.dir_paths["melgan_stats_path"]) skip = not args.force_preprocess input_data = converter.wav_to_convert_input(input_dir, source_speaker, target_speaker, source_list, converted_data_dir, metadata_name, skip_existing=skip, len_crop=args.len_crop)