def create_spectro(self, item: AudioItem):
    """Compute a spectrogram (or MFCC) for `item` according to the config."""
    if self.config.mfcc:
        mel = MFCC(sample_rate=item.sr, n_mfcc=self.config.sg_cfg.n_mfcc,
                   melkwargs=self.config.sg_cfg.mel_args())(item.sig)
    elif self.config.sg_cfg.custom_spectro is not None:
        mel = self.config.sg_cfg.custom_spectro(item.sig)
    elif self.config.sg_cfg.n_mels > 0:
        # librosa path: mel spectrogram computed on the first channel
        c = self.config.sg_cfg
        mel = librosa.feature.melspectrogram(y=np.array(item.sig[0, :]),
                                             sr=item.sr, fmax=c.f_max,
                                             fmin=c.f_min, **c.mel_args())
        mel = torch.from_numpy(mel)
        mel.unsqueeze_(0)
    else:
        mel = Spectrogram(**self.config.sg_cfg.spectro_args())(item.sig)
    if self.config.sg_cfg.to_db_scale:
        mel = AmplitudeToDB(top_db=self.config.sg_cfg.top_db)(mel)
    mel = mel.detach()
    if self.config.standardize:
        mel = standardize(mel)
    if self.config.delta:
        # stack the spectrogram with its first- and second-order deltas
        mel = torch.cat([torch.stack([m, torchdelta(m), torchdelta(m, order=2)])
                         for m in mel])
    return mel
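# --- Usage sketch (not from the original source): a condensed, torchaudio-only
# equivalent of the mel + dB path above, assuming a mono signal of shape
# (1, time); all parameter values here are illustrative.
import torch
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

def simple_spectro(sig: torch.Tensor, sr: int = 16000) -> torch.Tensor:
    mel = MelSpectrogram(sample_rate=sr, n_fft=1024, n_mels=128)(sig)
    return AmplitudeToDB(top_db=80)(mel).detach()

# simple_spectro(torch.randn(1, 16000)).shape  # -> torch.Size([1, 128, 32])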
def __init__(self, sample_rate, n_fft, top_db, max_perc):
    super().__init__()
    self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
    self.stft = Spectrogram(n_fft=n_fft, power=None)  # keep the complex STFT
    self.com_norm = ComplexNorm(power=2.)             # complex -> power spectrogram
    self.fm = FrequencyMasking(100)
    self.tm = TimeMasking(100)
    self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
    self.AtoDB = AmplitudeToDB(top_db=top_db)
    self.max_perc = max_perc
    self.sample_rate = sample_rate
    # Resamplers for speed perturbation, factors 0.6x to 1.4x
    self.resamples = [Resample(sample_rate, sample_rate * f)
                      for f in (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)]
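# --- Sketch (assumption, not the original forward): one plausible way the
# transforms initialised above chain together — TimeStretch acts on the
# complex STFT, the maskings on the power spectrogram. `aug` is an instance
# of the class above.
import random

def augment_forward(aug, waveform):
    rate = 1.0 + random.uniform(-aug.max_perc, aug.max_perc)
    stft = aug.stft(waveform)            # complex spectrogram (power=None)
    stft = aug.time_stretch(stft, rate)  # phase-vocoder time stretch
    spec = aug.com_norm(stft)            # complex -> power spectrogram
    spec = aug.fm(spec)                  # random frequency masking
    spec = aug.tm(spec)                  # random time masking
    return aug.AtoDB(spec)               # log-amplitude output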
sr = 16000           # sampling rate
min_level_db = -100  # reference values to normalize data
ref_level_db = 20
shape = 24           # length of time axis of split spectrograms to feed to generator
vec_len = 128        # length of vector generated by the siamese network
bs = 128             # batch size
delta = 2.           # constant for siamese loss
tag = 'HAP'          # the tag for the training

"""#helper functions"""

torch.set_default_tensor_type('torch.cuda.FloatTensor')

# MEL-SPECTRUM
print("finally start...")
# `hop` (the STFT hop length) is defined elsewhere in the script
specobj = Spectrogram(n_fft=6 * hop, win_length=6 * hop, hop_length=hop,
                      pad=0, power=2, normalized=True)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
    specgram = specfunc(waveform)
    mel_specgram = melfunc(specgram)
    return mel_specgram

def spectral_convergence(input, target):
    return 20 * ((input - target).norm().log10() - target.norm().log10())

def GRAD(spec, transform_fn, samples=None, init_x0=None, maxiter=1000,
         tol=1e-6, verbose=1, evaiter=10, lr=0.003):
    spec = torch.Tensor(spec)
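# --- Sketch (assumption, not the repo's code): GRAD() above is truncated;
# its core idea is gradient-based spectrogram inversion — optimise a waveform
# x so that transform_fn(x) matches the target spectrogram.
def invert_spectrogram(spec, transform_fn, length, maxiter=1000, lr=0.003):
    x = torch.randn(1, length, requires_grad=True)
    optimizer = torch.optim.Adam([x], lr=lr)
    criterion = torch.nn.L1Loss()
    for _ in range(maxiter):
        optimizer.zero_grad()
        loss = criterion(transform_fn(x), spec)
        loss.backward()
        optimizer.step()
    return x.detach()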
def __init__(self, nfft):
    self.spectro = Spectrogram(nfft, normalized=True, power=2)
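# --- Shape note (not in the original): with power=2, Spectrogram maps a
# waveform of shape (..., time) to a power spectrogram of shape
# (..., nfft // 2 + 1, n_frames).
import torch
from torchaudio.transforms import Spectrogram

print(Spectrogram(400, normalized=True, power=2)(torch.randn(1, 16000)).shape)
# -> torch.Size([1, 201, 81])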
def ex_waveform_spectro():
    dataset = load_dataset("train",
                           _DEFAULT_COMMONVOICE_ROOT,
                           _DEFAULT_COMMONVOICE_VERSION)

    # Take one of the waveforms
    idx = 10
    waveform, rate, dictionary = dataset[idx]

    # Extract 2 s of audio, from the 1 s mark to the 3 s mark
    n_begin = rate
    n_end = 3 * rate
    waveform = waveform[:, n_begin:n_end]  # B, T

    nfft = int(_DEFAULT_WIN_LENGTH * 1e-3 * _DEFAULT_RATE)
    # nmels = _DEFAULT_NUM_MELS
    nstep = int(_DEFAULT_WIN_STEP * 1e-3 * _DEFAULT_RATE)
    trans_spectro = nn.Sequential(
        Spectrogram(n_fft=nfft, hop_length=nstep),
        AmplitudeToDB()
    )
    spectro = trans_spectro(waveform)  # B, n_freq, T

    trans_mel_spectro = WaveformProcessor(rate=rate,
                                          win_length=_DEFAULT_WIN_LENGTH * 1e-3,
                                          win_step=_DEFAULT_WIN_STEP * 1e-3,
                                          nmels=_DEFAULT_NUM_MELS,
                                          augment=False,
                                          spectro_normalization=None)
    mel_spectro = trans_mel_spectro(waveform.transpose(0, 1))  # T, B, n_mels
    plot_spectro(mel_spectro[:, 0, :], [], _DEFAULT_WIN_STEP * 1e-3, CharMap())

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))

    ax = axes[0]
    ax.plot([i / rate for i in range(n_begin, n_end)], waveform[0])
    ax.set_xlabel('Time (s.)')
    ax.set_ylabel('Amplitude')
    ax.set_title('Waveform')

    ax = axes[1]
    im = ax.imshow(spectro[0],
                   extent=[n_begin / rate, n_end / rate, 0, spectro.shape[1]],
                   aspect='auto', cmap='magma', origin='lower')
    ax.set_ylabel('Frequency bins')
    ax.set_xlabel('Time (s.)')
    ax.set_title("Spectrogram (dB)")
    fig.colorbar(im, ax=ax)

    ax = axes[2]
    # y-extent is n_mels, the last dim of the (T, B, n_mels) tensor
    im = ax.imshow(mel_spectro[:, 0, :].T,
                   extent=[n_begin / rate, n_end / rate, 0, mel_spectro.shape[2]],
                   aspect='auto', cmap='magma', origin='lower')
    ax.set_ylabel('Mel scales')
    ax.set_xlabel('Time (s.)')
    ax.set_title("Mel-Spectrogram (dB)")
    fig.colorbar(im, ax=ax)

    plt.tight_layout()
    plt.savefig("waveform_to_spectro.png")
    plt.show()
def __init__(self, train_loader, test_loader, valid_loader, general_args,
             trainer_args):
    super(GanTrainer, self).__init__(train_loader, test_loader, valid_loader,
                                     general_args)

    # Paths
    self.loadpath = trainer_args.loadpath
    self.savepath = trainer_args.savepath

    # Load the auto-encoder
    self.use_autoencoder = False
    if trainer_args.autoencoder_path and os.path.exists(trainer_args.autoencoder_path):
        self.use_autoencoder = True
        self.autoencoder = AutoEncoder(general_args=general_args).to(self.device)
        self.load_pretrained_autoencoder(trainer_args.autoencoder_path)
        self.autoencoder.eval()

    # Load the generator
    self.generator = Generator(general_args=general_args).to(self.device)
    if trainer_args.generator_path and os.path.exists(trainer_args.generator_path):
        self.load_pretrained_generator(trainer_args.generator_path)

    self.discriminator = Discriminator(general_args=general_args).to(self.device)

    # Optimizers and schedulers
    self.generator_optimizer = torch.optim.Adam(
        params=self.generator.parameters(), lr=trainer_args.generator_lr)
    self.discriminator_optimizer = torch.optim.Adam(
        params=self.discriminator.parameters(), lr=trainer_args.discriminator_lr)
    self.generator_scheduler = lr_scheduler.StepLR(
        optimizer=self.generator_optimizer,
        step_size=trainer_args.generator_scheduler_step,
        gamma=trainer_args.generator_scheduler_gamma)
    self.discriminator_scheduler = lr_scheduler.StepLR(
        optimizer=self.discriminator_optimizer,
        step_size=trainer_args.discriminator_scheduler_step,
        gamma=trainer_args.discriminator_scheduler_gamma)

    # Load saved states
    if os.path.exists(self.loadpath):
        self.load()

    # Loss functions and stored losses
    self.adversarial_criterion = nn.BCEWithLogitsLoss()
    self.generator_time_criterion = nn.MSELoss()
    self.generator_frequency_criterion = nn.MSELoss()
    self.generator_autoencoder_criterion = nn.MSELoss()

    # Define labels
    self.real_label = 1
    self.generated_label = 0

    # Loss scaling factors
    self.lambda_adv = trainer_args.lambda_adversarial
    self.lambda_freq = trainer_args.lambda_freq
    self.lambda_autoencoder = trainer_args.lambda_autoencoder

    # Spectrogram converter
    self.spectrogram = Spectrogram(normalized=True).to(self.device)

    # Boolean indicating if the model needs to be saved
    self.need_saving = True

    # Boolean indicating whether the generator receives feedback from the discriminator
    self.use_adversarial = trainer_args.use_adversarial
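# --- Sketch (assumption, not the original training loop): one way the
# criteria, labels and optimizers initialised above combine into a single
# GAN step; `noisy_batch` and `clean_batch` are hypothetical batch tensors.
def gan_step(self, noisy_batch, clean_batch):
    # Discriminator: push real samples toward real_label, generated toward generated_label
    self.discriminator_optimizer.zero_grad()
    generated = self.generator(noisy_batch)
    pred_real = self.discriminator(clean_batch)
    pred_fake = self.discriminator(generated.detach())
    d_loss = (self.adversarial_criterion(pred_real, torch.full_like(pred_real, self.real_label))
              + self.adversarial_criterion(pred_fake, torch.full_like(pred_fake, self.generated_label)))
    d_loss.backward()
    self.discriminator_optimizer.step()

    # Generator: fool the discriminator while staying close to the target
    self.generator_optimizer.zero_grad()
    pred_fake = self.discriminator(generated)
    g_loss = (self.lambda_adv * self.adversarial_criterion(pred_fake, torch.full_like(pred_fake, self.real_label))
              + self.generator_time_criterion(generated, clean_batch))
    g_loss.backward()
    self.generator_optimizer.step()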
def __init__(self, train_loader, test_loader, valid_loader, general_args):
    # Device
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Data generators
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.test_loader = test_loader

    # Iterators to cycle over the datasets
    self.train_loader_iter = cycle(iter(self.train_loader))
    self.valid_loader_iter = cycle(iter(self.valid_loader))
    self.test_loader_iter = cycle(iter(self.test_loader))

    # Epoch counter
    self.epoch = 0

    # Stored losses
    self.train_losses = {
        'time_l2': [],
        'freq_l2': [],
        'autoencoder_l2': [],
        'generator_adversarial': [],
        'discriminator_adversarial': {'real': [], 'fake': []}
    }
    self.test_losses = {
        'time_l2': [],
        'freq_l2': [],
        'autoencoder_l2': [],
        'generator_adversarial': [],
        'discriminator_adversarial': {'real': [], 'fake': []}
    }
    self.valid_losses = {
        'time_l2': [],
        'freq_l2': [],
        'autoencoder_l2': [],
        'generator_adversarial': [],
        'discriminator_adversarial': {'real': [], 'fake': []}
    }

    # Time to frequency converter
    self.spectrogram = Spectrogram(normalized=True, n_fft=512,
                                   hop_length=128).to(self.device)
    self.amplitude_to_db = AmplitudeToDB()

    # Boolean indicating whether this trainer wraps an auto-encoder or a generator
    self.is_autoencoder = False

    # Boolean indicating if the model needs to be saved
    self.need_saving = True

    # Set the pseudo-epochs
    self.train_batches_per_epoch = general_args.train_batches_per_epoch
    self.test_batches_per_epoch = general_args.test_batches_per_epoch
    self.valid_batches_per_epoch = general_args.valid_batches_per_epoch
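# --- Sketch (assumption, not the original source): how the converters above
# typically feed the 'freq_l2' term — compare batches in log-magnitude space.
def frequency_loss(self, generated, target):
    gen_db = self.amplitude_to_db(self.spectrogram(generated))
    tgt_db = self.amplitude_to_db(self.spectrogram(target))
    return torch.nn.functional.mse_loss(gen_db, tgt_db)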
parser.add_argument('-dir', '--dataset-dir', type=str)
parser.add_argument('-e', '--epoch', type=int, default=50)
parser.add_argument('-d', '--device', type=str, default='cuda:0')
args = parser.parse_args()

sr = args.sample_rate
n_fft = int(30e-3 * sr)       # 30 ms window, e.g. 480 samples at 16 kHz
hop_length = int(10e-3 * sr)  # 10 ms hop, e.g. 160 samples at 16 kHz
dataset_dir = args.dataset_dir
batch_size = args.batch_size
lr = args.learning_rate
epoch = args.epoch
device = args.device

# `size` and the Pad/MelScaleDelta/Rescale transforms are defined elsewhere in the repo
pad = Pad(size)
spec = Spectrogram(n_fft=n_fft, hop_length=hop_length)
melscale = MelScaleDelta(order=delta_order, n_mels=n_mels, sample_rate=sr,
                         f_min=f_min, f_max=f_max, dct_type='slaney')
rescale = Rescale()
transform = torchvision.transforms.Compose([pad, spec, melscale, rescale])

print(label_cnt)
train_dataset = SPEECHCOMMANDS(label_dict, dataset_dir, silence_cnt=2300,
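# --- Shape note (not in the original; only the torchaudio stage is described,
# since Pad, MelScaleDelta and Rescale are repo-specific): the Spectrogram
# alone maps (1, T) to (1, n_fft // 2 + 1, n_frames), e.g. (1, 241, 101)
# for 1 s of audio at 16 kHz:
# spec(torch.randn(1, 16000)).shape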
# ORIGINAL CODE FROM https://github.com/yoyololicon/spectrogram-inversion
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from functools import partial
import math
import heapq

from torchaudio.transforms import MelScale, Spectrogram

torch.set_default_tensor_type('torch.cuda.FloatTensor')

# `hop` and `sr` are defined elsewhere in the script
specobj = Spectrogram(n_fft=4 * hop, win_length=4 * hop, hop_length=hop,
                      pad=0, power=2, normalized=False)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
    specgram = specfunc(waveform)
    mel_specgram = melfunc(specgram)
    return mel_specgram

def spectral_convergence(input, target):
    return 20 * ((input - target).norm().log10() - target.norm().log10())
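# --- Usage note (not in the original): melspecfunc maps a waveform of shape
# (1, T) to a mel spectrogram of shape (1, n_mels, n_frames), and
# spectral_convergence returns a scalar log-ratio error in dB, e.g.:
# a = melspecfunc(torch.randn(1, sr))
# b = melspecfunc(torch.randn(1, sr))
# print(spectral_convergence(a, b))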
dataloader = DataLoader(dataset, batch_size=16, drop_last=False, shuffle=True)
noisy_batch, clean_batch = next(iter(dataloader))

# enable eval mode
model.zero_grad()
model.eval()
model.freeze()

# disable gradients to save memory
torch.set_grad_enabled(False)

n_fft = (model.n_frequency_bins - 1) * 2
x_waveform = noisy_batch

# power=None keeps the complex STFT, stored as a (real, imag) pair in the last dimension
transform = Spectrogram(n_fft=n_fft, power=None)
x_stft = transform(x_waveform)
y_stft = transform(clean_batch)

# magnitude spectrograms
x_ms = x_stft.pow(2).sum(-1).sqrt()
y_ms = y_stft.pow(2).sum(-1).sqrt()

# predict the clean magnitude and reuse the noisy phase
y_ms_hat = model(x_ms)
y_stft_hat = torch.stack([y_ms_hat * torch.cos(angle(x_stft)),
                          y_ms_hat * torch.sin(angle(x_stft))], dim=-1)

window = torch.hann_window(n_fft)
y_waveform_hat = istft(y_stft_hat, n_fft=n_fft, hop_length=n_fft // 2,
                       win_length=n_fft, window=window,
                       length=x_waveform.shape[-1])

for i, waveform in enumerate(y_waveform_hat.numpy()):
    sf.write('denoised' + str(i) + '.wav', waveform, 16000)
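# --- Alternative sketch (assumption): on recent torchaudio the same
# round-trip can use native complex tensors instead of the (real, imag)
# pair above — Spectrogram(power=None) yields a complex STFT and
# torch.istft inverts it directly:
# x_stft = Spectrogram(n_fft=n_fft, power=None)(x_waveform)  # complex dtype
# y_stft_hat = model(x_stft.abs()) * torch.exp(1j * x_stft.angle())
# y_hat = torch.istft(y_stft_hat, n_fft=n_fft, hop_length=n_fft // 2,
#                     window=torch.hann_window(n_fft),
#                     length=x_waveform.shape[-1])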
def __init__(self, sr: int, sg_cfg: SpectrogramConfig):
    self.sg_cfg = sg_cfg
    self.spec = Spectrogram(**sg_cfg.spec_args)
    self.to_mel = MelScale(sample_rate=sr, **sg_cfg.mel_args)
    self.mfcc = MFCC(sample_rate=sr, **sg_cfg.mfcc_args)
    self.to_db = AmplitudeToDB(top_db=sg_cfg.top_db)
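# --- Usage sketch (assumption: the *_args dicts match the transforms'
# signatures): the usual chain is waveform -> linear spectrogram -> mel
# scale -> decibels, with MFCC as a one-step alternative.
def to_mel_db(self, sig):
    return self.to_db(self.to_mel(self.spec(sig)))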
max_iter=2 * 2048).to(device)
griffin_lim = GriffinLim(n_fft=1024, hop_length=256).to(device)
writer = tensorboard.SummaryWriter(log_dir='logs/test')

dataset = Dataset('../DATASETS/LJSpeech-1.1/metadata.csv',
                  '../DATASETS/LJSpeech-1.1')
dataloader = DataLoader(dataset, collate_fn=dataset.collocate,
                        batch_size=batch_size, shuffle=False,
                        num_workers=0, drop_last=True)

resample = Resample(orig_freq=22050, new_freq=sample_rate)
spectrogram = Spectrogram(n_fft=1024, hop_length=256).to(device)
to_mel = MelScale(n_mels=80, sample_rate=sample_rate,
                  n_stft=1024 // 2 + 1).to(device)

with open('../DATASETS/LJSpeech-1.1/metadata.csv', encoding='utf8') as file:
    data = [line.strip().split('|') for line in file]
path, text = data[0][0], data[0][1]
path = f'../DATASETS/LJSpeech-1.1/wavs/{path}.wav'

data, sr = torchaudio.load(path)
data = resample(data)
data = data.to(device)
data = spectrogram(data.squeeze(0))

# normalize to [0, 1] for TensorBoard image logging
mel_norm = ((data.unsqueeze(0) - data.mean()) / data.std()).clamp(-1, 1) * .5 + .5
writer.add_image(f'spec/origin', mel_norm, 0)
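# --- Usage sketch (not in the original fragment): GriffinLim, constructed
# above, estimates a waveform from a magnitude-only spectrogram; its n_fft,
# hop_length and power (2.0 by default, matching Spectrogram's default)
# must agree with the analysis transform:
# wav_rec = griffin_lim(data)  # data: the (freq, time) power spectrogram above
# torchaudio.save('reconstructed.wav', wav_rec.unsqueeze(0).cpu(), sample_rate)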