Example #1
def create_spectro(self, item: AudioItem):
    c = self.config.sg_cfg
    if self.config.mfcc:
        mel = MFCC(sample_rate=item.sr, n_mfcc=c.n_mfcc,
                   melkwargs=c.mel_args())(item.sig)
    else:
        if c.custom_spectro is not None:
            mel = c.custom_spectro(item.sig)
        elif c.n_mels > 0:
            # librosa expects a 1-D numpy array, so use the first channel
            mel = librosa.feature.melspectrogram(y=np.array(item.sig[0, :]),
                                                 sr=item.sr, fmax=c.f_max,
                                                 fmin=c.f_min, **c.mel_args())
            mel = torch.from_numpy(mel)
            mel.unsqueeze_(0)
        else:
            mel = Spectrogram(**c.spectro_args())(item.sig)
        if c.to_db_scale:
            mel = AmplitudeToDB(top_db=c.top_db)(mel)
    mel = mel.detach()
    if self.config.standardize:
        mel = standardize(mel)
    if self.config.delta:
        # stack each channel with its first- and second-order deltas
        mel = torch.cat([torch.stack([m, torchdelta(m), torchdelta(m, order=2)])
                         for m in mel])
    return mel
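
For reference, `create_spectro` reads a fastai-audio style configuration object. The sketch below is a hypothetical reconstruction of only the attributes the method touches (names and defaults inferred from the body above, not the library's actual definitions):

from dataclasses import dataclass, field
from typing import Callable, Optional

@dataclass
class SpectrogramConfig:
    # hypothetical stand-in; the real fastai-audio config is richer
    n_mfcc: int = 20
    n_mels: int = 128
    f_min: float = 0.0
    f_max: float = 8000.0
    top_db: float = 80.0
    to_db_scale: bool = True
    custom_spectro: Optional[Callable] = None

    def mel_args(self):
        return {'n_mels': self.n_mels}

    def spectro_args(self):
        return {'n_fft': 1024, 'hop_length': 256}

@dataclass
class AudioConfig:
    mfcc: bool = False
    standardize: bool = False
    delta: bool = False
    sg_cfg: SpectrogramConfig = field(default_factory=SpectrogramConfig)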
Example #2
def __init__(self, sample_rate, n_fft, top_db, max_perc):
    super().__init__()
    self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
    self.stft = Spectrogram(n_fft=n_fft, power=None)  # power=None keeps the complex STFT
    self.com_norm = ComplexNorm(power=2.)
    self.fm = FrequencyMasking(100)
    self.tm = TimeMasking(100)
    self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
    self.AtoDB = AmplitudeToDB(top_db=top_db)
    self.max_perc = max_perc
    self.sample_rate = sample_rate
    # one resampler per playback rate, from 0.6x to 1.4x of the original
    self.resamples = [Resample(sample_rate, sample_rate * r)
                      for r in (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)]
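
The class's `forward` is not part of the excerpt, so the snippet below is only one plausible way these modules compose into a masking pipeline, not the author's actual code. Note that recent torchaudio expects integer sampling rates, so the fractional `sample_rate * 0.6` arguments above may need an `int(...)` cast on newer versions.

import torch
from torchaudio.transforms import (AmplitudeToDB, FrequencyMasking,
                                   Spectrogram, TimeMasking)

waveform = torch.randn(1, 16000)                    # 1 s of fake audio at 16 kHz
spec = Spectrogram(n_fft=400, power=2.)(waveform)   # (1, 201, frames)
spec = FrequencyMasking(freq_mask_param=100)(spec)  # mask a random frequency band
spec = TimeMasking(time_mask_param=100)(spec)       # mask a random time span
spec_db = AmplitudeToDB(top_db=80.)(spec)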
Example #3
sr = 16000            # sampling rate
min_level_db = -100   # reference values to normalize data
ref_level_db = 20

shape = 24            # length of time axis of split spectrograms fed to the generator
vec_len = 128         # length of the vector produced by the siamese network
bs = 128              # batch size
delta = 2.            # constant for the siamese loss
tag = 'HAP'           # tag for the training run

"""#helper functions"""

torch.set_default_tensor_type('torch.cuda.FloatTensor')
# MEL-SPECTRUM (`hop` is defined elsewhere in the original notebook)
print("finally start...")
specobj = Spectrogram(n_fft=6 * hop, win_length=6 * hop, hop_length=hop,
                      pad=0, power=2, normalized=True)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
    specgram = specfunc(waveform)
    mel_specgram = melfunc(specgram)
    return mel_specgram

def spectral_convergence(input, target):
    # spectral convergence in dB: 20 * log10(||input - target|| / ||target||)
    return 20 * ((input - target).norm().log10() - target.norm().log10())

def GRAD(spec, transform_fn, samples=None, init_x0=None, maxiter=1000, tol=1e-6, verbose=1, evaiter=10, lr=0.003):

    spec = torch.Tensor(spec)
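
`hop` comes from earlier in the original notebook; here is a self-contained sketch with an assumed value, showing how `melspecfunc` maps a waveform to a mel spectrogram:

import torch
from torchaudio.transforms import MelScale, Spectrogram

hop = 192     # assumed value; not shown in the snippet
sr = 16000
specobj = Spectrogram(n_fft=6 * hop, win_length=6 * hop, hop_length=hop,
                      pad=0, power=2, normalized=True)
# recent torchaudio versions need n_stft passed explicitly
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0., n_stft=6 * hop // 2 + 1)

waveform = torch.randn(1, sr)        # 1 s of fake audio
mel = melobj(specobj(waveform))      # (1, n_mels, frames)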
Example #4
def __init__(self, nfft):
    # power=2 gives a normalized power spectrogram
    self.spectro = Spectrogram(nfft, normalized=True, power=2)
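
The enclosing class is not shown; a hedged usage sketch with an assumed `nfft`:

import torch
from torchaudio.transforms import Spectrogram

spectro = Spectrogram(512, normalized=True, power=2)  # same settings as above
waveform = torch.randn(1, 16000)
power_spec = spectro(waveform)                        # (1, 257, frames)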
Example #5
def ex_waveform_spectro():
    dataset = load_dataset("train",
                           _DEFAULT_COMMONVOICE_ROOT,
                           _DEFAULT_COMMONVOICE_VERSION)

    # Take one of the waveforms 
    idx = 10
    waveform, rate, dictionary = dataset[idx]
    n_begin = rate      # start at 1 s.
    n_end = 3 * rate    # end at 3 s., i.e. keep a 2 s. excerpt
    waveform = waveform[:, n_begin:n_end]  # B, T

    nfft = int(_DEFAULT_WIN_LENGTH * 1e-3 * _DEFAULT_RATE)
    # nmels = _DEFAULT_NUM_MELS
    nstep = int(_DEFAULT_WIN_STEP * 1e-3 * _DEFAULT_RATE)
    trans_spectro = nn.Sequential(
        Spectrogram(n_fft=nfft,
                    hop_length=nstep),
        AmplitudeToDB()
    )
    spectro = trans_spectro(waveform)  # B, n_freq, T

    trans_mel_spectro = WaveformProcessor(rate=rate,
                                          win_length=_DEFAULT_WIN_LENGTH*1e-3,
                                          win_step=_DEFAULT_WIN_STEP*1e-3,
                                          nmels=_DEFAULT_NUM_MELS,
                                          augment=False,
                                          spectro_normalization=None)
    mel_spectro = trans_mel_spectro(waveform.transpose(0, 1))  # T, B, n_mels
    plot_spectro(mel_spectro[:, 0, :], [],
                 _DEFAULT_WIN_STEP*1e-3,
                 CharMap())

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))

    ax = axes[0]
    ax.plot([i / rate for i in range(n_begin, n_end)], waveform[0])
    ax.set_xlabel('Time (s.)')
    ax.set_ylabel('Amplitude')
    ax.set_title('Waveform')

    ax = axes[1]
    im = ax.imshow(spectro[0],
                   extent=[n_begin/rate, n_end/rate,
                           0, spectro.shape[1]],
                   aspect='auto',
                   cmap='magma',
                   origin='lower')
    ax.set_ylabel('Frequency bins')
    ax.set_xlabel('Time (s.)')
    ax.set_title("Spectrogram (dB)")
    fig.colorbar(im, ax=ax)

    ax = axes[2]
    im = ax.imshow(mel_spectro[:, 0, :].T,
                   extent=[n_begin/rate, n_end/rate,
                           0, mel_spectro.shape[0]],
                   aspect='auto',
                   cmap='magma',
                   origin='lower')
    ax.set_ylabel('Mel scales')
    ax.set_xlabel('Time (s.)')
    ax.set_title("Mel-Spectrogram (dB)")
    fig.colorbar(im, ax=ax)

    plt.tight_layout()
    plt.savefig("waveform_to_spectro.png")
    plt.show()
Example #6
    def __init__(self, train_loader, test_loader, valid_loader, general_args,
                 trainer_args):
        super(GanTrainer, self).__init__(train_loader, test_loader,
                                         valid_loader, general_args)
        # Paths
        self.loadpath = trainer_args.loadpath
        self.savepath = trainer_args.savepath

        # Load the auto-encoder
        self.use_autoencoder = False
        if trainer_args.autoencoder_path and os.path.exists(
                trainer_args.autoencoder_path):
            self.use_autoencoder = True
            self.autoencoder = AutoEncoder(general_args=general_args).to(
                self.device)
            self.load_pretrained_autoencoder(trainer_args.autoencoder_path)
            self.autoencoder.eval()

        # Load the generator
        self.generator = Generator(general_args=general_args).to(self.device)
        if trainer_args.generator_path and os.path.exists(
                trainer_args.generator_path):
            self.load_pretrained_generator(trainer_args.generator_path)

        self.discriminator = Discriminator(general_args=general_args).to(
            self.device)

        # Optimizers and schedulers
        self.generator_optimizer = torch.optim.Adam(
            params=self.generator.parameters(), lr=trainer_args.generator_lr)
        self.discriminator_optimizer = torch.optim.Adam(
            params=self.discriminator.parameters(),
            lr=trainer_args.discriminator_lr)
        self.generator_scheduler = lr_scheduler.StepLR(
            optimizer=self.generator_optimizer,
            step_size=trainer_args.generator_scheduler_step,
            gamma=trainer_args.generator_scheduler_gamma)
        self.discriminator_scheduler = lr_scheduler.StepLR(
            optimizer=self.discriminator_optimizer,
            step_size=trainer_args.discriminator_scheduler_step,
            gamma=trainer_args.discriminator_scheduler_gamma)

        # Load saved states
        if os.path.exists(self.loadpath):
            self.load()

        # Loss function and stored losses
        self.adversarial_criterion = nn.BCEWithLogitsLoss()
        self.generator_time_criterion = nn.MSELoss()
        self.generator_frequency_criterion = nn.MSELoss()
        self.generator_autoencoder_criterion = nn.MSELoss()

        # Define labels
        self.real_label = 1
        self.generated_label = 0

        # Loss scaling factors
        self.lambda_adv = trainer_args.lambda_adversarial
        self.lambda_freq = trainer_args.lambda_freq
        self.lambda_autoencoder = trainer_args.lambda_autoencoder

        # Spectrogram converter
        self.spectrogram = Spectrogram(normalized=True).to(self.device)

        # Boolean indicating if the model needs to be saved
        self.need_saving = True

        # Boolean indicating whether the generator receives feedback from the discriminator
        self.use_adversarial = trainer_args.use_adversarial
Example #7
    def __init__(self, train_loader, test_loader, valid_loader, general_args):
        # Device
        self.device = ('cuda' if torch.cuda.is_available() else 'cpu')

        # Data generators
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader

        # Iterators to cycle over the datasets
        self.train_loader_iter = cycle(iter(self.train_loader))
        self.valid_loader_iter = cycle(iter(self.valid_loader))
        self.test_loader_iter = cycle(iter(self.test_loader))

        # Epoch counter
        self.epoch = 0

        # Stored losses
        self.train_losses = {
            'time_l2': [],
            'freq_l2': [],
            'autoencoder_l2': [],
            'generator_adversarial': [],
            'discriminator_adversarial': {
                'real': [],
                'fake': []
            }
        }
        self.test_losses = {
            'time_l2': [],
            'freq_l2': [],
            'autoencoder_l2': [],
            'generator_adversarial': [],
            'discriminator_adversarial': {
                'real': [],
                'fake': []
            }
        }
        self.valid_losses = {
            'time_l2': [],
            'freq_l2': [],
            'autoencoder_l2': [],
            'generator_adversarial': [],
            'discriminator_adversarial': {
                'real': [],
                'fake': []
            }
        }

        # Time to frequency converter
        self.spectrogram = Spectrogram(normalized=True,
                                       n_fft=512,
                                       hop_length=128).to(self.device)
        self.amplitude_to_db = AmplitudeToDB()

        # Boolean indicating whether this trainer wraps the auto-encoder or the generator
        self.is_autoencoder = False

        # Boolean indicating if the model needs to be saved
        self.need_saving = True

        # Set the pseudo-epochs
        self.train_batches_per_epoch = general_args.train_batches_per_epoch
        self.test_batches_per_epoch = general_args.test_batches_per_epoch
        self.valid_batches_per_epoch = general_args.valid_batches_per_epoch
Example #8
    # (other arguments and the constants used below are defined earlier in the original script)
    parser.add_argument('-dir', '--dataset-dir', type=str)
    parser.add_argument('-e', '--epoch', type=int, default=50)
    parser.add_argument('-d', '--device', type=str, default='cuda:0')
    args = parser.parse_args()

    sr = args.sample_rate
    n_fft = int(30e-3 * sr)  # 48
    hop_length = int(10e-3 * sr)  # 16
    dataset_dir = args.dataset_dir
    batch_size = args.batch_size
    lr = args.learning_rate
    epoch = args.epoch
    device = args.device

    pad = Pad(size)
    spec = Spectrogram(n_fft=n_fft, hop_length=hop_length)
    melscale = MelScaleDelta(order=delta_order,
                             n_mels=n_mels,
                             sample_rate=sr,
                             f_min=f_min,
                             f_max=f_max,
                             dct_type='slaney')
    rescale = Rescale()

    transform = torchvision.transforms.Compose([pad, spec, melscale, rescale])

    print(label_cnt)

    train_dataset = SPEECHCOMMANDS(label_dict,
                                   dataset_dir,
                                   silence_cnt=2300,
Example #9
#ORIGINAL CODE FROM https://github.com/yoyololicon/spectrogram-inversion

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from functools import partial
import math
import heapq
from torchaudio.transforms import MelScale, Spectrogram

torch.set_default_tensor_type('torch.cuda.FloatTensor')

# `hop` and `sr` are defined elsewhere in the original script
specobj = Spectrogram(n_fft=4 * hop,
                      win_length=4 * hop,
                      hop_length=hop,
                      pad=0,
                      power=2,
                      normalized=False)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward


def melspecfunc(waveform):
    specgram = specfunc(waveform)
    mel_specgram = melfunc(specgram)
    return mel_specgram


def spectral_convergence(input, target):
    return 20 * ((input - target).norm().log10() - target.norm().log10())
Example #10
# Assumed imports for this snippet: it targets an older torchaudio where
# `angle` and `istft` lived in torchaudio.functional and complex STFTs were
# real tensors with a trailing dimension of 2.
import torch
import soundfile as sf
from torch.utils.data import DataLoader
from torchaudio.transforms import Spectrogram
from torchaudio.functional import angle, istft

dataloader = DataLoader(dataset, batch_size=16, drop_last=False, shuffle=True)
noisy_batch, clean_batch = next(iter(dataloader))

# enable eval mode
model.zero_grad()
model.eval()
model.freeze()

# disable gradients to save memory
torch.set_grad_enabled(False)

n_fft = (model.n_frequency_bins - 1) * 2

x_waveform = noisy_batch

transform = Spectrogram(n_fft=n_fft, power=None)

x_stft = transform(x_waveform)
y_stft = transform(clean_batch)
x_ms = x_stft.pow(2).sum(-1).sqrt()  # magnitude from the (real, imag) pair
y_ms = y_stft.pow(2).sum(-1).sqrt()

y_ms_hat = model(x_ms)

y_stft_hat = torch.stack([y_ms_hat * torch.cos(angle(x_stft)),
                          y_ms_hat * torch.sin(angle(x_stft))], dim=-1)

window = torch.hann_window(n_fft)
y_waveform_hat = istft(y_stft_hat, n_fft=n_fft, hop_length=n_fft // 2, win_length=n_fft, window=window, length=x_waveform.shape[-1])
for i, waveform in enumerate(y_waveform_hat.numpy()):
    sf.write(f'denoised{i}.wav', waveform, 16000)
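
This snippet targets the pre-0.8 torchaudio API. On recent versions the same reconstruction can be written with native complex tensors; the sketch below is an equivalent under that assumption (with `n_fft` assumed and `y_ms_hat` stubbed out), not the author's code:

import torch
from torchaudio.transforms import Spectrogram

n_fft = 512                                       # assumed; comes from the model above
transform = Spectrogram(n_fft=n_fft, power=None)  # power=None keeps the complex STFT

x_waveform = torch.randn(1, 16000)
x_stft = transform(x_waveform)                    # complex tensor on recent torchaudio
x_ms = x_stft.abs()                               # magnitude spectrogram
phase = torch.angle(x_stft)

y_ms_hat = x_ms                                   # stand-in for the model output
y_stft_hat = torch.polar(y_ms_hat, phase)         # enhanced magnitude + noisy phase
y_waveform_hat = torch.istft(y_stft_hat, n_fft=n_fft, hop_length=n_fft // 2,
                             win_length=n_fft, window=torch.hann_window(n_fft),
                             length=x_waveform.shape[-1])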
Example #11
def __init__(self, sr: int, sg_cfg: SpectrogramConfig):
    self.sg_cfg = sg_cfg
    self.spec = Spectrogram(**sg_cfg.spec_args)
    self.to_mel = MelScale(sample_rate=sr, **sg_cfg.mel_args)
    self.mfcc = MFCC(sample_rate=sr, **sg_cfg.mfcc_args)
    self.to_db = AmplitudeToDB(top_db=sg_cfg.top_db)
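
Unlike Example #1, `spec_args`, `mel_args`, and `mfcc_args` are dictionary attributes here. Below is a hypothetical config that would satisfy this constructor, with assumed values:

import torch
from torchaudio.transforms import MFCC, AmplitudeToDB, MelScale, Spectrogram

class SpectrogramConfig:
    # hypothetical stand-in for the fastai-audio config
    spec_args = {'n_fft': 1024, 'hop_length': 256}
    mel_args = {'n_mels': 128, 'n_stft': 1024 // 2 + 1}
    mfcc_args = {'n_mfcc': 20}
    top_db = 80.0

cfg = SpectrogramConfig()
spec = Spectrogram(**cfg.spec_args)
to_mel = MelScale(sample_rate=16000, **cfg.mel_args)
to_db = AmplitudeToDB(top_db=cfg.top_db)
mel_db = to_db(to_mel(spec(torch.randn(1, 16000))))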
Example #12
                          max_iter=2 * 2048).to(device)
griffin_lim = GriffinLim(n_fft=1024, hop_length=256).to(device)

writer = tensorboard.SummaryWriter(log_dir='logs/test')

dataset = Dataset('../DATASETS/LJSpeech-1.1/metadata.csv',
                  '../DATASETS/LJSpeech-1.1')
dataloader = DataLoader(dataset,
                        collate_fn=dataset.collocate,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=0,
                        drop_last=True)

resample = Resample(orig_freq=22050, new_freq=sample_rate)
spectrogram = Spectrogram(n_fft=1024, hop_length=256).to(device)
to_mel = MelScale(n_mels=80, sample_rate=sample_rate,
                  n_stft=1024 // 2 + 1).to(device)
with open('../DATASETS/LJSpeech-1.1/metadata.csv', encoding='utf8') as file:
    data = [line.strip().split('|') for line in file]
path, text = data[0][0], data[0][1]
path = f'../DATASETS/LJSpeech-1.1/wavs/{path}.wav'
data, sr = torchaudio.load(path)

data = resample(data)
data = data.to(device)

data = spectrogram(data.squeeze(0))
# standardize, clamp to [-1, 1], then map to [0, 1] for TensorBoard
mel_norm = (
    (data.unsqueeze(0) - data.mean()) / data.std()).clamp(-1, 1) * .5 + .5
writer.add_image('spec/origin', mel_norm, 0)
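
`to_mel` is constructed above but never used in the excerpt; presumably the mel path mirrors the spectrogram logging. A hypothetical continuation under that assumption:

# convert the linear spectrogram to the mel scale and log it the same way
mel = to_mel(data)                                   # (n_mels, frames)
mel_norm = ((mel.unsqueeze(0) - mel.mean()) / mel.std()).clamp(-1, 1) * .5 + .5
writer.add_image('mel/origin', mel_norm, 0)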