Example 1
def _init_process(cfg, process):
    if process == 'logmel':
        return MelSpectrogram(cfg.sample_rate,
                              cfg.n_fft,
                              cfg.win_length,
                              cfg.hop_length,
                              cfg.f_min,
                              cfg.f_max,
                              pad=0,
                              n_mels=cfg.n_mels)
    elif process == 'delta':
        return ComputeDeltas(cfg.delta)
    elif process == 'time_mask':
        return TimeMasking(cfg.time_mask_len)
    elif process == 'normalize':
        return Normalize()
    else:
        raise NotImplementedError(f"unknown process: {process!r}")
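
A minimal usage sketch (illustrative only): the cfg values and the two-step 'logmel' + 'delta' chain below are assumptions, not taken from the original project.

from types import SimpleNamespace

import torch
import torch.nn as nn

cfg = SimpleNamespace(sample_rate=16000, n_fft=400, win_length=400,
                      hop_length=160, f_min=0.0, f_max=8000.0,
                      n_mels=80, delta=5, time_mask_len=30)

# Chain the requested processing steps into a single module.
pipeline = nn.Sequential(*[_init_process(cfg, name)
                           for name in ('logmel', 'delta')])
features = pipeline(torch.randn(1, 16000))  # (channel, n_mels, time)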
Example 2
    def __init__(self):

        sample_rate = 44100
        num_mels = 128
        fft_length = 2048
        hop_length = fft_length // 2

        self.stft = Spectrogram(n_fft=fft_length,
                                win_length=fft_length,
                                hop_length=None,  # None defaults to win_length // 2, i.e. hop_length above
                                pad=0,
                                power=None,       # power=None keeps the complex STFT
                                normalized=False)

        self.mst = MelSpectrogram(sample_rate=sample_rate,
                                  n_fft=fft_length,
                                  hop_length=hop_length,
                                  n_mels=num_mels)

        # Normalization (post-spectrogram processing)
        self.complex_norm = ComplexNorm(power=2.)
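
For reference, a standalone sketch of the same complex-STFT + ComplexNorm pattern (note that ComplexNorm was deprecated in later torchaudio releases in favor of native complex tensors and torch.abs):

import torch
from torchaudio.transforms import ComplexNorm, Spectrogram

stft = Spectrogram(n_fft=2048, power=None)  # power=None -> complex STFT
complex_norm = ComplexNorm(power=2.)        # |z|^2 -> power spectrogram
power_spec = complex_norm(stft(torch.randn(1, 44100)))
# Equivalent to Spectrogram(n_fft=2048, power=2)(waveform) in one step.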
Example 3
import torch
import torchaudio
from torchaudio.datasets import LJSPEECH
from torchaudio.transforms import MelSpectrogram

# NormalizeDB, WaveRNNInferenceWrapper and the wavernn factory are defined in
# this example's companion modules.


def main(args):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    waveform, sample_rate, _, _ = LJSPEECH("./", download=True)[0]

    mel_kwargs = {
        'sample_rate': sample_rate,
        'n_fft': 2048,
        'f_min': 40.,
        'n_mels': 80,
        'win_length': 1100,
        'hop_length': 275,
        'mel_scale': 'slaney',
        'norm': 'slaney',
        'power': 1,
    }
    transforms = torch.nn.Sequential(
        MelSpectrogram(**mel_kwargs),
        NormalizeDB(min_level_db=-100, normalization=True),
    )
    mel_specgram = transforms(waveform)

    wavernn_model = wavernn(args.checkpoint_name).eval().to(device)
    wavernn_inference_model = WaveRNNInferenceWrapper(wavernn_model)

    if args.jit:
        wavernn_inference_model = torch.jit.script(wavernn_inference_model)

    with torch.no_grad():
        output = wavernn_inference_model(
            mel_specgram.to(device),
            loss_name=args.loss,
            mulaw=(not args.no_mulaw),
            batched=(not args.no_batch_inference),
            timesteps=args.batch_timesteps,
            overlap=args.batch_overlap,
        )

    torchaudio.save(args.output_wav_path,
                    output.reshape(1, -1),
                    sample_rate=sample_rate)
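
A hypothetical argument parser covering the flags main() reads; the defaults are illustrative assumptions (the checkpoint name matches one of torchaudio's pretrained WaveRNN checkpoints, the rest are guesses):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint-name", default="wavernn_10k_epochs_8bits_ljspeech")
    parser.add_argument("--jit", action="store_true")
    parser.add_argument("--loss", default="crossentropy")
    parser.add_argument("--no-mulaw", action="store_true")
    parser.add_argument("--no-batch-inference", action="store_true")
    parser.add_argument("--batch-timesteps", type=int, default=100)
    parser.add_argument("--batch-overlap", type=int, default=5)
    parser.add_argument("--output-wav-path", default="output.wav")
    main(parser.parse_args())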
Example 4
 def __init__(
     self,
     num_classes: int,
     hop_length: int,
     sample_rate: int,
     n_mels: int,
     n_fft: int,
     power: float,
     normalize: bool,
     use_decibels: bool,
 ) -> None:
     super().__init__()
     self.use_decibels = use_decibels
     self.melspectrogram = MelSpectrogram(
         sample_rate=sample_rate,
         n_fft=n_fft,
         hop_length=hop_length,
         n_mels=n_mels,
         power=power,
         normalized=normalize,
     )
     self.amplitude2db = AmplitudeToDB()
     self.input_bn = nn.BatchNorm2d(num_features=1)
     self.conv1 = nn.Conv2d(in_channels=1,
                            out_channels=64,
                            kernel_size=[7, 3])
     self.bn1 = nn.BatchNorm2d(num_features=64)
     self.conv2 = nn.Conv2d(in_channels=64,
                            out_channels=128,
                            kernel_size=[1, 7])
     self.bn2 = nn.BatchNorm2d(num_features=128)
     self.conv3 = nn.Conv2d(in_channels=128,
                            out_channels=256,
                            kernel_size=[1, 10])
     self.bn3 = nn.BatchNorm2d(num_features=256)
     self.conv4 = nn.Conv2d(in_channels=256,
                            out_channels=512,
                            kernel_size=[7, 1])
     self.bn4 = nn.BatchNorm2d(num_features=512)
     self.logits = nn.Linear(in_features=512, out_features=num_classes)
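
The forward pass is not part of the snippet; a plausible companion method (an assumption, not the original code) would be a log-mel front end followed by the four conv/BN/ReLU stages and global average pooling:

import torch.nn.functional as F

def forward(self, waveform):
    x = self.melspectrogram(waveform)      # (batch, n_mels, time)
    if self.use_decibels:
        x = self.amplitude2db(x)
    x = self.input_bn(x.unsqueeze(1))      # (batch, 1, n_mels, time)
    for conv, bn in ((self.conv1, self.bn1), (self.conv2, self.bn2),
                     (self.conv3, self.bn3), (self.conv4, self.bn4)):
        x = F.relu(bn(conv(x)))
    x = x.mean(dim=(2, 3))                 # global average pool -> (batch, 512)
    return self.logits(x)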
Example 5
 def _audio_transform(self):
     """
     This function contains example transforms using both PyTorchVideo and TorchAudio
     in the same Callable.
     """
     args = self.args
     n_fft = int(
         float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
     )
     hop_length = int(
         float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
     )
     eps = 1e-10
     return ApplyTransformToKey(
         key="audio",
         transform=Compose(
             [
                 Resample(
                     orig_freq=args.audio_raw_sample_rate,
                     new_freq=args.audio_resampled_rate,
                 ),
                 MelSpectrogram(
                     sample_rate=args.audio_resampled_rate,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     n_mels=args.audio_num_mels,
                     center=False,
                 ),
                 Lambda(lambda x: x.clamp(min=eps)),
                 Lambda(torch.log),
                 UniformTemporalSubsample(args.audio_mel_num_subsample),
                 Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                 Lambda(
                     lambda x: x.view(1, x.size(0), 1, x.size(1))
                 ),  # (T, F) -> (1, T, 1, F)
                 Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
             ]
         ),
     )
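
A standalone sketch of the clamp-then-log pattern used above: clamping to a small epsilon before torch.log keeps silent frames from producing -inf (the parameter values below are illustrative):

import torch
from torchaudio.transforms import MelSpectrogram

eps = 1e-10
melspec = MelSpectrogram(sample_rate=16000, n_fft=400,
                         hop_length=160, n_mels=64, center=False)
log_mel = torch.log(melspec(torch.randn(1, 16000)).clamp(min=eps))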
Example 6
 def __init__(self, sample_rate, n_fft, top_db, max_perc):
     super().__init__()
     self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft//2+1)
     self.stft = Spectrogram(n_fft=n_fft, power=None)
     self.com_norm = ComplexNorm(power=2.)
     self.fm = FrequencyMasking(50)
     self.tm = TimeMasking(50)
     self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
      self.AtoDB = AmplitudeToDB(top_db=top_db)
     self.max_perc = max_perc
     self.sample_rate = sample_rate
      self.resamples = [
          # int(): recent torchaudio versions require integer sample rates.
          Resample(sample_rate, int(sample_rate * r))
          for r in (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)
      ]
Example 7
    def __init__(self, num_classes: int, sample_rate: int) -> None:
        super().__init__()
        self.melspectrogram = MelSpectrogram(sample_rate=sample_rate)
        self.norm_input = nn.BatchNorm2d(num_features=1)
        self.maxpool = nn.MaxPool2d(kernel_size=2)

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=48, kernel_size=3)
        self.conv4 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=3)
        self.conv5 = nn.Conv2d(in_channels=48, out_channels=96, kernel_size=3)
        self.conv6 = nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3)
        self.conv7 = nn.Conv2d(in_channels=128,
                               out_channels=256,
                               kernel_size=3)

        self.drop1 = nn.Dropout(p=0.1)
        self.drop2 = nn.Dropout(p=0.2)

        self.fc = nn.Linear(in_features=1536, out_features=256)
        self.fc_norm = nn.BatchNorm1d(num_features=256)
        self.linear = nn.Linear(in_features=256, out_features=num_classes)
Example 8
def build_transform(feature_type, feature_size, n_fft=512, win_length=400,
                    hop_length=200, delta=False, cmvn=False, downsample=1,
                    T_mask=0, T_num_mask=0, F_mask=0, F_num_mask=0,
                    pad_to_divisible=True):
    feature_args = {
        'n_fft': n_fft,
        'win_length': win_length,
        'hop_length': hop_length,
        # 'f_min': 20,
        # 'f_max': 5800,
    }
    transform = []
    input_size = feature_size
    if feature_type == 'mfcc':
        transform.append(MFCC(
            n_mfcc=feature_size, log_mels=True, melkwargs=feature_args))
    if feature_type == 'melspec':
        transform.append(MelSpectrogram(
            n_mels=feature_size, **feature_args))
    if feature_type == 'logfbank':
        transform.append(FilterbankFeatures(
            n_filt=feature_size, **feature_args))
    if delta:
        transform.append(CatDeltas())
        input_size = input_size * 3
    # if cmvn:
    #     transform.append(CMVN())
    if downsample > 1:
        transform.append(Downsample(downsample, pad_to_divisible))
        input_size = input_size * downsample
    transform_test = torch.nn.Sequential(*transform)

    if T_mask > 0 and T_num_mask > 0:
        transform.append(TimeMasking(T_mask, T_num_mask))
    if F_mask > 0 and F_num_mask > 0:
        transform.append(FrequencyMasking(F_mask, F_num_mask))
    transform_train = torch.nn.Sequential(*transform)

    return transform_train, transform_test, input_size
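
A hypothetical call (argument values are illustrative): 80-band log filterbanks with deltas and 2x downsampling, plus SpecAugment-style masking on the training transform only:

transform_train, transform_test, input_size = build_transform(
    'logfbank', 80, delta=True, downsample=2,
    T_mask=50, T_num_mask=2, F_mask=15, F_num_mask=2)
# input_size == 80 * 3 * 2 == 480; transform_test omits the masking steps.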
Example 9
    def wav_tensor_to_mel(self, tensor, normalization=True, to_db=True):
        """Mel Spectrogram from raw audio tensor.

        Returns:
            - tensor of shape (n_frames, n_mels)
        """

        if normalization:
            tensor = tensor / tensor.norm(float('inf'))

        mel = MelSpectrogram(sample_rate=self.sample_rate,
                             hop_length=self.hop_len,
                             n_fft=self.n_fft,
                             n_mels=self.n_mels)(tensor)

        if to_db:
            mel = self.amplitude_to_db(mel)

        if normalization:
            # min-max scale to [0, 1]; without this fix the function raised a
            # NameError whenever to_db=False, since mel_db was never bound.
            mel = (mel - mel.min()) / (mel.max() - mel.min())

        return mel.T
Example 10
    def __init__(self,
                 output_class=264,
                 d_size=256,
                 sample_rate=32000,
                 n_fft=2**11,
                 top_db=80):

        super().__init__()
        self.mel = MelSpectrogram(sample_rate, n_fft=n_fft)
        self.norm_db = AmplitudeToDB(top_db=top_db)

        self.conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.LeakyReLU(0.1)  # nn.ReLU has no slope argument; 0.1 was silently read as inplace=True
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=3)
        self.dropout = nn.Dropout(0.1)

        self.conv2 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
        self.bn2 = nn.BatchNorm2d(128)
        self.relu2 = nn.LeakyReLU(0.1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=4, stride=3)
        self.dropout2 = nn.Dropout(0.1)

        self.conv3 = nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
        self.bn3 = nn.BatchNorm2d(256)
        self.relu3 = nn.LeakyReLU(0.1)
        self.maxpool3 = nn.MaxPool2d(kernel_size=4, stride=3)
        self.dropout3 = nn.Dropout(0.1)

        self.lstm = nn.LSTM(12, 256, 2, batch_first=True)
        self.dropout_lstm = nn.Dropout(0.3)
        self.bn_lstm = nn.BatchNorm1d(256)

        self.output1 = nn.Linear(256, 512)
        self.relu_out = nn.LeakyReLU(0.1)
        self.dropout_out = nn.Dropout(0.1)
        self.output2 = nn.Linear(512, output_class)
Example 11
    def __init__(self,
                 sample_rate: int,
                 mel_size: int,
                 n_fft: int,
                 win_length: int,
                 hop_length: int,
                 min_db: float,
                 max_db: float,
                 mel_min: float = 0.,
                 mel_max: Optional[float] = None):
        super().__init__()
        self.mel_size = mel_size
        # db to log
        self.min_db = np.log(np.power(10, min_db / 10))
        self.max_db = np.log(np.power(10, max_db / 10))

        self.melfunc = MelSpectrogram(sample_rate=sample_rate,
                                      n_fft=n_fft,
                                      win_length=win_length,
                                      hop_length=hop_length,
                                      f_min=mel_min,
                                      f_max=mel_max,
                                      n_mels=mel_size,
                                      window_fn=torch.hann_window)
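
A plausible companion forward (not shown in the original, so this is an assumption consistent with the min_db/max_db conversion above) would clamp the log-mel output to the precomputed range:

def forward(self, wav):
    mel = self.melfunc(wav).clamp(min=1e-10).log()
    return mel.clamp(self.min_db, self.max_db)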
Example 12
 def __init__(
     self,
     sample_rate: float,
     fft_window_ms: float,
     fft_hop_ms: float,
     n_fft: int,
     f_min: float,
     n_mels: int,
     preemph: float,
     ref_db: float,
     dc_db: float,
 ):
     super().__init__()
     self.melspectrogram = MelSpectrogram(
         sample_rate=sample_rate,
         win_length=int(sample_rate * fft_window_ms / 1000),
         hop_length=int(sample_rate * fft_hop_ms / 1000),
         n_fft=n_fft,
         f_min=f_min,
         n_mels=n_mels,
     )
     self.preemph = preemph
     self.ref_db = ref_db
     self.dc_db = dc_db
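
The preemph coefficient suggests a first-order pre-emphasis filter, y[t] = x[t] - preemph * x[t-1], applied before the mel front end; a sketch (the method itself is assumed, not part of the original snippet):

def preemphasis(self, wav):
    # Keep the first sample, high-pass the rest.
    return torch.cat([wav[..., :1],
                      wav[..., 1:] - self.preemph * wav[..., :-1]], dim=-1)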
Example 13
from torch.nn import Sequential
from torch.nn import Module
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

from typing import Tuple

commun_transforms = Sequential(
    MelSpectrogram(sample_rate=44100, n_fft=2048, hop_length=512, n_mels=64),
    AmplitudeToDB(),
)
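
A quick shape check (illustrative): one second of audio at 44.1 kHz yields 87 frames with hop_length=512, so the output is (1, 64, 87).

import torch

log_mel = commun_transforms(torch.randn(1, 44100))
assert log_mel.shape == (1, 64, 87)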


def supervised() -> Tuple[Module, Module]:
    train_transform = commun_transforms
    val_transform = commun_transforms

    return train_transform, val_transform


def dct() -> Tuple[Module, Module]:
    return supervised()


def dct_uniloss() -> Tuple[Module, Module]:
    return supervised()


def dct_aug4adv() -> Tuple[Module, Module]:
    raise NotImplementedError

Example 14
    def open(self, item) -> AudioItem:
        p = Path(item)
        if self.path is not None and str(self.path) not in str(item):
            p = self.path / item
        if not p.exists():
            raise FileNotFoundError(
                f"Neither '{item}' nor '{p}' could be found")
        if not str(p).lower().endswith(AUDIO_EXTENSIONS):
            raise Exception("Invalid audio file")

        cfg = self.config
        if cfg.use_spectro:
            folder = md5(str(asdict(cfg)) + str(asdict(cfg.sg_cfg)))
            fname = f"{md5(str(p))}-{p.name}.pt"
            image_path = cfg.cache_dir / (f"{folder}/{fname}")
            if cfg.cache and not cfg.force_cache and image_path.exists():
                mel = torch.load(image_path).squeeze()
                start, end = None, None
                if cfg.duration and cfg._processed:
                    mel, start, end = tfm_crop_time(mel, cfg._sr, cfg.duration,
                                                    cfg.sg_cfg.hop,
                                                    cfg.pad_mode)
                return AudioItem(spectro=mel,
                                 path=item,
                                 max_to_pad=cfg.max_to_pad,
                                 start=start,
                                 end=end)

        sig, sr = torchaudio.load(str(p))
        if cfg._sr is not None and sr != cfg._sr:
            raise ValueError(
                f"Multiple sample rates detected: sample rate {sr} of file "
                f"{p} does not match the config sample rate {cfg._sr}. "
                f"Choose one rate and set resample_to to that value.")
        if sig.shape[0] > 1:
            if not cfg.downmix:
                warnings.warn(
                    f"Audio file {p} has {sig.shape[0]} channels; automatically "
                    f"downmixing to mono. Set AudioConfig.downmix=True to "
                    f"silence this warning.")
            sig = DownmixMono(channels_first=True)(sig)
        if cfg.max_to_pad or cfg.segment_size:
            pad_len = cfg.max_to_pad if cfg.max_to_pad is not None else cfg.segment_size
            sig = tfm_padtrim_signal(sig,
                                     int(pad_len / 1000 * sr),
                                     pad_mode="zeros")

        mel = None
        start, end = None, None  # ensure these exist even when use_spectro is off
        if cfg.use_spectro:
            if cfg.mfcc:
                mel = MFCC(sr=sr,
                           n_mfcc=cfg.sg_cfg.n_mfcc,
                           melkwargs=cfg.sg_cfg.mel_args())(sig)
            else:
                mel = MelSpectrogram(**(cfg.sg_cfg.mel_args()))(sig)
                if cfg.sg_cfg.to_db_scale:
                    mel = SpectrogramToDB(top_db=cfg.sg_cfg.top_db)(mel)
            mel = mel.squeeze().permute(1, 0).flip(0)
            if cfg.standardize: mel = standardize(mel)
            if cfg.delta:
                mel = torch.stack(
                    [mel, torchdelta(mel),
                     torchdelta(mel, order=2)])
            else:
                mel = mel.expand(3, -1, -1)
            if cfg.cache:
                os.makedirs(image_path.parent, exist_ok=True)
                torch.save(mel, image_path)
                _record_cache_contents(cfg, [image_path])
            if cfg.duration and cfg._processed:
                mel, start, end = tfm_crop_time(mel, cfg._sr, cfg.duration,
                                                cfg.sg_cfg.hop, cfg.pad_mode)
        return AudioItem(sig=sig.squeeze(),
                         sr=sr,
                         spectro=mel,
                         path=item,
                         start=start,
                         end=end)
Example 15
def normalize_spectrogram(spectrogram, min_level=-80.0):
    return torch.clamp(spectrogram / -min_level, -1.0, 0.0) + 1.0


def denormalize_spectrogram(spectrogram, min_level=-80.0, min_value=0, max_value=1):
    return ((torch.clamp(spectrogram, 0.0, 1.0) - 1.0) - min_value) / (max_value - min_value) * -min_level


# region transform
normalize_output = normalization_parameters is not None
resample_input = int(melspectrogram_transform_parameters["sample_rate"]) != 44100
transforms_to_be_composed = [
    Lambda(lambda x: (x[:, 0] + x[:, -1]) / 2),  # downmix stereo to mono
    Lambda(lambda x: x[..., ::2]),               # naive 2x decimation (no anti-aliasing filter)
    MelSpectrogram(**melspectrogram_transform_parameters),
    AmplitudeToDB()
]
if normalize_output:
    transforms_to_be_composed.append(
        Lambda(lambda x: shift_and_normalize_spec(x, **normalization_parameters))
    )
direct_transform = transforms.Compose(transforms_to_be_composed)


# endregion

# region inverse transform
def pointwise_mel_to_audio(x):
    return mel_to_audio(x,
                        sr=melspectrogram_transform_parameters["sample_rate"],
Example 16
from torchaudio.transforms import MelSpectrogram
from constants import WINDOW_LENGTH, WINDOW_HOP

transform = MelSpectrogram(n_fft=WINDOW_LENGTH,
                           win_length=WINDOW_LENGTH,
                           hop_length=WINDOW_HOP,
                           power=1)
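
An illustrative call; WINDOW_LENGTH and WINDOW_HOP come from the project's constants module, so the frame count depends on their values. With power=1 the output is a magnitude (not power) mel spectrogram with the default 128 mel bands:

import torch

mel = transform(torch.randn(1, 16000))  # (1, 128, time)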
Example 17
 def __init__(self, **kwargs):
     self.mel_spec = MelSpectrogram(**kwargs)
     self.db_scale = AmplitudeToDB()
Example 18
 def __init__(self, **kwargs):
     self.mel_spec = MelSpectrogram(**kwargs)
Example 19
import os
import umap
import torch
import torch.optim as optim
import wandb
from torchaudio.transforms import MelSpectrogram

# Nnet, GE2ELoss, get_batch and padding_batch are project-local helpers.

reducer = umap.UMAP(random_state=42, n_neighbors=7, min_dist=0.1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

manifold_approx = []
val_loss = []
num_utt = 27629
num_epoch = 30
model = Nnet()
optimizer = optim.Adam(model.parameters())
criterion = GE2ELoss(init_w=10.0, init_b=-5.0, loss_method='softmax')
melspec = MelSpectrogram(n_mels=128, n_fft=400).to(device)
for i in range(num_epoch):
    run = wandb.init(project="ge2e", reinit=True)
    running_loss = 0.0
    for k in range(num_utt // 80):
        model.to(device)
        model.train()
        X = get_batch('/content/LibriSpeech/train-clean-100', 8, 10).sampler()
        X = padding_batch(X).to(device)

        X = melspec(X)

        optimizer.zero_grad()
        outputs = model(X).view(8, 10, -1)
        loss = criterion(outputs)
        loss.backward()
Example 20
    def __init__(
        self,
        num_classes: int,
        hop_length: int,
        sample_rate: int,
        n_mels: int,
        n_fft: int,
        power: float,
        normalize: bool,
        use_decibels: bool,
    ) -> None:
        super().__init__()
        self.use_decibels = use_decibels

        self.melspectrogram = MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            power=power,
            normalized=normalize,
        )
        self.amplitude2db = AmplitudeToDB()
        self.input_bn = nn.BatchNorm2d(num_features=1)

        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=16,
                               kernel_size=3,
                               padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)
        self.res1 = ResBlock(n_in=16, bottleneck=16, n_out=16)

        self.conv2 = nn.Conv2d(in_channels=16,
                               out_channels=32,
                               kernel_size=3,
                               padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=32)
        self.res2 = ResBlock(n_in=32, bottleneck=32, n_out=32)
        self.res3 = ResBlock(n_in=32, bottleneck=32, n_out=32)

        self.conv3 = nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size=3,
                               padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=64)
        self.res4 = ResBlock(n_in=64, bottleneck=64, n_out=64)
        self.res5 = ResBlock(n_in=64, bottleneck=64, n_out=64)

        self.conv4 = nn.Conv2d(in_channels=64,
                               out_channels=128,
                               kernel_size=3,
                               padding=1)
        self.bn4 = nn.BatchNorm2d(num_features=128)
        self.res6 = ResBlock(n_in=128, bottleneck=128, n_out=128)
        self.res7 = ResBlock(n_in=128, bottleneck=128, n_out=128)

        self.conv5 = nn.Conv2d(in_channels=128,
                               out_channels=256,
                               kernel_size=3,
                               padding=1)
        self.bn5 = nn.BatchNorm2d(num_features=256)

        self.logits = nn.Linear(in_features=256, out_features=num_classes)
Example 21
    def __init__(self):
        super(MelNet, self).__init__()
        self.mfcc = MelSpectrogram(sample_rate=8000)  # despite its name, this computes a mel spectrogram, not MFCCs

        self.c1 = nn.Conv1d(1, 128, kernel_size=400, stride=200, padding=200)
        self.conv1 = nn.Conv2d(1, 1, kernel_size=37, padding=18)
Example 22
    def __init__(
        self,
        hparams,
        encoder,
        sed_student,
        opt=None,
        train_data=None,
        valid_data=None,
        test_data=None,
        train_sampler=None,
        scheduler=None,
        fast_dev_run=False,
    ):
        super(SEDTask4_2021, self).__init__()
        self.hparams = hparams

        self.encoder = encoder
        self.sed_student = sed_student
        self.sed_teacher = deepcopy(sed_student)
        self.opt = opt
        self.train_data = train_data
        self.valid_data = valid_data
        self.test_data = test_data
        self.train_sampler = train_sampler
        self.scheduler = scheduler
        self.fast_dev_run = fast_dev_run
        if self.fast_dev_run:
            self.num_workers = 1
        else:
            self.num_workers = self.hparams["training"]["num_workers"]

        feat_params = self.hparams["feats"]
        self.mel_spec = MelSpectrogram(
            sample_rate=feat_params["sample_rate"],
            n_fft=feat_params["n_window"],
            win_length=feat_params["n_window"],
            hop_length=feat_params["hop_length"],
            f_min=feat_params["f_min"],
            f_max=feat_params["f_max"],
            n_mels=feat_params["n_mels"],
            window_fn=torch.hamming_window,
            wkwargs={"periodic": False},
            power=1,
        )

        for param in self.sed_teacher.parameters():
            param.detach_()

        # instantiating losses
        self.supervised_loss = torch.nn.BCELoss()
        if hparams["training"]["self_sup_loss"] == "mse":
            self.selfsup_loss = torch.nn.MSELoss()
        elif hparams["training"]["self_sup_loss"] == "bce":
            self.selfsup_loss = torch.nn.BCELoss()
        else:
            raise NotImplementedError

        # for weak labels we simply compute f1 score
        self.get_weak_student_f1_seg_macro = pl.metrics.classification.F1(
            len(self.encoder.labels),
            average="macro",
            multilabel=True,
            compute_on_step=False,
        )

        self.get_weak_teacher_f1_seg_macro = pl.metrics.classification.F1(
            len(self.encoder.labels),
            average="macro",
            multilabel=True,
            compute_on_step=False,
        )

        self.scaler = self._init_scaler()

        # buffer for event based scores which we compute using sed-eval

        self.val_buffer_student_synth = {
            k: pd.DataFrame()
            for k in self.hparams["training"]["val_thresholds"]
        }
        self.val_buffer_teacher_synth = {
            k: pd.DataFrame()
            for k in self.hparams["training"]["val_thresholds"]
        }

        self.val_buffer_student_test = {
            k: pd.DataFrame()
            for k in self.hparams["training"]["val_thresholds"]
        }
        self.val_buffer_teacher_test = {
            k: pd.DataFrame()
            for k in self.hparams["training"]["val_thresholds"]
        }

        test_n_thresholds = self.hparams["training"]["n_test_thresholds"]
        test_thresholds = np.arange(1 / (test_n_thresholds * 2), 1,
                                    1 / test_n_thresholds)
        self.test_psds_buffer_student = {
            k: pd.DataFrame()
            for k in test_thresholds
        }
        self.test_psds_buffer_teacher = {
            k: pd.DataFrame()
            for k in test_thresholds
        }

        self.decoded_student_05_buffer = pd.DataFrame()
        self.decoded_teacher_05_buffer = pd.DataFrame()
Example 23
 def __init__(self, **kwargs):
     super(MelSpectrogramFixed, self).__init__()
     self.torchaudio_backend = MelSpectrogram(**kwargs)
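
Thin wrappers like this are commonly used to pin the transform's window and mel filterbank buffers inside a module so they follow .to(device) calls and stay out of optimizer parameter groups; the forward method (not shown here) presumably just delegates to self.torchaudio_backend.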
Example 24
from collections import defaultdict
from glob import glob

import librosa
from torch.nn import ConstantPad1d
from torchaudio.transforms import MelSpectrogram
from tqdm import tqdm

n_fft = 1600
n_mels = 128
f_max = 20000
sr = 16000
hop_length = 160
win_length = hop_length * 2
max_len = sr // 4
dialects = ['DR' + str(i) for i in range(1, 9)]
phoneme_cols = ['start', 'end', 'phoneme']
# NOTE: f_max=20000 exceeds the Nyquist frequency (sr / 2 = 8000) for 16 kHz
# audio, so the top mel filterbanks end up all-zero.
spec = MelSpectrogram(sample_rate=sr, n_fft=n_fft, f_max=f_max)
epsilon = 1e-6


def get_mspec(y):
    return librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length)


def inverse_melspec(s):
    return librosa.feature.inverse.mel_to_audio(s, sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length)


def wav_to_padded_mspec_flat_tensor(wav, length):
    assert(length <= max_len)
    p_len = max_len - length
    padding = ConstantPad1d((0, p_len), 0)