def encodes(self, sg: AudioSpectrogram) -> AudioSpectrogram:
    channel_mean = sg.contiguous().view(sg.size(0), -1).mean(-1)[:, None, None]
    mask_val = ifnone(self.val, channel_mean)
    c, y, x = sg.shape
    # Position of the first mask
    start = ifnone(self.start, random.randint(0, y - self.size))
    for _ in range(self.num_masks):
        mask = torch.ones(self.size, x) * mask_val
        if not 0 <= start <= y - self.size:
            raise ValueError(
                f"Start value '{start}' out of range for AudioSpectrogram of shape {sg.shape}"
            )
        sg[:, start : start + self.size, :] = mask
        # Setting start position for next mask
        start = random.randint(0, y - self.size)
    return sg
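# Hedged usage sketch (not part of the library): the same masking logic applied to a plain
# spectrogram tensor, to make the per-channel mask-value broadcasting easy to follow.
# The function name, tensor shape, mask size, and mask count below are illustrative assumptions.
import random
import torch

def demo_mask_frequency(sg: torch.Tensor, size: int = 10, num_masks: int = 2) -> torch.Tensor:
    """Mask `num_masks` bands of height `size` along dim 1 with the per-channel mean."""
    channel_mean = sg.contiguous().view(sg.size(0), -1).mean(-1)[:, None, None]
    c, y, x = sg.shape
    for _ in range(num_masks):
        start = random.randint(0, y - size)
        # (size, x) * (c, 1, 1) broadcasts to a (c, size, x) block of the channel mean
        sg[:, start : start + size, :] = torch.ones(size, x) * channel_mean
    return sg

masked = demo_mask_frequency(torch.rand(1, 128, 256))  # (channels, mel bins, time frames)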
def encodes(self, ai: AudioTensor) -> AudioTensor:
    sig = ai.data
    orig_samples = ai.nsamples
    # Convert the target duration from milliseconds to a number of samples
    crop_samples = int((self.duration / 1000) * ai.sr)
    if orig_samples == crop_samples:
        return ai
    elif orig_samples < crop_samples:
        # Signal is too short: pad it out to the target length
        ai.data = _tfm_pad_signal(sig, crop_samples, pad_mode=self.pad_mode)
    else:
        # Signal is too long: crop a random window of the target length
        crop_start = random.randint(0, int(orig_samples - crop_samples))
        ai.data = sig[:, crop_start : crop_start + crop_samples]
    return ai
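# Hedged usage sketch (not part of the library): the crop branch above applied to a plain
# tensor, with a hypothetical 3-second clip at 16 kHz cropped to 1000 ms. The names and
# values here are illustrative assumptions, not library API.
import random
import torch

sr, duration_ms = 16000, 1000
sig = torch.rand(1, 3 * sr)                        # (channels, samples), ~3 s of audio
crop_samples = int((duration_ms / 1000) * sr)      # 16000 samples for 1000 ms
crop_start = random.randint(0, sig.shape[-1] - crop_samples)
cropped = sig[:, crop_start : crop_start + crop_samples]
assert cropped.shape == (1, crop_samples)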
def _tfm_pad_signal(sig, width, pad_mode=AudioPadType.Zeros):
    """Pad signal to specified width (in samples), using specified pad mode"""
    c, x = sig.shape
    if pad_mode in [AudioPadType.Zeros, AudioPadType.Zeros_After]:
        # Zeros places the signal at a random offset; Zeros_After keeps it at the front
        zeros_front = random.randint(0, width - x) if pad_mode == AudioPadType.Zeros else 0
        pad_front = torch.zeros((c, zeros_front))
        pad_back = torch.zeros((c, width - x - zeros_front))
        return torch.cat((pad_front, sig, pad_back), 1)
    elif pad_mode == AudioPadType.Repeat:
        # Tile the signal until it covers the target width, then truncate
        repeats = width // x + 1
        return sig.repeat(1, repeats)[:, :width]
    else:
        raise ValueError(
            f"pad_mode {pad_mode} not currently supported, only AudioPadType.Zeros, "
            "AudioPadType.Zeros_After, or AudioPadType.Repeat"
        )
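# Hedged usage sketch (not part of the library): exercising _tfm_pad_signal with a signal
# shorter than the target width. AudioPadType is assumed to be the enum used above; the
# shapes and sample rate are illustrative.
import torch

short_sig = torch.rand(2, 8000)                                    # 0.5 s stereo at 16 kHz
padded = _tfm_pad_signal(short_sig, 16000, pad_mode=AudioPadType.Zeros_After)
assert padded.shape == (2, 16000)                                  # zeros appended at the end
repeated = _tfm_pad_signal(short_sig, 16000, pad_mode=AudioPadType.Repeat)
assert torch.equal(repeated[:, :8000], short_sig)                  # signal tiled to fill width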
def encodes(self, sg: AudioSpectrogram) -> AudioSpectrogram:
    sr, hop = sg.sr, sg.hop_length
    # Number of STFT frames needed to cover `duration` milliseconds
    w_crop = int((sr * self.duration) / (1000 * hop)) + 1
    w_sg = sg.shape[-1]
    if w_sg == w_crop:
        sg_crop = sg
    elif w_sg < w_crop:
        sg_crop = _tfm_pad_spectro(sg, w_crop, pad_mode=self.pad_mode)
    else:
        crop_start = random.randint(0, int(w_sg - w_crop))
        sg_crop = sg[:, :, crop_start : crop_start + w_crop]
        # Record which samples of the original signal the crop covers
        # (duration is in ms, so convert to samples)
        sg_crop.sample_start = int(crop_start * hop)
        sg_crop.sample_end = sg_crop.sample_start + int((self.duration / 1000) * sr)
    sg.data = sg_crop
    return sg
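# Hedged worked example (not part of the library): how w_crop is derived from a duration in
# milliseconds, the sample rate, and the STFT hop length. The numbers are illustrative.
sr, hop, duration_ms = 16000, 512, 4000
w_crop = int((sr * duration_ms) / (1000 * hop)) + 1   # 4 s of audio spans 126 STFT frames
print(w_crop)                                          # 126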
def _tfm_pad_spectro(sg, width, pad_mode=AudioPadType.Zeros):
    """Pad spectrogram to specified width (in frames), using specified pad mode"""
    c, y, x = sg.shape
    if pad_mode in [AudioPadType.Zeros, AudioPadType.Zeros_After]:
        padded = torch.zeros((c, y, width))
        # Zeros places the spectrogram at a random offset; Zeros_After keeps it at the front
        start = random.randint(0, width - x) if pad_mode == AudioPadType.Zeros else 0
        padded[:, :, start : start + x] = sg.data
        return padded
    elif pad_mode == AudioPadType.Repeat:
        repeats = width // x + 1
        return sg.repeat(1, 1, repeats)[:, :, :width]
    else:
        raise ValueError(
            f"pad_mode {pad_mode} not currently supported, only AudioPadType.Zeros, "
            "AudioPadType.Zeros_After, or AudioPadType.Repeat"
        )
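# Hedged usage sketch (not part of the library): padding a spectrogram that is narrower than
# the crop width. A plain tensor stands in for an AudioSpectrogram here, since only .shape,
# .data, and .repeat are needed; the shapes and widths are illustrative.
import torch

sg = torch.rand(1, 128, 100)                                       # (channels, mel bins, frames)
padded = _tfm_pad_spectro(sg, 150, pad_mode=AudioPadType.Zeros_After)
assert padded.shape == (1, 128, 150)                               # original frames kept, zeros after
tiled = _tfm_pad_spectro(sg, 150, pad_mode=AudioPadType.Repeat)
assert torch.equal(tiled[:, :, :100], sg)                          # spectrogram repeated along time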