def test_compute_deltas_randn(self):
    """Check that deltas of a random spectrogram keep the input's shape.

    Only the shape is verified; the values are random and not compared.
    """
    num_channels = 13
    # Window of 7 frames on each side plus the centre frame.
    win_length = 2 * 7 + 1
    specgram = torch.randn(num_channels, num_channels * 3, 1021)

    computed = F.compute_deltas(specgram, win_length=win_length)

    shapes = (computed.shape, specgram.shape)
    # The shape pair doubles as the failure message.
    self.assertTrue(shapes[0] == shapes[1], shapes)
def forward(self, specgram: Tensor) -> Tensor:
    r"""Compute delta coefficients using this module's window and padding mode.

    Args:
        specgram (Tensor): Tensor of audio of dimension (..., freq, time).

    Returns:
        Tensor: Tensor of deltas of dimension (..., freq, time).
    """
    # Delegates to the functional form with the configured settings.
    deltas = F.compute_deltas(
        specgram,
        win_length=self.win_length,
        mode=self.mode,
    )
    return deltas
def test_compute_deltas_two_channels(self):
    """Check deltas of two identical linear-ramp rows against fixed values.

    The input has shape (1, 2, 4) and is built with the test case's
    ``dtype`` and ``device``.
    """
    tensor_kwargs = {"dtype": self.dtype, "device": self.device}
    ramp = [1.0, 2.0, 3.0, 4.0]
    # Expected values for win_length=3; the halved edge values are consistent
    # with replicate-style edge padding (presumably the library default).
    ramp_deltas = [0.5, 1.0, 1.0, 0.5]

    specgram = torch.tensor([[ramp, ramp]], **tensor_kwargs)
    expected = torch.tensor([[ramp_deltas, ramp_deltas]], **tensor_kwargs)

    computed = F.compute_deltas(specgram, win_length=3)
    self.assertEqual(computed, expected)
def _test_compute_deltas(self, specgram, expected, win_length=3, atol=1e-6, rtol=1e-8):
    """Shared check: deltas of ``specgram`` match ``expected`` in shape and value.

    Args:
        specgram: Input passed to ``F.compute_deltas``.
        expected: Reference tensor the result is compared against.
        win_length: Window length forwarded to ``F.compute_deltas``.
        atol: Absolute tolerance for the value comparison.
        rtol: Relative tolerance for the value comparison.
    """
    computed = F.compute_deltas(specgram, win_length=win_length)

    # Compare shapes first; the shape pair doubles as the failure message.
    shape_pair = (computed.shape, expected.shape)
    self.assertTrue(shape_pair[0] == shape_pair[1], shape_pair)

    # NOTE(review): torch.testing.assert_allclose is deprecated in recent
    # PyTorch releases in favour of torch.testing.assert_close - confirm the
    # targeted torch version before migrating.
    torch.testing.assert_allclose(computed, expected, atol=atol, rtol=rtol)
def delta(specgram, N):
    """
    Compute delta features from a feature vector sequence.

    :param specgram: (nframes, fealen), fealen is generally numcep in the MFCC.
    :param N: half window size; the window length passed on is ``2 * N + 1``.
    :return: (nframes, fealen)
    """
    win_length = (N << 1) + 1
    # Transpose to (fealen, nframes) and add a leading dimension so the time
    # axis is last when handed to compute_deltas.
    batched = specgram.T.unsqueeze(0)
    deltas = AF.compute_deltas(batched, win_length)
    # Undo the extra leading dimension and the transpose.
    return deltas.squeeze(0).T
def test_compute_deltas_transform_same_as_functional(self, atol=1e-6, rtol=1e-8):
    """Check that ``transforms.ComputeDeltas`` matches ``F.compute_deltas``.

    Both are run on the same random input with the same window length and
    compared within ``atol`` / ``rtol``.
    """
    num_channels = 13
    # Window of 7 frames on each side plus the centre frame.
    win_length = 2 * 7 + 1
    specgram = torch.randn(num_channels, num_channels * 3, 1021)

    via_transform = transforms.ComputeDeltas(win_length=win_length)(specgram)
    via_functional = F.compute_deltas(specgram, win_length=win_length)

    # NOTE(review): torch.testing.assert_allclose is deprecated in recent
    # PyTorch releases in favour of torch.testing.assert_close - confirm the
    # targeted torch version before migrating.
    torch.testing.assert_allclose(via_functional, via_transform, atol=atol, rtol=rtol)
def __getitem__(self, index):
    '''
    Load the stored spectrogram for recording ``index`` and stack it into channels.

    With ``self.deltas`` set and exactly three channels requested, the
    channels are the spectrogram, its deltas and its delta-deltas; otherwise
    the spectrogram is repeated ``self.num_channels`` times. The result is
    min-max normalised when ``self.normalize`` is set.
    '''
    rec_id = self.recording_ids[index]
    # The original carried a "_full" remark next to this path, presumably an
    # alternative file-name suffix - verify against the data layout.
    spec_file = os.path.join(
        source_path, test_spectrograms_path, 'spectrogram', f'{rec_id}_mel.pt')
    X = torch.load(spec_file)

    use_deltas = self.deltas and self.num_channels == 3
    if use_deltas:
        first_order = compute_deltas(X)
        second_order = compute_deltas(first_order)
        channels = [X, first_order, second_order]
    else:
        channels = [X] * self.num_channels
    X = torch.stack(channels)

    if self.normalize:
        X = self.min_max_normalization(X)
    return X
def select_feat(variables, feat_type, channel=0, log=False, delta=0, cmvn=False):
    """Pick one channel of a feature and optionally post-process it.

    Args:
        variables: Mapping from feature name to tensor; the channel axis is
            third from last (``dim=-3``).
        feat_type: Key of the feature to read from ``variables``.
        channel: Index selected along the channel axis.
        log: If truthy, apply ``log(x + eps)``.
        delta: Number of successive delta orders concatenated along ``dim=-2``.
        cmvn: If truthy, normalise to zero mean / unit std along the last axis.

    Returns:
        The processed feature tensor.

    Note:
        ``self`` is not a parameter; ``self.eps`` is resolved from the
        enclosing scope.
    """
    selected = variables[feat_type].select(dim=-3, index=channel)

    if bool(log):
        selected = (selected + self.eps).log()

    # Each delta order is computed from the previous entry in the stack.
    stack = [selected.contiguous()]
    order = int(delta)
    while len(stack) <= order:
        stack.append(compute_deltas(stack[-1]))
    feats = torch.cat(stack, dim=-2)

    if not bool(cmvn):
        return feats

    mean = feats.mean(dim=-1, keepdim=True)
    std = feats.std(dim=-1, keepdim=True)
    return (feats - mean) / (std + self.eps)
def select_feat(variables, feat_type, channel=0, log=False, delta=0, cmvn=False):
    """Pick one channel of a feature and optionally post-process it.

    Args:
        variables: Mapping from feature name to tensor; the channel axis is
            third from last (``dim=-3``).
        feat_type: Key of the feature to read from ``variables``.
        channel: Index selected along the channel axis.
        log: If truthy, apply ``log(x + eps)``.
        delta: Number of successive delta orders concatenated along ``dim=-2``.
        cmvn: If truthy, normalise each item over its valid frames only, then
            re-pad the items to a common length.

    Returns:
        The processed feature tensor.

    Note:
        ``self``, ``wavs`` and ``wavs_len`` are not parameters; they are
        resolved from the enclosing scope.
    """
    selected = variables[feat_type].select(dim=-3, index=channel)

    if bool(log):
        selected = (selected + self.eps).log()

    # Each delta order is computed from the previous entry in the stack.
    stack = [selected.contiguous()]
    order = int(delta)
    while len(stack) <= order:
        stack.append(compute_deltas(stack[-1]))
    feats = torch.cat(stack, dim=-2)

    # Ratio of waveform samples to feature frames, used to turn each
    # waveform length into a frame count.
    samples_per_frame = wavs.size(-1) / feats.size(-1)
    valid_frames = [round(n_samples / samples_per_frame) for n_samples in wavs_len]

    if not bool(cmvn):
        return feats

    def _normalize(feat, n_frames):
        # Statistics come from the first n_frames frames only; the result is
        # transposed so the time axis leads, as pad_sequence pads dim 0.
        trimmed = feat[:, :n_frames]
        centered = trimmed - trimmed.mean(dim=-1, keepdim=True)
        scaled = centered / (trimmed.std(dim=-1, keepdim=True) + self.eps)
        return scaled.transpose(-1, -2)

    normalized = [_normalize(feat, n_frames) for feat, n_frames in zip(feats, valid_frames)]
    padded = pad_sequence(normalized, batch_first=True)
    return padded.transpose(-1, -2)
def test_one_channel(self):
    """Check deltas of a single linear-ramp row (shape (1, 1, 4)) against fixed values."""
    ramp = [1.0, 2.0, 3.0, 4.0]
    # Expected values for win_length=3, taken from the original test.
    ramp_deltas = [0.5, 1.0, 1.0, 0.5]

    specgram = torch.tensor([[ramp]])
    expected = torch.tensor([[ramp_deltas]])

    computed = F.compute_deltas(specgram, win_length=3)
    self.assertEqual(computed, expected)
def func(tensor):
    """Return the deltas of ``tensor`` using a fixed 15-frame window."""
    # 7 frames on each side plus the centre frame.
    half_window = 7
    return F.compute_deltas(tensor, win_length=2 * half_window + 1)
def encodes(self, sg: AudioSpectrogram):
    """Append first- and second-order deltas to ``sg`` along dimension 1.

    Deltas use ``self.width`` as the window length and ``self.mode`` as the
    padding mode. ``sg.data`` is overwritten in place and the same ``sg``
    object is returned.
    """
    delta_kwargs = {"win_length": self.width, "mode": self.mode}
    first_order = compute_deltas(sg, **delta_kwargs)
    # The second order is the delta of the first-order deltas.
    second_order = compute_deltas(first_order, **delta_kwargs)

    stacked = torch.cat([sg, first_order, second_order], dim=1)
    sg.data = stacked.contiguous()
    return sg
def test_two_channels(self):
    """Check deltas of two identical linear-ramp rows (shape (1, 2, 4)) against fixed values."""
    ramp = [1.0, 2.0, 3.0, 4.0]
    # Expected values for win_length=3, taken from the original test.
    ramp_deltas = [0.5, 1.0, 1.0, 0.5]

    specgram = torch.tensor([[ramp, ramp]])
    expected = torch.tensor([[ramp_deltas, ramp_deltas]])

    computed = F.compute_deltas(specgram, win_length=3)
    # NOTE(review): torch.testing.assert_allclose is deprecated in recent
    # PyTorch releases in favour of torch.testing.assert_close - confirm the
    # targeted torch version before migrating.
    torch.testing.assert_allclose(computed, expected)
def __getitem__(self, index):
    '''
    Generates one sample of data: a (spectrogram, label) pair.

    In "train" mode: select a 60/num_snippets sec crop from the 60 sec
    spectrogram. The crop is either placed at a uniformly random position
    (labels are zeroed when the annotated interval does not fall inside the
    crop) or positioned relative to the annotated interval
    [t_start, t_end]. Otherwise the whole 60 sec spectrogram is returned.

    Afterwards the spectrogram is stacked into channels (spectrogram +
    deltas + delta-deltas when self.deltas is set and num_channels == 3,
    else num_channels copies) and optionally min-max normalised.

    Returns:
        (X, y): stacked spectrogram tensor and float label tensor.
    '''
    rec_id = self.recording_ids[index]
    snip_num = self.num_snippets
    # snippet length in seconds; recordings are treated as 60 s long
    snip_length = 60/snip_num
    X = torch.load(os.path.join(source_path, spectrograms_path, f'{rec_id}_mel.pt'))
    y = torch.FloatTensor(self.labels[rec_id])
    if self.mode == 'train':
        # single 0/1 draw with probability rdm_cropping_prob deciding between
        # random cropping and interval-centred cropping
        rnd_cropping = bool(self.rng.binomial(1, self.rdm_cropping_prob))
        t_start = self.time_interval_starts[index]
        t_end = self.time_interval_ends[index]
        t_length = t_end - t_start
        # NOTE(review): an earlier comment here mentioned cutting off the last
        # frame to get an even number (e.g. 1921 -> 1920), but no trimming
        # happens in this block; the full frame count is used as-is.
        num_time_frames = X.shape[-1]
        # snippet length (in seconds) equivalent in frames
        # e.g. 1920 frames / 6 snippets = 320 frames/snippet
        frames_snip_length = int(num_time_frames/snip_num)
        # randomly cropping snippet length sec from spectrogram and
        # adjust labels if necessary
        if rnd_cropping:
            # NOTE: crop positions come from the module-level `random`,
            # while the branch decision above comes from self.rng.
            crop_start = random.uniform(0, 60-snip_length)
            start_frame_index = int(crop_start / 60 * num_time_frames)
            end_frame_index = start_frame_index + frames_snip_length
            # 10% of the interval length is used as a margin in the checks below
            buffer = t_length*0.1
            # adjusting labels: zero them when neither the interval start nor
            # the interval end passes the in_range checks for this crop
            # (presumably in_range(v, lo, hi) tests lo <= v <= hi - verify
            # against its definition)
            if not (self.in_range(t_start, crop_start, crop_start + snip_length - buffer) or self.in_range(t_end, crop_start + buffer, crop_start + snip_length)):
                y = torch.zeros_like(y)
        # cropping snippet length seconds around given time interval
        # [t_start, t_end]
        else:  # no random cropping
            if t_length < snip_length:
                # Interval is shorter than the snippet: shift the crop start
                # backwards from t_start by a random amount.
                # avoiding cropping over limits (0sec/60sec):
                # the upper bound keeps the start >= 0 and t_end inside the
                # crop; the lower bound keeps the crop end <= 60
                max_moving_range = min(t_start, snip_length - t_length)
                min_moving_range = max(0, t_start - (60-snip_length))
                applied_moving_range = random.uniform(min_moving_range, max_moving_range)
                start_frame_index = int((t_start - applied_moving_range) / 60 * num_time_frames)
                end_frame_index = start_frame_index + frames_snip_length
            else:
                # Interval is at least as long as the snippet: shift the crop
                # start forwards so the crop stays within [t_start, t_end].
                max_moving_range = t_length - snip_length
                min_moving_range = 0
                applied_moving_range = random.uniform(min_moving_range, max_moving_range)
                start_frame_index = int((t_start +
                                         applied_moving_range) / 60 * num_time_frames)
                end_frame_index = start_frame_index + frames_snip_length
        # crop along the second axis (assumes X is (freq, time) - TODO confirm)
        X = X[:, start_frame_index:end_frame_index]
    if self.deltas and self.num_channels == 3:
        # channels: spectrogram, its deltas, and the deltas of those deltas
        deltas_1 = compute_deltas(X)
        deltas_2 = compute_deltas(deltas_1)
        X = torch.stack([X, deltas_1, deltas_2])
    else:
        # replicate the spectrogram across all requested channels
        X = torch.stack([X]*self.num_channels)
    if self.normalize:
        X = self.min_max_normalization(X)
    return X, y