Example no. 1
 def test_compute_deltas_randn(self):
     channel = 13
     n_mfcc = channel * 3
     time = 1021
     win_length = 2 * 7 + 1
     specgram = torch.randn(channel, n_mfcc, time)
     computed = F.compute_deltas(specgram, win_length=win_length)
     self.assertTrue(computed.shape == specgram.shape,
                     (computed.shape, specgram.shape))
Example no. 2
    def forward(self, specgram: Tensor) -> Tensor:
        r"""
        Args:
            specgram (Tensor): Tensor of audio of dimension (..., freq, time).

        Returns:
            Tensor: Tensor of deltas of dimension (..., freq, time).
        """
        return F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode)
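A minimal usage sketch of the transform above (the input shape is illustrative, not taken from the original code): calling the ComputeDeltas module applies compute_deltas with the stored win_length and mode, and the output keeps the input shape.

    import torch
    import torchaudio.transforms as transforms

    specgram = torch.randn(1, 40, 100)                     # (channel, freq, time)
    deltas = transforms.ComputeDeltas(win_length=5)(specgram)
    print(deltas.shape)                                    # torch.Size([1, 40, 100]), same as the input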
Example no. 3
 def test_compute_deltas_two_channels(self):
     specgram = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]],
                             dtype=self.dtype,
                             device=self.device)
     expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5], [0.5, 1.0, 1.0, 0.5]]],
                             dtype=self.dtype,
                             device=self.device)
     computed = F.compute_deltas(specgram, win_length=3)
     self.assertEqual(computed, expected)
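Why the expected values are [0.5, 1.0, 1.0, 0.5]: with win_length=3 the delta filter reduces to d[t] = (x[t+1] - x[t-1]) / 2, and the borders are padded in "replicate" mode, so d[0] = (2 - 1) / 2 = 0.5, d[1] = (3 - 1) / 2 = 1.0, d[2] = (4 - 2) / 2 = 1.0 and d[3] = (4 - 3) / 2 = 0.5. A standalone check of this, outside the test class:

    import torch
    import torchaudio.functional as F

    x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])
    print(F.compute_deltas(x, win_length=3))               # -> [[[0.5, 1.0, 1.0, 0.5]]]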
Example no. 4
 def _test_compute_deltas(self,
                          specgram,
                          expected,
                          win_length=3,
                          atol=1e-6,
                          rtol=1e-8):
     computed = F.compute_deltas(specgram, win_length=win_length)
     self.assertTrue(computed.shape == expected.shape,
                     (computed.shape, expected.shape))
     torch.testing.assert_allclose(computed, expected, atol=atol, rtol=rtol)
Example no. 5
def delta(specgram, N):
    """
    Compute delta features from a feature vector sequence.
    :param specgram: (nframes, fealen) tensor; fealen is typically numcep for MFCC features.
    :param N: half window size; the delta filter spans win_length = 2 * N + 1 frames.
    :return: (nframes, fealen) tensor of delta features.
    """
    # compute_deltas expects (..., freq, time), so transpose to (fealen, nframes),
    # add a batch dimension, and undo both afterwards.
                             (N << 1) + 1).squeeze(0).T
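A hypothetical call of the wrapper above, assuming AF refers to torchaudio.functional (the MFCC tensor is made up for illustration):

    import torch
    import torchaudio.functional as AF

    mfcc = torch.randn(200, 13)                            # (nframes, numcep)
    d = delta(mfcc, N=2)                                   # N = 2 -> win_length = 2 * 2 + 1 = 5
    print(d.shape)                                         # torch.Size([200, 13]), same layout as the input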
Example no. 6
    def test_compute_deltas_transform_same_as_functional(self, atol=1e-6, rtol=1e-8):
        channel = 13
        n_mfcc = channel * 3
        time = 1021
        win_length = 2 * 7 + 1
        specgram = torch.randn(channel, n_mfcc, time)

        transform = transforms.ComputeDeltas(win_length=win_length)
        computed_transform = transform(specgram)

        computed_functional = F.compute_deltas(specgram, win_length=win_length)
        torch.testing.assert_allclose(computed_functional, computed_transform, atol=atol, rtol=rtol)
Example no. 7
    def __getitem__(self, index):
        '''
        Generates one sample of data
        '''
        rec_id = self.recording_ids[index]

        X = torch.load(
            os.path.join(source_path, test_spectrograms_path, 'spectrogram',
                         f'{rec_id}_mel.pt'))  # _full

        if self.deltas and self.num_channels == 3:
            deltas_1 = compute_deltas(X)
            deltas_2 = compute_deltas(deltas_1)
            X = torch.stack([X, deltas_1, deltas_2])
        else:
            X = torch.stack([X] * self.num_channels)

        if self.normalize:
            X = self.min_max_normalization(X)

        return X
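The deltas branch above turns a single (freq, time) spectrogram into a 3-channel, image-like tensor by stacking it with its first- and second-order deltas. A standalone sketch of that step (shapes are illustrative; compute_deltas is assumed to be torchaudio.functional.compute_deltas):

    import torch
    from torchaudio.functional import compute_deltas

    X = torch.randn(128, 1920)                             # (freq, time) mel spectrogram
    deltas_1 = compute_deltas(X)
    deltas_2 = compute_deltas(deltas_1)
    X3 = torch.stack([X, deltas_1, deltas_2])              # (3, freq, time), like an RGB image
    print(X3.shape)                                        # torch.Size([3, 128, 1920])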
Example no. 8
 def select_feat(variables, feat_type, channel=0, log=False, delta=0, cmvn=False):
     raw_feat = variables[feat_type].select(dim=-3, index=channel)
     # apply log scale
     if bool(log):
         raw_feat = (raw_feat + self.eps).log()   
     feats = [raw_feat.contiguous()]
     # apply delta for features
     for _ in range(int(delta)):
         feats.append(compute_deltas(feats[-1]))
     feats = torch.cat(feats, dim=-2)
     # apply cmvn
     if bool(cmvn):
         feats = (feats - feats.mean(dim=-1, keepdim=True)) / (feats.std(dim=-1, keepdim=True) + self.eps)
     return feats
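Here each delta order is appended along the feature axis (dim=-2), so delta=2 triples the number of feature bins. A self-contained sketch of that part (names and shapes are illustrative, not from the source):

    import torch
    from torchaudio.functional import compute_deltas

    raw_feat = torch.randn(80, 300)                        # (freq, time) log features
    feats = [raw_feat]
    for _ in range(2):                                     # first- and second-order deltas
        feats.append(compute_deltas(feats[-1]))
    feats = torch.cat(feats, dim=-2)                       # concatenate along the feature axis
    print(feats.shape)                                     # torch.Size([240, 300])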
Example no. 9
 def select_feat(variables, feat_type, channel=0, log=False, delta=0, cmvn=False):
     raw_feat = variables[feat_type].select(dim=-3, index=channel)
     # apply log scale
     if bool(log):
         raw_feat = (raw_feat + self.eps).log()   
     feats = [raw_feat.contiguous()]
     # apply delta for features
     for _ in range(int(delta)):
         feats.append(compute_deltas(feats[-1]))
     feats = torch.cat(feats, dim=-2)
     downsample_rate = wavs.size(-1) / feats.size(-1)
     feats_len = [round(length / downsample_rate) for length in wavs_len]
     # apply cmvn
     if bool(cmvn):
         cmvn_feats = []
         for feat, feat_len in zip(feats, feats_len):
             feat = feat[:, :feat_len]
             cmvn_feat = (feat - feat.mean(dim=-1, keepdim=True)) / (feat.std(dim=-1, keepdim=True) + self.eps)
             cmvn_feats.append(cmvn_feat.transpose(-1, -2))
         feats = pad_sequence(cmvn_feats, batch_first=True).transpose(-1, -2)
     return feats
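In the cmvn branch above, each normalized utterance is transposed to (time, freq) so that pad_sequence can pad along the time axis, then transposed back. A standalone sketch of that padding trick (the shapes are illustrative):

    import torch
    from torch.nn.utils.rnn import pad_sequence

    # Two utterances with different numbers of valid frames, each (freq, time):
    a, b = torch.randn(80, 300), torch.randn(80, 250)
    # pad_sequence pads along the first dimension, so move time to the front first:
    padded = pad_sequence([a.transpose(-1, -2), b.transpose(-1, -2)],
                          batch_first=True).transpose(-1, -2)
    print(padded.shape)                                    # torch.Size([2, 80, 300])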
Example no. 10
 def test_one_channel(self):
     specgram = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])
     expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5]]])
     computed = F.compute_deltas(specgram, win_length=3)
     self.assertEqual(computed, expected)
Example no. 11
 def func(tensor):
     win_length = 2 * 7 + 1
     return F.compute_deltas(tensor, win_length=win_length)
Example no. 12
 def encodes(self, sg: AudioSpectrogram):
     delta = compute_deltas(sg, win_length=self.width, mode=self.mode)
     delta2 = compute_deltas(delta, win_length=self.width, mode=self.mode)
     sg.data = torch.cat([sg, delta, delta2], dim=1).contiguous()
     return sg
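A self-contained sketch of the same stacking on a plain tensor (the AudioSpectrogram wrapper is left out; the width and mode values stand in for the self.width / self.mode attributes used above):

    import torch
    from torchaudio.functional import compute_deltas

    sg = torch.randn(1, 128, 400)                          # (channel, freq, time)
    width, mode = 5, "replicate"                           # assumed values for self.width / self.mode
    delta = compute_deltas(sg, win_length=width, mode=mode)
    delta2 = compute_deltas(delta, win_length=width, mode=mode)
    out = torch.cat([sg, delta, delta2], dim=1).contiguous()
    print(out.shape)                                       # torch.Size([1, 384, 400])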
Example no. 13
 def test_two_channels(self):
     specgram = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]])
     expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5], [0.5, 1.0, 1.0, 0.5]]])
     computed = F.compute_deltas(specgram, win_length=3)
     torch.testing.assert_allclose(computed, expected)
Example no. 14
    def __getitem__(self, index):
        '''
        Generates one sample of data.
        In "train" mode, select a (60 / num_snippets)-second crop from the
        60-second spectrogram; otherwise return the whole 60 seconds.
        '''
        rec_id = self.recording_ids[index]
        snip_num = self.num_snippets
        snip_length = 60/snip_num

        X = torch.load(os.path.join(source_path, spectrograms_path,
                                    f'{rec_id}_mel.pt'))
        y = torch.FloatTensor(self.labels[rec_id])

        if self.mode == 'train':
            rnd_cropping = bool(self.rng.binomial(1, self.rdm_cropping_prob))

            t_start = self.time_interval_starts[index]
            t_end = self.time_interval_ends[index]
            t_length = t_end - t_start
            # cut off last frame to get even number (e.g. 1921 -> 1920)
            num_time_frames = X.shape[-1]
            # snippet length (in seconds) equivalent in frames
            # e.g. 1920 frames / 6 snippets = 320 frames/snippet
            frames_snip_length = int(num_time_frames/snip_num)

            # randomly cropping snippet length sec from spectrogram and
            # adjust labels if necessary
            if rnd_cropping:
                crop_start = random.uniform(0, 60-snip_length)
                start_frame_index = int(crop_start / 60 * num_time_frames)
                end_frame_index = start_frame_index + frames_snip_length
                buffer = t_length*0.1
                # adjusting labels
                if not (self.in_range(t_start, crop_start,
                                      crop_start + snip_length - buffer)
                        or self.in_range(t_end, crop_start + buffer,
                                         crop_start + snip_length)):
                    y = torch.zeros_like(y)
            # cropping snippet length seconds around given time interval
            # [t_start, t_end]
            else:  # no random cropping
                if t_length < snip_length:
                    # avoiding cropping over limits (0sec/60sec)
                    max_moving_range = min(t_start, snip_length - t_length)
                    min_moving_range = max(0, t_start - (60-snip_length))
                    applied_moving_range = random.uniform(min_moving_range,
                                                          max_moving_range)
                    start_frame_index = int((t_start - applied_moving_range)
                                            / 60 * num_time_frames)
                    end_frame_index = start_frame_index + frames_snip_length
                else:
                    max_moving_range = t_length - snip_length
                    min_moving_range = 0
                    applied_moving_range = random.uniform(min_moving_range,
                                                          max_moving_range)
                    start_frame_index = int((t_start + applied_moving_range)
                                            / 60 * num_time_frames)
                    end_frame_index = start_frame_index + frames_snip_length

            X = X[:, start_frame_index:end_frame_index]

        if self.deltas and self.num_channels == 3:
            deltas_1 = compute_deltas(X)
            deltas_2 = compute_deltas(deltas_1)
            X = torch.stack([X, deltas_1, deltas_2])
        else:
            X = torch.stack([X]*self.num_channels)

        if self.normalize:
            X = self.min_max_normalization(X)
        return X, y
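The seconds-to-frames arithmetic above maps a crop window given in seconds onto spectrogram frame indices. A small worked example using the numbers from the comments (1920 frames, 6 snippets; the crop start is made up):

    num_time_frames = 1920                                 # frames in the 60 s spectrogram
    snip_num = 6
    snip_length = 60 / snip_num                            # 10.0 s per snippet
    frames_snip_length = int(num_time_frames / snip_num)   # 320 frames per snippet
    crop_start = 12.5                                      # assumed random start, in seconds
    start_frame_index = int(crop_start / 60 * num_time_frames)   # 400
    end_frame_index = start_frame_index + frames_snip_length     # 720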