Example 1
def synthesize_pure_tone_segment(duration_s: float,
                                 fs: float,
                                 ft: float,
                                 dBFS=-6.0,
                                 sample_width=2) -> audiosegment.AudioSegment:
    """
    Synthesize a pure tone of `ft` Hz, sampled at `fs` samples per second, of duration `duration_s` seconds.
    Return an AudioSegment.
    """
    def dtype(arr):
        if sample_width == 1:
            return np.int8(arr)
        elif sample_width == 2:
            return np.int16(arr)
        elif sample_width == 4:
            return np.int32(arr)
        else:
            raise ValueError(
                "Sample width of {} is not allowed.".format(sample_width))

    # Scale the unit-amplitude tone up before casting to an integer dtype.
    pure_tone = 100 * synthesize_pure_tone_array(duration_s, fs, ft)
    pure_seg = audiosegment.from_numpy_array(dtype(pure_tone), fs)
    # Shift the gain so the segment sits at the requested dBFS level.
    curdb = pure_seg.dBFS
    pure_seg += (dBFS - curdb)
    return pure_seg
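A minimal usage sketch of the helper above (it assumes synthesize_pure_tone_array is importable from the same module; the tone parameters and file name are illustrative):

# Hypothetical call: one second of a 440 Hz tone at 16 kHz, normalized to -6 dBFS.
seg = synthesize_pure_tone_segment(duration_s=1.0, fs=16000, ft=440.0)
seg.export("tone_440hz.wav", format="wav")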
Example 2
    def test_stereo_to_and_from_numpy_array(self):
        """
        Tests that we can convert a stereo file to a numpy array and then back again
        without any changes.
        """
        before = audiosegment.from_file("stereo_furelise.wav")
        arr = before.to_numpy_array()
        after = audiosegment.from_numpy_array(arr, before.frame_rate)

        self.assertEqual(before.sample_width, after.sample_width)
        self.assertEqual(before.duration_seconds, after.duration_seconds)
        self.assertEqual(before.channels, after.channels)
        self.assertSequenceEqual(before.raw_data, after.raw_data)
        self.assertTrue(common.is_playable(after))
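The same invariant outside the test harness, as a minimal sketch (it assumes a local WAV file; the file name is a placeholder, and the shape comment assumes stereo input):

import audiosegment

seg = audiosegment.from_file("stereo_furelise.wav")
arr = seg.to_numpy_array()                     # shape (n_samples, n_channels)
round_trip = audiosegment.from_numpy_array(arr, seg.frame_rate)
assert round_trip.raw_data == seg.raw_data     # byte-exact round trip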
Example 3
    def test_stereo_from_numpy_array(self):
        """
        Test that we can create and play a stereo numpy array.
        """
        duration_s = 2.0
        fs = 16000
        tone_one = 100 * common.synthesize_pure_tone_array(
            duration_s, fs, ft=3200)
        tone_two = 100 * common.synthesize_pure_tone_array(
            duration_s, fs, ft=2800)
        # Stack the tones as columns so the array is (n_samples, 2), one
        # channel per column; a plain reshape((-1, 2)) of the stacked rows
        # would interleave consecutive samples of the same tone instead.
        stereo_arr = np.column_stack([tone_one, tone_two]).astype(np.int16)
        stereo_seg = audiosegment.from_numpy_array(stereo_arr, fs)
        self.assertTrue(common.is_playable(stereo_seg))
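A self-contained version of the same construction, assuming synthesize_pure_tone_array is roughly a unit-amplitude sine generator (a hedged guess from its name and usage above):

import numpy as np
import audiosegment

fs = 16000
t = np.arange(int(fs * 2.0)) / fs
left = np.int16(100 * np.sin(2 * np.pi * 3200 * t))   # stands in for tone_one
right = np.int16(100 * np.sin(2 * np.pi * 2800 * t))  # stands in for tone_two
stereo = np.column_stack([left, right])               # shape (n_samples, 2)
seg = audiosegment.from_numpy_array(stereo, fs)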
Example 4
    def _test_create_file_from_n_segments(self,
                                          mono: audiosegment.AudioSegment,
                                          nchannels: int):
        """
        Create a single segment and test it against expected, from multiple segments.
        """
        arr = mono.to_numpy_array()
        arr_multi = np.tile(arr, (nchannels, 1)).T
        multi = audiosegment.from_numpy_array(arr_multi, mono.frame_rate)

        self.assertEqual(multi.channels, nchannels)
        self.assertEqual(multi.duration_seconds, mono.duration_seconds)
        self.assertEqual(multi.frame_rate, mono.frame_rate)

        return multi
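The np.tile(...).T idiom above duplicates the mono samples into identical columns, which is the (n_samples, n_channels) layout from_numpy_array expects; a tiny illustration:

import numpy as np

arr = np.int16([1, 2, 3])        # three mono samples
print(np.tile(arr, (2, 1)).T)
# [[1 1]
#  [2 2]
#  [3 3]]   <- shape (3, 2): each row is one frame, one column per channel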
Example 5
    def test_mono_to_and_from(self):
        """
        Test that a mono file converts to a numpy array and back again without any change.
        """
        seg = audiosegment.from_file("furelise.wav")

        for width in (1, 2, 4):
            with self.subTest(width):
                seg = seg.resample(sample_width=width)
                arr = seg.to_numpy_array()
                seg = audiosegment.from_numpy_array(arr, seg.frame_rate)
                nsamples = int(round(seg.frame_rate * seg.duration_seconds))

                self.assertEqual(seg.sample_width,
                                 self._look_up_sample_width(arr.dtype))
                self.assertEqual(arr.shape, (nsamples, ))
                self._check_underlying_data(seg, arr)
                self.assertTrue(common.is_playable(seg))
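resample(sample_width=...) is what drives the dtype of the round trip above; a minimal sketch of the width-to-dtype pairing the loop exercises, assuming to_numpy_array mirrors the mapping shown in Example 1 (the file name is a placeholder):

import audiosegment

seg = audiosegment.from_file("furelise.wav")
for width, dtype_name in ((1, "int8"), (2, "int16"), (4, "int32")):
    resampled = seg.resample(sample_width=width)
    assert resampled.to_numpy_array().dtype.name == dtype_name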
Example 6

def helper(data, name, hp, store_path):
    """Save spectrogram images and a reconstructed waveform for `data` under `store_path`."""
    if not os.path.exists(store_path):
        os.makedirs(store_path, exist_ok=True)
    spectrogram = plot_spectrogram_to_numpy(data[0].cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, name + '.png'),
               spectrogram.transpose((1, 2, 0)))
    # Spectrogram inversion here is gradient-based, so autograd must be enabled.
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(data[0], iters=2000)
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, 'Final ' + name + '.png'),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    # Scale to the int16 range; 32767 avoids wrapping the peak sample,
    # and np.abs guards against a waveform whose largest peak is negative.
    waveform *= 32767 / np.abs(waveform).max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(store_path, name + '.wav'), format='wav')
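The unsqueeze(-1) above gives the waveform an explicit single channel before conversion; from_numpy_array also accepts the flat 1-D form used in Example 7. A minimal sketch of the two mono layouts, which should produce byte-identical segments (one channel either way):

import numpy as np
import audiosegment

mono = np.zeros(16000, dtype=np.int16)                        # 1 s of silence at 16 kHz
flat = audiosegment.from_numpy_array(mono, 16000)             # shape (n_samples,)
column = audiosegment.from_numpy_array(mono[:, None], 16000)  # shape (n_samples, 1)
assert flat.raw_data == column.raw_data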
Example 7
    def test_mono_from_numpy_array(self):
        """
        Test that creating a mono audio segment from a numpy array creates
        what we expect.
        """
        duration_s = 3.5
        fs = 32000
        ftone = 4000
        arr = np.int16(
            100 * common.synthesize_pure_tone_array(duration_s, fs, ftone))
        seg = audiosegment.from_numpy_array(arr, fs)

        sample_width = self._look_up_sample_width(arr.dtype)
        nsamples = int(round(seg.frame_rate * seg.duration_seconds))

        self.assertEqual(seg.sample_width, sample_width)
        self.assertEqual(nsamples, len(arr))
        self.assertEqual(arr.shape, (nsamples, ))
        self._check_underlying_data(seg, arr)
        self.assertTrue(common.is_playable(seg))
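_look_up_sample_width is a test helper not shown on this page; judging from how it is used here and from the dtype mapping in Example 1, it is presumably the inverse width lookup, sketched below as a hypothetical:

import numpy as np

# Hypothetical stand-in for the test helper: bytes per sample for each dtype.
def look_up_sample_width(dtype) -> int:
    return {np.dtype(np.int8): 1,
            np.dtype(np.int16): 2,
            np.dtype(np.int32): 4}[np.dtype(dtype)]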
Example 8
def store(generated, path, hp, idx, class_label):
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(generated,
               os.path.join(path, '{}_{}.pt'.format(class_label, idx)))
    spectrogram = plot_spectrogram_to_numpy(
        generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join(path, '{}_{}.png'.format(class_label, idx)),
               spectrogram.transpose((1, 2, 0)))
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(path, 'Final {}_{}.png'.format(class_label, idx)),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    # Scale to the int16 range; 32767 avoids wrapping the peak sample.
    waveform *= 32767 / np.abs(waveform).max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(path, '{}_{}.wav'.format(class_label, idx)),
                 format='wav')
Example 9
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        generated = model.sample(args.input)

    os.makedirs('temp', exist_ok=True)
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join('temp', args.name + '.png'), spectrogram.transpose((1, 2, 0)))

    waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'), wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    # Scale to the int16 range; 32767 avoids wrapping the peak sample.
    waveform *= 32767 / np.abs(waveform).max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(
        waveform,
        framerate=hp.audio.sr
    )
    audio.export(os.path.join('temp', args.name + '.wav'), format='wav')
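Unlike Examples 6 and 8, this script calls Reconstruct(hp).inverse outside an explicit torch.enable_grad() block; that works because torch.no_grad() only disables autograd inside its with block. A tiny demonstration:

import torch

with torch.no_grad():
    assert not torch.is_grad_enabled()   # disabled inside the block
assert torch.is_grad_enabled()           # restored once the block exits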
Example 10

# doing silence removal; only a threshold under 0.01 works
voiced_feat = []
duration = 0.1  # 0.1, 0.06, 0.01
threshold = 0.1  # 0.1, 0.07, 0.01

for i in range(len(speech)):
    x_head = data2[i]['signal']
    #x_head = speech[i]
    seg = audiosegment.from_numpy_array(speech[i], framerate)
    seg = seg.filter_silence(duration_s=duration,
                             threshold_percentage=threshold)
    st_features = calculate_features(seg.to_numpy_array(), framerate, None)
    #st_features = calculate_features(x_head, framerate, None)
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)
    voiced_feat.append(st_features.T)
    if i % 100 == 0:
        print(i)

voiced_feat = np.array(voiced_feat)
print(voiced_feat.shape)
np.save('featAS/voiced_feat_file_01_01.npy', voiced_feat)
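A quick check that the saved features load back as expected; the middle dimension follows from maxlen=100 and the transpose above, while the feature count depends on what calculate_features returns:

import numpy as np

feats = np.load('featAS/voiced_feat_file_01_01.npy')
print(feats.shape)   # likely (n_files, 100, n_features)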