def test_loadsound_librosa_mp3():
    '''Load the MP3 fixture with ``use_scipy=False``; check samples and rate.'''
    audio, rate = sp.loadsound(test_mp3, remove_dc=False, use_scipy=False)
    first_five = np.array([0.0, -1.5258789e-05, 0.0, 0.0, 0.0])
    # hint printed before the assertions so it shows up in the failure output
    print('\nIF ERROR: could be due to update in Librosa from 0.7.2 to 0.8.0')
    assert np.allclose(audio[:5], first_five)
    assert rate == 44100
def test_loadsound_librosa_flac():
    '''Load the FLAC fixture with ``use_scipy=False``; check samples and rate.'''
    audio, rate = sp.loadsound(test_flac, remove_dc=False, use_scipy=False)
    first_five = np.array([0.0, 0.0, 0.0, 0.0, -3.0517578e-05])
    assert np.allclose(audio[:5], first_five)
    assert rate == 44100
Example #3
0
def test_resample_audio_sr22050_to_16000():
    '''One second loaded at 22050 Hz resamples to 16000 samples at 16000 Hz.'''
    source_sr = 22050
    target_sr = 16000
    one_second, loaded_sr = sp.loadsound(test_audiofile,
                                         dur_sec=1,
                                         sr=source_sr)
    assert loaded_sr == source_sr
    assert len(one_second) == source_sr
    resampled, resampled_sr = sp.dsp.resample_audio(
        one_second, sr_original=loaded_sr, sr_desired=target_sr)
    assert resampled_sr == target_sr
    assert len(resampled) == target_sr
def test_loadsound_librosa_aiff_sr16000():
    '''Load the AIFF fixture at 16000 Hz with ``use_scipy=False``.'''
    target_sr = 16000
    audio, rate = sp.loadsound(test_aiff,
                               remove_dc=False,
                               use_scipy=False,
                               sr=target_sr)
    reference = np.array([
        0.05152914,
        0.03653815,
        -0.0083929,
        -0.0207656,
        -0.03038501,
    ])
    assert np.allclose(audio[:5], reference)
    assert rate == target_sr
def test_savesound_filename_wav2flac():
    '''Saving mono WAV data under a ``.flac`` filename yields a FLAC file.

    The file written by `sp.savesound` is always removed afterwards, even
    when an assertion fails, so a failing run leaves no stray file in
    `example_dir`.
    '''
    y, sr = sp.loadsound(test_wav_mono)
    f = sp.utils.string2pathlib(test_wav_mono)
    format_type = 'FLAC'
    audiofile_new = example_dir.joinpath(f.stem + '.' + format_type.lower())
    audiofile_corrected = sp.savesound(audiofile_new, y, sr)
    try:
        # the context manager closes the handle before the file is deleted
        with sf.SoundFile(audiofile_corrected) as soundobject:
            assert audiofile_corrected.suffix[1:].lower() == format_type.lower()
            assert soundobject.format == format_type
    finally:
        os.remove(audiofile_corrected)
Example #6
0
def audiofile_length_match(filename1, filename2):
    '''Checks that two audiofiles have the same length.
    
    This may be useful if you have clean and noisy audiofiles that 
    should be the same length.
    
    Both files are loaded with `sp.loadsound`. If their sample rates 
    differ, the second signal is resampled to the sample rate of the 
    first before the number of samples is compared.
    
    Parameters
    ----------
    filename1 : str or pathlib.PosixPath
        The path to first audio file.
    filename2 : str or pathlib.PosixPath
        The path to second audio file.
    
    Returns
    -------
    bool : True if they match, False if not.
    
    Warning
    -------
    UserWarning 
        If the sample rate of the audio files don't match.
    UserWarning
        If the length of the files don't match.
    '''
    # imported once at function level so both branches share the same name
    import warnings

    y1, sr1 = sp.loadsound(filename1)
    y2, sr2 = sp.loadsound(filename2)
    if sr1 != sr2:
        message = '\nWARNING: Sample rates do not match: '+\
            '\n{} has sr {}'.format(filename1, sr1)+\
            '\n{} has sr {}.'.format(filename2, sr2)
        warnings.warn(message)
        # bring the second signal to the first signal's sample rate so that
        # the sample counts are comparable
        y2, sr2 = sp.dsp.resample_audio(y2, sr_original=sr2, sr_desired=sr1)
    # sanity check on the resampling result
    assert sr1 == sr2
    if len(y1) != len(y2):
        message = '\nWARNING: audiofile length mismatch.'+\
            '\nLength of {}: {}'.format(filename1, len(y1))+\
            '\nLength of {}: {}'.format(filename2, len(y2))
        warnings.warn(message)
        return False
    return True
def test_loadsound_mono_uselibrosa_False():
    '''Load the stereo WAV fixture via scipy; expect a 1-D signal at 16 kHz.'''
    file_sr = 16000  # sr of the audiofile (no default)
    audio, rate = sp.loadsound(test_wav_stereo,
                               remove_dc=False,
                               use_scipy=True)
    head = audio[:5]
    reference = np.array([0.06140351] * 5)
    assert np.allclose(head, reference)
    assert head.shape == (len(reference), )
    assert rate == file_sr
def test_loadsound_mono_sr48000_uselibrosa_False():
    '''Load the stereo WAV fixture as mono at 48000 Hz via scipy.'''
    target_sr = 48000
    audio, rate = sp.loadsound(test_wav_stereo,
                               sr=target_sr,
                               mono=True,
                               remove_dc=False,
                               use_scipy=True)
    reference = np.array([
        0.07632732,
        0.07633357,
        0.07633357,
        0.07632732,
        0.07632107,
    ])
    assert np.allclose(audio[:5], reference)
    assert rate == target_sr
def test_loadsound_librosa_wav_dur1_sr22050():
    '''Load one second at 22050 Hz with ``use_scipy=False``.'''
    target_sr = 22050
    # use librosa to load file
    audio, rate = sp.loadsound(test_wav_stereo,
                               sr=target_sr,
                               dur_sec=1,
                               remove_dc=False,
                               use_scipy=False)

    assert np.allclose(audio[:5], np.zeros(5))
    assert rate == target_sr
    # one second of audio holds exactly `rate` samples
    assert len(audio) == rate
def test_loadsound_librosa_wav_dur1_sr22050_stereo():
    '''Load one second of stereo audio at 22050 Hz with ``use_scipy=False``.'''
    target_sr = 22050
    # use librosa to load file
    audio, rate = sp.loadsound(test_wav_stereo,
                               sr=target_sr,
                               dur_sec=1,
                               mono=False,
                               remove_dc=False,
                               use_scipy=False)
    silent_frames = np.zeros((3, 2))
    assert np.allclose(audio[:3], silent_frames)
    assert rate == target_sr
    assert audio.shape == (22050, 2)
def test_loadsound_stereo_sr48000_uselibrosa_False():
    '''Load the stereo WAV fixture at 48000 Hz via scipy, keeping 2 channels.'''
    target_sr = 48000
    audio, rate = sp.loadsound(test_wav_stereo,
                               mono=False,
                               sr=target_sr,
                               remove_dc=False,
                               use_scipy=True)
    reference = np.array([
        [0.07632732, 0.07632732],
        [0.07633357, 0.07628564],
        [0.07633357, 0.07628563],
    ])
    head = audio[:3]
    assert np.allclose(head, reference)
    assert head.shape == reference.shape
    assert rate == target_sr
def test_loadsound_stereo_dur1_uselibrosa_False():
    '''Load one second of the stereo WAV fixture via scipy at its own rate.'''
    file_sr = 16000  # sr of the audiofile (no default)
    audio, rate = sp.loadsound(test_wav_stereo,
                               dur_sec=1,
                               mono=False,
                               remove_dc=False,
                               use_scipy=True)
    reference = np.array([[0.06140351, 0.06140351]] * 3)
    head = audio[:3]
    assert np.allclose(head, reference)
    assert head.shape == reference.shape
    assert rate == file_sr
    # one second of audio holds exactly `file_sr` frames
    assert len(audio) == file_sr
Example #13
0
    def get_samples(self, audiofile, dur_sec=None):
        """Load an audio file and hand its samples to ``set_volume``.

        The file is loaded with `sp.loadsound` at `self.sr`; the samples are
        then passed to `self.set_volume` together with `self.max_vol`.

        Parameters
        ----------
        audiofile : str
            Path and name of audiofile to be loaded
        dur_sec : int, float optional
            Max length of time in seconds (default None)

        Returns
        -------
        samples : ndarray
            Array containing signal amplitude values in time domain
        """
        # the sample rate returned by loadsound is not needed here
        signal, _ = sp.loadsound(audiofile, self.sr, dur_sec=dur_sec)
        self.set_volume(signal, max_vol=self.max_vol)
        return signal
Example #14
0
def test_get_feats_dur_sec_zeropad_False_mfcc():
    '''MFCC extraction without zeropadding yields the computed frame count.'''
    settings = dict(dur_sec=0.5,
                    win_size_ms=20,
                    percent_overlap=0.5,
                    zeropad=False)
    signal, sr = sp.loadsound(test_audiofile, mono=True)
    feats = sp.feats.get_feats(signal,
                               sr=sr,
                               feature_type='mfcc',
                               **settings)
    # derive the expected number of frames from the same settings
    total_samples = int(sr * settings['dur_sec'])
    frame_length = sp.dsp.calc_frame_length(settings['win_size_ms'], sr)
    overlap = int(frame_length * settings['percent_overlap'])
    expected_frames = sp.dsp.calc_num_subframes(
        total_samples,
        frame_length=frame_length,
        overlap_samples=overlap,
        zeropad=settings['zeropad'])
    assert len(feats) == expected_frames
Example #15
0
def test_get_feats_signal_mono_default_2channels_no_change():
    '''With no ``mono`` argument, 'signal' features keep the stereo shape.'''
    stereo, sr = sp.loadsound(test_audiofile, mono=False)
    feats = sp.feats.get_feats(stereo, feature_type='signal', sr=sr)
    assert feats.shape == stereo.shape
    assert stereo.shape[1] == 2
Example #16
0
def test_get_feats_signal_mono_True_2channels():
    '''With ``mono=True``, 'signal' features of stereo input are 1-D.'''
    stereo, sr = sp.loadsound(test_audiofile, mono=False)
    feats = sp.feats.get_feats(stereo,
                               feature_type='signal',
                               mono=True,
                               sr=sr)
    assert len(feats.shape) == 1
    assert stereo.shape[1] == 2
Example #17
0
    except SyntaxError:
        pass

#########################################################
# For the purposes of plotting, let's use some of the settings defined.
# (`feat_settings` is not defined in this part of the script; it is assumed
# to have been set up earlier.)
feature_type = feat_settings['feature_type']
sr = feat_settings['sr']

######################################################
# Provide new audio for the denoiser to denoise!
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#########################################################
# We'll use sample speech from the soundpy repo, loaded at the sample rate
# taken from the settings above:
speech = sp.string2pathlib('{}audiodata/python.wav'.format(sp_dir))
s, sr = sp.loadsound(speech, sr=sr)

#########################################################
# Let's add some white noise at a signal-to-noise ratio (SNR) of 10:
s_n = sp.augment.add_white_noise(s, sr=sr, snr=10)

##############################################################
# What does the noisy audio sound like?
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ipd.Audio(s_n, rate=sr)

##############################################################
# What does the noisy audio look like?
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sp.plotsound(s_n, sr=sr, feature_type='signal', subprocess=True)
def test_loadsound_scipy_sr_None():
    '''With ``sr=None`` the scipy loader reports 16000 Hz for the fixture.'''
    _, rate = sp.loadsound(test_wav_stereo, use_scipy=True, sr=None)
    assert rate == 16000
##########################################################
# Noise sample:
noise_sample = '{}audiodata/background_samples/cafe.wav'.format(sp_dir)
noise_sample = sp.utils.string2pathlib(noise_sample)
# As a pathlib object, the filename without its extension is available
# via `.stem`:
noise = noise_sample.stem
noise

##########################################################
# Hear Clean Speech
# ~~~~~~~~~~~~~~~~~
# I'm using a higher sample rate here as calculating SNR
# performs best upwards of 44100 Hz.
# (`speech_sample` is not defined in this part of the script; it is assumed
# to have been set up earlier.)
sr = 44100
s, sr = sp.loadsound(speech_sample, sr=sr)
ipd.Audio(s, rate=sr)

##########################################################
# Hear Noise
# ~~~~~~~~~~
# The noise is loaded at the same sample rate as the speech.
n, sr = sp.loadsound(noise_sample, sr=sr)
ipd.Audio(n, rate=sr)

##########################################################
# Hear Signal-to-Noise Ratio 20
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Note that the file paths, not the loaded samples, are passed here.
noisyspeech_20snr, snr20 = sp.dsp.add_backgroundsound(speech_sample,
                                                      noise_sample,
                                                      sr=sr,
                                                      snr=20)
##########################################################
# Designate the path relevant for accessing audiodata.
# Note: the speech and sound come with the soundpy repo.
sp_dir = '../../../'

##########################################################
# Speech sample:
speech = '{}audiodata/python.wav'.format(sp_dir)
speech = sp.utils.string2pathlib(speech)

################################################################
# Hear and see speech
# ~~~~~~~~~~~~~~~~~~~

# Load the speech at 44100 Hz and play it back.
sr = 44100
f, sr = sp.loadsound(speech, sr=sr)
ipd.Audio(f,rate=sr)

##########################################################
# Plot the short-time Fourier transform (STFT) of the speech:
sp.plotsound(f, sr=sr, feature_type='stft', title='Female Speech "Python"')

##########################################################
# Change Speed
# ~~~~~~~~~~~~

##########################################################
# Let's increase the speed by 15%:

fast = sp.augment.speed_increase(f, sr=sr, perc = 0.15) 

##########################################################
def test_savesound_default_overwrite():
    '''Overwriting a WAV file with its own data keeps the file format.

    The format of `test_wav_mono` is read and the handle closed before the
    file is overwritten, so no open handle is held on a file that is being
    rewritten, and no handle is left open when the test ends.
    '''
    y, sr = sp.loadsound(test_wav_mono)
    with sf.SoundFile(test_wav_mono) as soundobject1:
        format_before = soundobject1.format
    filename = sp.savesound(test_wav_mono, y, sr, overwrite=True)
    with sf.SoundFile(filename) as soundobject2:
        format_after = soundobject2.format
    assert format_before == format_after
Example #22
0
def augment_features(
    sound,
    sr,
    add_white_noise=False,
    snr=[5, 10, 20],
    speed_increase=False,
    speed_decrease=False,
    speed_perc=0.15,
    time_shift=False,
    shufflesound=False,
    num_subsections=3,
    harmonic_distortion=False,
    pitch_increase=False,
    pitch_decrease=False,
    num_semitones=2,
    vtlp=False,
    bilinear_warp=True,
    augment_settings_dict=None,
    random_seed=None,
):
    '''Applies the enabled augmentations to audio and names what was applied.

    Enabled augmentations are applied one after another in this order:
    white noise, speed increase or decrease, time shift, shuffle, harmonic
    distortion, pitch increase or decrease. If no `augment_settings_dict`
    is given, the keyword defaults of this function are applied. The result
    is finally passed to `sp.dsp.set_signal_length` with the length of the
    input signal.

    Parameters
    ----------
    sound : np.ndarray or audio file path
        The audio to augment. Anything that is not an `np.ndarray` is loaded
        with `sp.loadsound` at sample rate `sr`.
    sr : int
        Sample rate passed to the loader and to every augmentation.
    add_white_noise : bool
        Whether to add white noise.
    snr : list
        Candidate signal-to-noise ratios; one is chosen at random with
        `np.random.choice`. Only used if `augment_settings_dict` is None.
        (The default list is never modified.)
    speed_increase, speed_decrease : bool
        Whether to change the speed. Mutually exclusive.
    speed_perc : float
        Passed as `perc` to the speed augmentation if
        `augment_settings_dict` is None.
    time_shift : bool
        Whether to apply `sp.augment.time_shift`.
    shufflesound : bool
        Whether to apply `sp.augment.shufflesound`.
    num_subsections : int
        Passed to the shuffle augmentation if `augment_settings_dict` is None.
    harmonic_distortion : bool
        Whether to apply `sp.augment.harmonic_distortion`.
    pitch_increase, pitch_decrease : bool
        Whether to change the pitch. Mutually exclusive.
    num_semitones : int
        Passed to the pitch augmentation if `augment_settings_dict` is None.
    vtlp : bool
        Ignored here; vtlp does not return sample data and is handled
        outside of this function.
    bilinear_warp : bool
        Not used by this function.
    augment_settings_dict : dict, optional
        Maps augmentation names (e.g. 'add_white_noise', 'speed_increase')
        to dicts of keyword arguments. If given, it must contain an entry
        for each enabled augmentation that takes settings. The
        'add_white_noise' entry must contain 'snr', which may be a single
        value, a list to choose from at random, or a string handed to
        `sp.utils.restore_dictvalue`. The dict is not modified.
    random_seed : int, optional
        Not used by this function.

    Returns
    -------
    samples_augmented : np.ndarray
        The augmented samples, set to the length of the input.
    augmentation : str
        Concatenated labels of the applied augmentations, e.g.
        '_whitenoise10SNR_randtimeshift'. Empty if nothing was applied.

    Raises
    ------
    ValueError
        If both `speed_increase` and `speed_decrease`, or both
        `pitch_increase` and `pitch_decrease`, are set.
    '''
    if augment_settings_dict is not None:
        aug_settings = dict(augment_settings_dict)
    else:
        aug_settings = None

    if speed_increase and speed_decrease:
        raise ValueError('Cannot have both speed_increase and speed_decrease'+\
            ' as augmentation options. Set just one to True.')
    if pitch_increase and pitch_decrease:
        raise ValueError('Cannot have both pitch_increase and pitch_decrease'+\
            ' as augmentation options. Set just one to True.')
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    # work on copies so the caller's array is left untouched
    samples = data.copy()
    samples_augmented = samples.copy()
    augmentation = ''
    if add_white_noise:
        # allow default settings to be used/overwritten
        if aug_settings is not None:
            # read into a local so the caller's settings are not modified
            snr_setting = aug_settings['add_white_noise']['snr']
            if isinstance(snr_setting, str):
                snr_setting = sp.utils.restore_dictvalue(snr_setting)
            if isinstance(snr_setting, list):
                # if a list of snr values: choose randomly
                snr = np.random.choice(snr_setting)
            else:
                # a single value is used as is (previously it was ignored
                # and the default list was passed on instead)
                snr = snr_setting
        else:
            snr = np.random.choice(snr)
        samples_augmented = sp.augment.add_white_noise(samples_augmented,
                                                       sr=sr,
                                                       snr=snr)
        augmentation += '_whitenoise{}SNR'.format(snr)

    if speed_increase:
        if aug_settings is not None:
            kwargs_aug = aug_settings['speed_increase']
        else:
            kwargs_aug = {'perc': speed_perc}
        samples_augmented = sp.augment.speed_increase(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_speedincrease{}'.format(kwargs_aug['perc'])

    elif speed_decrease:
        if aug_settings is not None:
            kwargs_aug = aug_settings['speed_decrease']
        else:
            kwargs_aug = {'perc': speed_perc}
        samples_augmented = sp.augment.speed_decrease(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_speeddecrease{}'.format(kwargs_aug['perc'])

    if time_shift:
        samples_augmented = sp.augment.time_shift(samples_augmented, sr=sr)
        augmentation += '_randtimeshift'

    if shufflesound:
        if aug_settings is not None:
            kwargs_aug = aug_settings['shufflesound']
        else:
            kwargs_aug = {'num_subsections': num_subsections}
        samples_augmented = sp.augment.shufflesound(samples_augmented,
                                                    sr=sr,
                                                    **kwargs_aug)
        augmentation += '_randshuffle{}sections'.format(
            kwargs_aug['num_subsections'])

    if harmonic_distortion:
        samples_augmented = sp.augment.harmonic_distortion(samples_augmented,
                                                           sr=sr)
        augmentation += '_harmonicdistortion'

    if pitch_increase:
        if aug_settings is not None:
            kwargs_aug = aug_settings['pitch_increase']
        else:
            kwargs_aug = {'num_semitones': num_semitones}
        samples_augmented = sp.augment.pitch_increase(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_pitchincrease{}semitones'.format(
            kwargs_aug['num_semitones'])

    elif pitch_decrease:
        if aug_settings is not None:
            kwargs_aug = aug_settings['pitch_decrease']
        else:
            kwargs_aug = {'num_semitones': num_semitones}
        samples_augmented = sp.augment.pitch_decrease(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_pitchdecrease{}semitones'.format(
            kwargs_aug['num_semitones'])

    # all augmentation techniques return sample data except for vtlp
    # therefore vtlp will be handled outside of this function (returns stft or powspec)
    if vtlp:
        pass

    # augmentations such as speed changes can alter the number of samples
    samples_augmented = sp.dsp.set_signal_length(samples_augmented,
                                                 len(samples))

    return samples_augmented, augmentation
Example #23
0
    def generator(self):
        '''Extracts features and feeds them to model according to `desired_input_shape`.
        '''
        while 1:
            augmentation = ''
            audioinfo = self.audiolist[self.counter]
            # does the list contain label audiofile pairs?
            if isinstance(audioinfo, tuple):
                if len(audioinfo) != 2:
                    raise ValueError('Expected tuple containing audio file path and label. '+\
                        'Instead received tuple of length: \n{}'.format(len(audioinfo)))
                # if label is a string digit, int, or float - turn to int
                if isinstance(audioinfo[0], int) or isinstance(audioinfo[0], float) or \
                    isinstance(audioinfo[0], str) and audioinfo[0].isdigit():
                    label = int(audioinfo[0])
                    audiopath = audioinfo[1]
                elif isinstance(audioinfo[1], int) or isinstance(audioinfo[1], float) or \
                    isinstance(audioinfo[1], str) and audioinfo[1].isdigit():
                    label = int(audioinfo[1])
                    audiopath = audioinfo[1]
                else:
                    raise ValueError('Expected tuple to contain an integer label '+\
                        'and audio pathway. Received instead tuple with types '+\
                            '{} and {}.'.format(type(audioinfo[0]), type(audioinfo[1])))
            # otherwise list of audiofiles
            else:
                audiopath = audioinfo
                label = None
            if self.audiolist2 is not None:
                # expects audiolist2 to be either integer labels or audiofile pathways
                audioinfo2 = self.audiolist2[self.counter]
                if isinstance(audioinfo2, int) or isinstance(audioinfo2, str) and \
                    audioinfo2.isdigit():
                    if label is None:
                        label = audioinfo2
                    else:
                        if label == int(audioinfo2):
                            pass
                        else:
                            raise ValueError('Provided conflicting labels for '+\
                                'current audiofile: {}.'.format(audiopath) +\
                                    '\nReceived both label {} and {} .'.format(
                                        label, int(audioinfo2)))
                    audiopath2 = None
                else:
                    audiopath2 = audioinfo2
            else:
                audiopath2 = None
            if label is not None:
                labeled_data = True
                if self.decode_dict is not None:
                    try:
                        label_pic = self.decode_dict[label].upper()
                    except KeyError:
                        # dictionary keys might be string type, not int type
                        label_pic = self.decode_dict[str(int(label))].upper()
                else:
                    label_pic = label
            else:
                labeled_data = False
                label_pic = None

            # ensure audio is valid:
            y, sr = sp.loadsound(audiopath, self.kwargs['sr'])
            if audiopath2:
                y2, sr2 = sp.loadsound(audiopath2, self.kwargs['sr'])
            else:
                y2, sr2 = None, None
            if self.label_silence:
                if self.vad_start_end:
                    y_stft, vad = sp.dsp.get_stft_clipped(y,
                                                          sr=sr,
                                                          win_size_ms=50,
                                                          percent_overlap=0.5)
                else:
                    y_stft, __ = sp.feats.get_vad_stft(y,
                                                       sr=sr,
                                                       win_size_ms=50,
                                                       percent_overlap=0.5,
                                                       use_beg_ms=120,
                                                       energy_thresh=40,
                                                       freq_thresh=185,
                                                       sfm_thresh=5)
                if not y_stft.any():
                    label = len(self.decode_dict) - 1
                    print(
                        '\nNo voice activity detected in {}'.format(audiopath))
                    print('Label {} adjusted to {}.'.format(
                        label_pic, self.decode_dict[label]))
                    label_pic = self.decode_dict[label]
            # augment_data
            if self.augment_dict is not None:
                aug_dict = randomize_augs(self.augment_dict)

                augmented_data, augmentation = augment_features(
                    y, self.kwargs['sr'], **aug_dict)
                if audiopath2:
                    # remove 'add_white_noise' if in aug_dict
                    aug_dict2 = {}
                    for key, value in aug_dict.items():
                        if key != 'add_white_noise':
                            aug_dict2[key] = value
                    augmented_data2, augmentation2 = augment_features(
                        y2, self.kwargs['sr'], **aug_dict2)
            else:
                augmented_data, augmentation = y, ''
                aug_dict = dict()
                augmented_data2, augmentation2 = y2, ''
                aug_dict2 = dict()
            # extract features
            # will be shape (num_frames, num_features)
            if 'vtlp' in aug_dict and aug_dict['vtlp']:
                sr = self.kwargs['sr']
                win_size_ms = sp.utils.restore_dictvalue(
                    self.kwargs['win_size_ms'])
                percent_overlap = sp.utils.restore_dictvalue(
                    self.kwargs['percent_overlap'])
                fft_bins = sp.utils.restore_dictvalue(self.kwargs['fft_bins'])
                window = sp.utils.restore_dictvalue(self.kwargs['window'])
                real_signal = sp.utils.restore_dictvalue(
                    self.kwargs['real_signal'])
                feature_type_vtlp = 'stft'
                dur_sec = sp.utils.restore_dictvalue(self.kwargs['dur_sec'])
                zeropad = sp.utils.restore_dictvalue(self.kwargs['zeropad'])

                # need to tell vtlp the size of fft we need, in order to
                # be able to extract fbank and mfcc features as well
                expected_stft_shape, __ = sp.feats.get_feature_matrix_shape(
                    sr=sr,
                    dur_sec=dur_sec,
                    feature_type=feature_type_vtlp,
                    win_size_ms=win_size_ms,
                    percent_overlap=percent_overlap,
                    fft_bins=fft_bins,
                    zeropad=zeropad,
                    real_signal=real_signal)

                # TODO bug fix: oversize_factor higher than 1:
                # how to reduce dimension back to `expected_stft_shape` without
                # shaving off data?
                oversize_factor = 16
                augmented_data, alpha = sp.augment.vtlp(
                    augmented_data,
                    sr,
                    win_size_ms=win_size_ms,
                    percent_overlap=percent_overlap,
                    fft_bins=fft_bins,
                    window=window,
                    real_signal=real_signal,
                    expected_shape=expected_stft_shape,
                    oversize_factor=oversize_factor,
                    visualize=False)
                # vtlp was last augmentation to be added to `augmentation` string
                # add the value that was applied
                augmentation += '_vtlp' + str(alpha)
                # need to be able to set alpha
                augmented_data2, alpha2 = sp.augment.vtlp(
                    augmented_data2,
                    sr,
                    a=alpha,
                    win_size_ms=win_size_ms,
                    percent_overlap=percent_overlap,
                    fft_bins=fft_bins,
                    window=window,
                    real_signal=real_signal,
                    expected_shape=expected_stft_shape,
                    oversize_factor=oversize_factor,
                    visualize=False)
                try:
                    assert alpha == alpha2
                except AssertionError:
                    raise ValueError('The alpha value for vtlp application '+\
                        'does not match for the X and y audio: '+\
                            'X alpha is {} and y alpha is {}'.format(alpha, alpha2))
                # vtlp was last augmentation to be added to `augmentation` string
                # add the value that was applied
                augmentation2 += '_vtlp' + str(alpha)

            if 'vtlp' in aug_dict and aug_dict['vtlp']:
                if 'stft' in self.kwargs['feature_type'] or \
                    'powspec' in self.kwargs['feature_type']:
                    if 'stft' in self.kwargs[
                            'feature_type'] and oversize_factor > 1:
                        import warnings
                        msg = '\nWARNING: due to resizing of STFT matrix due to '+\
                            ' `oversize_factor` {}, converted to '.format(oversize_factor)+\
                            'power spectrum. Phase information has been removed.'
                        warnings.warn(msg)
                    feats = augmented_data
                    if audiopath2:
                        feats2 = augmented_data2
                    if 'powspec' in self.kwargs[
                            'feature_type'] and oversize_factor == 1:
                        # otherwise already a power spectrum
                        feats = sp.dsp.calc_power(feats)
                        if audiopath2:
                            feats2 = sp.dsp.calc_power(feats2)

            elif 'stft'in self.kwargs['feature_type'] or \
                'powspec' in self.kwargs['feature_type']:
                feats = sp.feats.get_stft(
                    augmented_data,
                    sr=self.kwargs['sr'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])
                if audiopath2:
                    feats2 = sp.feats.get_stft(
                        augmented_data2,
                        sr=self.kwargs['sr'],
                        win_size_ms=self.kwargs['win_size_ms'],
                        percent_overlap=self.kwargs['percent_overlap'],
                        real_signal=self.kwargs['real_signal'],
                        fft_bins=self.kwargs['fft_bins'],
                        rate_of_change=self.kwargs['rate_of_change'],
                        rate_of_acceleration=self.
                        kwargs['rate_of_acceleration'],
                        window=self.kwargs['window'],
                        zeropad=self.kwargs['zeropad'])
                if 'powspec' in self.kwargs['feature_type']:
                    feats = sp.dsp.calc_power(feats)
                    if audiopath2:
                        feats2 = sp.dsp.calc_power(feats2)

            if 'fbank' in self.kwargs['feature_type']:
                feats = sp.feats.get_fbank(
                    augmented_data,
                    sr=self.kwargs['sr'],
                    num_filters=self.kwargs['num_filters'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])
                if audiopath2:
                    feats2 = sp.feats.get_fbank(
                        augmented_data2,
                        sr=self.kwargs['sr'],
                        num_filters=self.kwargs['num_filters'],
                        win_size_ms=self.kwargs['win_size_ms'],
                        percent_overlap=self.kwargs['percent_overlap'],
                        real_signal=self.kwargs['real_signal'],
                        fft_bins=self.kwargs['fft_bins'],
                        rate_of_change=self.kwargs['rate_of_change'],
                        rate_of_acceleration=self.
                        kwargs['rate_of_acceleration'],
                        window=self.kwargs['window'],
                        zeropad=self.kwargs['zeropad'])

            elif 'mfcc' in self.kwargs['feature_type']:
                feats = sp.feats.get_mfcc(
                    augmented_data,
                    sr=self.kwargs['sr'],
                    num_mfcc=self.kwargs['num_mfcc'],
                    num_filters=self.kwargs['num_filters'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])
                if audiopath2:
                    feats2 = sp.feats.get_mfcc(
                        augmented_data2,
                        sr=self.kwargs['sr'],
                        num_mfcc=self.kwargs['num_mfcc'],
                        num_filters=self.kwargs['num_filters'],
                        win_size_ms=self.kwargs['win_size_ms'],
                        percent_overlap=self.kwargs['percent_overlap'],
                        real_signal=self.kwargs['real_signal'],
                        fft_bins=self.kwargs['fft_bins'],
                        rate_of_change=self.kwargs['rate_of_change'],
                        rate_of_acceleration=self.
                        kwargs['rate_of_acceleration'],
                        window=self.kwargs['window'],
                        zeropad=self.kwargs['zeropad'])

            if self.apply_log:
                # TODO test
                if feats[0].any() < 0:
                    feats = np.abs(feats)
                feats = np.log(feats)
            if self.normalize:
                feats = sp.feats.normalize(feats)
            if audiopath2:
                if self.apply_log:
                    # TODO test
                    if feats2[0].any() < 0:
                        feats2 = np.abs(feats2)
                    feats2 = np.log(feats2)
                if self.normalize:
                    feats2 = sp.feats.normalize(feats2)
            else:
                feats2 = None
            # Save visuals if desired
            if self.visualize:
                if self.counter % self.vis_every_n_items == 0:
                    # make augmentation string more legible.
                    augments_vis = augmentation[1:].split('_')
                    if len(augments_vis) > 1:
                        augs1 = augments_vis[:len(augments_vis) // 2]
                        augs2 = augments_vis[len(augments_vis) // 2:]
                        augs1 = ', '.join(augs1)
                        augs2 = ', '.join(augs2)
                    else:
                        augs1 = augments_vis[0]
                        augs2 = ''
                    if self.visuals_dir is not None:
                        save_visuals_path = sp.check_dir(self.visuals_dir,
                                                         make=True)
                    else:
                        save_visuals_path = sp.check_dir('./training_images/',
                                                         make=True)
                    save_visuals_path = save_visuals_path.joinpath(
                        '{}_label{}_training_{}_{}_{}.png'.format(
                            self.dataset, label_pic, self.model_name,
                            augmentation, sp.utils.get_date()))
                    feature_type = self.kwargs['feature_type']
                    sr = self.kwargs['sr']
                    win_size_ms = self.kwargs['win_size_ms']
                    percent_overlap = self.kwargs['percent_overlap']
                    if 'stft' in feature_type or 'powspec' in feature_type or 'fbank' \
                        in feature_type:
                        energy_scale = 'power_to_db'
                    else:
                        energy_scale = None
                    sp.feats.plot(
                        feature_matrix=feats,
                        feature_type=feature_type,
                        sr=sr,
                        win_size_ms=win_size_ms,
                        percent_overlap=percent_overlap,
                        energy_scale=energy_scale,
                        save_pic=True,
                        name4pic=save_visuals_path,
                        title='"{}" {} Aug: {}-\n{}'.format(
                            label_pic, feature_type.upper(), augs1, augs2),
                        subprocess=True)  #use Agg backend for plotting
                    if feats2 is not None:
                        # add '_2' to pathway
                        p = sp.utils.string2pathlib(save_visuals_path)
                        p2 = p.name.stem
                        save_visuals_path2 = p.parent.joinpath(p2 + '_2' +
                                                               p.name.suffix)
                        sp.feats.plot(feature_matrix=feats2,
                                      feature_type=feature_type,
                                      sr=sr,
                                      win_size_ms=win_size_ms,
                                      percent_overlap=percent_overlap,
                                      energy_scale=energy_scale,
                                      save_pic=True,
                                      name4pic=save_visuals_path2,
                                      title='Output {} features {}'.format(
                                          label_pic, feature_type),
                                      subprocess=True)

            batch_x = feats
            batch_y = feats2

            # reshape features to allow for timestep / subsection features
            if self.timestep is not None:
                batch_x = sp.feats.apply_new_subframe(
                    batch_x,
                    new_frame_size=self.timestep,
                    zeropad=self.kwargs['zeropad'],
                    axis=self.axis_timestep)
                if batch_y is not None:
                    batch_y = sp.feats.apply_new_subframe(
                        batch_y,
                        new_frame_size=self.timestep,
                        zeropad=self.kwargs['zeropad'],
                        axis=self.axis_timestep)

            # reshape features to allow for context window / subsection features
            if self.context_window is not None:
                batch_x = sp.feats.apply_new_subframe(
                    batch_x,
                    new_frame_size=self.context_window * 2 + 1,
                    zeropad=self.kwargs['zeropad'],
                    axis=self.axis_context)
                if batch_y is not None:
                    batch_y = apply_new_subframe(
                        batch_y,
                        new_frame_size=self.context_window * 2 + 1,
                        zeropad=self.kwargs['zeropad'],
                        axis=self.axis_context)

            # grayscale 2 color
            if self.gray2color:
                batch_x = sp.feats.grayscale2color(
                    batch_x, colorscale=3)  # default colorscale is 3
                if batch_y is not None:
                    batch_y = sp.feats.grayscale2color(batch_y, colorscale=3)

            # reshape to input shape. Will be zeropadded or limited to this shape.
            # tensor dimensions on either side can be added here as well.
            if self.desired_input_shape is not None:
                batch_x = sp.feats.adjust_shape(batch_x,
                                                self.desired_input_shape)
                if batch_y is not None:
                    batch_y = sp.feats.adjust_shape(batch_y,
                                                    self.desired_input_shape)

            # prepare data to be fed to network:
            if labeled_data:
                # has to be at least (1,)
                batch_y = np.expand_dims(np.array(label), axis=0)

            elif batch_y is not None:
                pass
            else:
                raise ValueError('No independent variable provided.')

            self.counter += 1
            yield batch_x, batch_y

            #restart counter to yield data in the next epoch as well
            if self.counter >= self.number_of_batches:
                self.counter = 0
def test_loadsound_librosa_aiff():
    """Check the first samples and sample rate returned for the AIFF fixture.

    The file is loaded with ``use_scipy=False`` and ``remove_dc=False``; the
    first five samples must match the stored reference values and the
    reported sample rate must be 48000.
    """
    reference_head = np.array(
        [0.09291077, 0.06417847, 0.04179382, 0.02642822, 0.01808167])
    audio, rate = sp.loadsound(test_aiff, use_scipy=False, remove_dc=False)
    head = audio[:5]
    assert np.allclose(head, reference_head)
    assert rate == 48000
def test_loadsound_librosa_ogg():
    """Check the first samples and sample rate returned for the OGG fixture.

    The file is loaded with ``use_scipy=False`` and ``remove_dc=False``; the
    first five samples must match the stored reference values and the
    reported sample rate must be 44100.
    """
    reference_head = np.array(
        [-0.00639889, -0.00722905, -0.00864992, -0.00878596, -0.00894831])
    audio, rate = sp.loadsound(test_ogg, use_scipy=False, remove_dc=False)
    head = audio[:5]
    assert np.allclose(head, reference_head)
    assert rate == 44100
def test_loadsound_librosa_m4a():
    """Check the first samples and sample rate returned for the M4A fixture.

    The file is loaded with ``use_scipy=False`` and ``remove_dc=False``; the
    first five samples are expected to be zero and the reported sample rate
    must be 48000.
    """
    reference_head = np.array([0., 0., 0., 0., 0.])
    audio, rate = sp.loadsound(test_m4a, use_scipy=False, remove_dc=False)
    head = audio[:5]
    assert np.allclose(head, reference_head)
    assert rate == 48000
def test_loadsound_librosa_sr_None():
    """Loading the stereo WAV fixture with ``sr=None`` must report 16000."""
    # Only the returned sample rate is checked; the samples are discarded.
    _, rate = sp.loadsound(test_wav_stereo, sr=None)
    assert rate == 16000
Example #28
0
def test_get_feats_dur_sec_signal():
    """'signal' features requested with ``dur_sec`` have ``sr * dur_sec`` samples.

    Loads the test audio file with ``mono=True``, requests ``'signal'``
    features limited to half a second, and checks the resulting length.
    """
    half_second = 0.5
    audio, rate = sp.loadsound(test_audiofile, mono=True)
    trimmed = sp.feats.get_feats(
        audio, sr=rate, dur_sec=half_second, feature_type='signal')
    expected_len = int(rate * half_second)
    assert len(trimmed) == expected_len
Example #29
0
######################################################
# Load sample speech audio
# ------------------------
# We will look at how these two options handle two different speech samples.
# The speech samples will be combined but separated by a silence.
# They will also be altered with white noise.

######################################################
# "Python"
# ~~~~~~~~
# Note: this file is available in the soundpy repo.

# VAD and filtering work best with high sample rates
sr = 48000
# The path template has a single positional field, so only ``sp_dir`` is
# passed; the sample rate is given to ``sp.loadsound``, not to the filename.
python = '{}audiodata/python.wav'.format(sp_dir)
y_p, sr = sp.loadsound(python, sr=sr)
ipd.Audio(y_p, rate=sr)

######################################################
# "six"
# ~~~~~
# This is a sample file from the speech commands dataset
# (Attribution 4.0 International (CC BY 4.0))
# dataset: https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html
# license: https://creativecommons.org/licenses/by/4.0/

######################################################
# This is audio that has two fricatives in it: 's' and 'x'
# which will show to cause issues as noise increases.
six = '{}audiodata/six.wav'.format(sp_dir)
y_six, sr = sp.loadsound(six, sr=sr)
def test_savesound_default_FileExistsError():
    """Saving to an already existing path with default arguments must raise.

    The mono WAV fixture is loaded and then saved back to its own path;
    ``sp.savesound`` is expected to raise ``FileExistsError``.
    """
    audio, rate = sp.loadsound(test_wav_mono)
    with pytest.raises(FileExistsError):
        # The return value is irrelevant: the call itself must raise.
        sp.savesound(test_wav_mono, audio, rate)