Esempio n. 1
0
def process_directory(dir, n_rate):
    """Merge every audio file listed under ``f[dir]`` into a single WAV file.

    Each file is read from ``{origin_path}/{dir}/`` with ``sr=n_rate``, passed
    through ``Audio.trim(..., 20)`` and appended to one running signal, which
    is finally written to ``{dest_path}/{n_rate}/{dir}.wav``.

    Args:
        dir: Index into the module-level ``f`` object; also used as the source
            sub-directory name and as the output file stem.
        n_rate: Value forwarded to ``Audio.read`` as ``sr`` and to
            ``Audio.write`` as the rate (presumably the sample rate in Hz --
            confirm against ``Audio``).
    """
    merged = []

    for audioname in f[dir]:
        source = f'{origin_path}/{dir}/{audioname}'
        # The rate returned by Audio.read is not needed; n_rate is used below.
        loaded, _ = Audio.read(source, sr=n_rate)
        # NOTE(review): the meaning of the literal 20 is defined by Audio.trim.
        merged.extend(Audio.trim(loaded, 20))

    merged = array(merged)
    target = f'{dest_path}/{n_rate}/{dir}.wav'
    Audio.write(target, merged, n_rate)
Esempio n. 2
0
def process_directory(dir, index, library):
    """Extract features from every 5-second segment of one audio file.

    The file at ``{path}/{dir}`` is read, truncated to a whole number of
    5-second segments and, for each segment (plus any augmented copies of
    it), converted to a feature matrix by the extractor named in ``library``.

    Besides its arguments the function reads these module-level names:
    ``path``, ``sampling_rate``, ``n_segments`` (optional cap on how many
    segments are used) and ``augment`` (when non-empty, its first item is the
    number of augmented copies per segment; it may also contain ``'cut'``
    and/or ``'noise'``).

    Args:
        dir: File name (relative to ``path``) of the audio to process.
        index: Label attached to every feature matrix produced.
        library: Feature extractor to use: ``'stft'``, ``'melbanks'`` or
            ``'psf'``.

    Returns:
        A dict with three keys:
            ``'attrs'``: list of feature matrices (nested lists), one per
                segment and per augmented copy.
            ``'labels'``: ``index`` repeated once per entry of ``'attrs'``.
            ``'classes'``: one-element list holding ``{index: count}``.

    Raises:
        ValueError: If ``library`` is not a supported extractor name and at
            least one segment has to be processed.
    """
    signal, rate = Audio.read(f'{path}/{dir}',
                              sr=sampling_rate,
                              normalize=True)

    signal = np.array(signal)

    segment_time = 5
    segment_len = rate * segment_time

    # Round the audio signal down to a multiple of the segment length.
    signal = signal[:len(signal) - len(signal) % segment_len]

    # Number of whole segments contained in the audio.
    segments = len(signal) // segment_len

    # Honour the optional cap, but never promise more segments than the audio
    # actually holds; otherwise 'labels' would end up longer than 'attrs'.
    used_segments = min(n_segments, segments) if n_segments else segments

    augsize = int(augment[0]) if len(augment) > 0 else 0
    label_count = used_segments * (1 + augsize)

    m = {
        'attrs': [],
        'labels': [index] * label_count,
        'classes': [{
            index: label_count
        }]
    }

    # Feature-extraction settings; they depend only on `rate`, so they are
    # computed once instead of once per segment.
    n_mfcc = 13
    n_mels = 26
    n_fft = 2048
    # Window and hop sizes (in samples).
    hop_length = 512
    win_length = 1024
    # Window and hop sizes (in time: samples divided by the rate).
    win_len = win_length / rate
    win_hop = hop_length / rate
    lifter = 22
    fmin = 0
    fmax = rate / 2
    coef_pre_enfase = 0.97
    append_energy = 0

    for i in range(used_segments):
        original = Audio.segment(signal, rate, seconds=segment_time, window=i)
        samples = [original]

        for _ in range(augsize):
            applied = False
            aug = original

            # The random draw is evaluated before the membership test so the
            # random stream is consumed identically whatever is enabled.
            if random.uniform() > 0.5 and 'cut' in augment:
                aug = _cut(aug, rate)
                applied = True

            if random.uniform() > 0.5 and 'noise' in augment:
                aug = _noise(aug, rate)
                applied = True

            # Fallback when neither draw fired and `augment` has three items
            # (presumably count + 'cut' + 'noise' -- confirm against caller):
            # force exactly one of the two augmentations.
            if not applied and len(augment) == 3:
                if random.uniform() > 0.5:
                    aug = _cut(aug, rate)
                else:
                    aug = _noise(aug, rate)

            samples.append(aug)

        for sample in samples:
            if library == 'stft':
                attr = np.abs(
                    np.array(stft(sample, n_fft=n_fft, hop_length=hop_length)))
            elif library == 'melbanks':
                # A leading axis is added before the layer is called.
                batch = sample[newaxis, :]
                melfbanks = MelFilterbanks(sample_rate=rate)
                attr = np.array(melfbanks(batch)).T
            elif library == 'psf':
                attr = np.array(
                    mfcc(signal=sample,
                         samplerate=rate,
                         winlen=win_len,
                         winstep=win_hop,
                         numcep=n_mfcc,
                         nfilt=n_mels,
                         nfft=n_fft,
                         lowfreq=fmin,
                         highfreq=fmax,
                         preemph=coef_pre_enfase,
                         ceplifter=lifter,
                         appendEnergy=append_energy,
                         winfunc=hann))
            else:
                raise ValueError(
                    f"unsupported library {library!r}; expected 'stft', "
                    "'melbanks' or 'psf'")

            m['attrs'].append(attr.tolist())

    return m
# Run spec_augment on `mfcc` and call .numpy() on the result.
# NOTE(review): `mfcc` is used here as a data array, not as a function;
# presumably the result is a TensorFlow tensor -- confirm against the code
# that defines both.
warped_masked_spectrogram = spec_augment_tensorflow.spec_augment(mfcc)
warped_masked_spectrogram = warped_masked_spectrogram.numpy()

# %%
# Plot the augmented array converted to dB with ref=np.max.
# The [0, :, :, 0] index shows the array is 4-D; presumably
# (batch, frequency, time, channel) -- TODO confirm.
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(warped_masked_spectrogram[0, :, :,
                                                                       0],
                                             ref=np.max),
                         y_axis='mel',
                         fmax=8000,
                         x_axis='time')
plt.tight_layout()
plt.title('SpecAugmented')
plt.show()
plt.close()
# Pass the same 2-D slice to Griffin-Lim and write the result as audio.
# NOTE(review): griffinlim is documented for STFT magnitudes -- confirm the
# slice is a valid input for it.
audio_signal = librosa.core.spectrum.griffinlim(
    warped_masked_spectrogram[0, :, :, 0])
Audio.write('test/warped_audio.wav', audio_signal, sr)
# %%
# Same plot and audio export for the un-augmented `mfcc` array, for
# side-by-side comparison with the augmented version above.
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(mfcc[0, :, :, 0], ref=np.max),
                         y_axis='mel',
                         fmax=8000,
                         x_axis='time')
plt.tight_layout()
plt.title('MFCC')
plt.show()
plt.close()
# `audio_signal` is reused: it now holds the audio built from `mfcc`.
audio_signal = librosa.core.spectrum.griffinlim(mfcc[0, :, :, 0])
Audio.write('test/mfcc_audio.wav', audio_signal, sr)
# Load the model from the file '<filename_holder>model.h5', opened in binary
# mode.
# NOTE(review): the file object handed to load() is never closed explicitly;
# consider a `with open(...)` block if load() reads the file eagerly. If
# load() is a pickle-style loader, only use it on trusted files.
model = load(open(filename_holder + 'model.h5', 'rb'))

# Read the audio file named by args['inferencia'], requesting `sampling_rate`.
signal, rate = librosa.load(args['inferencia'], sr=sampling_rate)

# signal = Audio.trim(signal)

# Segment length in seconds (passed as `seconds=` to Audio.segment below).
segment_time = 5
# Drop trailing samples so the length is a whole number of segments.
signal = signal[:len(signal) - len(signal) % (rate * segment_time)]

# Number of whole segments in the truncated signal.
segments = len(signal) // (rate * segment_time)

# NOTE(review): nothing in the visible code appends to this list; the loop
# below appears to be cut off after the parameter assignments.
mfcc_audios = []

for i in range(segments):
    # Take window `i` of `segment_time` seconds from the signal.
    sample = Audio.segment(signal, rate, seconds=segment_time, window=i)

    n_mfcc = 13
    n_mels = 26
    n_fft = 2048
    # Window and hop sizes (in samples)
    hop_length = 512
    win_length = 1024
    # Window and hop sizes (in time: samples divided by the rate)
    win_len = win_length / rate
    win_hop = hop_length / rate
    lifter = 22
    fmin = 0
    # Half of the rate.
    fmax = rate / 2
    coef_pre_enfase = 0.97
    append_energy = 0
from matplotlib.pyplot import show
from python_speech_features import mfcc
from scipy.signal.windows import hann
from deep_audio import Audio, Visualization

# Read the sample recording; 24000 is passed as the second positional
# argument (presumably the target sample rate -- confirm against Audio.read).
signal, rate = Audio.read(
    'inferencia/hugo/Frase 1-1.wav', 24000)

# Overrides whatever rate Audio.read returned with a fixed value of 24000.
rate = 24000

# Settings consumed by the mfcc(...) call that follows.
n_mfcc = 40
n_mels = 40
n_fft = 2048
# Window and hop sizes (in samples)
hop_length = 512
win_length = 1024
# Window and hop sizes (in time: samples divided by the rate)
win_len = win_length / rate
win_hop = hop_length / rate
lifter = 22
fmin = 0
# Half of the rate.
fmax = rate / 2
coef_pre_enfase = 0.97
append_energy = 0


attr = mfcc(
    signal=signal,
    samplerate=rate,
    winlen=win_len,
    winstep=win_hop,