Example 1
import os

import speech_data
from pydub import AudioSegment as audio


def segment(data, seg_location, length):
    """Split every WAV file in `data` into `length`-second clips saved to `seg_location`."""
    os.chdir(data)
    files = os.listdir(data)
    speakers = speech_data.get_speakers(data)
    # per-speaker counter used to number the exported clips
    num = {s: 0 for s in speakers}
    # load every wave file in the directory
    waves = [audio.from_wav(f) for f in files]
    os.chdir(seg_location)
    # segment the data into consecutive length-second intervals
    for f, w in zip(files, waves):
        begin = 0
        end = 1
        while (end * length) < int(w.duration_seconds):
            clip = w[begin * 1000 * length:end * 1000 * length]
            clip.export(
                speech_data.speaker(f) + '_' +
                str(num[speech_data.speaker(f)]) + '.wav', 'wav')
            begin += 1
            end += 1
            num[speech_data.speaker(f)] += 1
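All of these examples lean on a local speech_data module that is not shown on this page. A minimal sketch of the three helpers they call is given below; the filename convention (speaker name before the first underscore) is an assumption inferred from how the clips are exported above, not the module's confirmed implementation.

import os


def get_speakers(path):
    # unique speaker names in a directory, assuming '<speaker>_<n>.wav' filenames
    return sorted({f.split('_')[0] for f in os.listdir(path) if f.endswith('.wav')})


def speaker(filename):
    # speaker name is everything before the first underscore (assumed convention)
    return filename.split('_')[0]


def one_hot_from_item(item, items):
    # one-hot label vector with a 1 at the index of `item` within `items`
    vec = [0.0] * len(items)
    vec[items.index(item)] = 1.0
    return vec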
Example 2

import os

import librosa
import tflearn

import speech_data

# now put all of the mfccs into an array
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(data + f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

net = tflearn.input_data(shape=[None, 13, 44])  # 13 MFCC coefficients x 44 frames per one-second clip
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_dir='/home/cc/working/tboard/', tensorboard_verbose=3)
model.fit(mfccs, Y, n_epoch=2000, show_metric=True, snapshot_step=100)

os.chdir('/home/cc/working/data/devclean_test/')
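The input shape of [None, 13, 44] above follows from librosa's defaults: load() resamples to 22050 Hz and mfcc() uses a hop length of 512, so a one-second clip yields 1 + floor(22050 / 512) = 44 frames of 13 coefficients. A quick sanity check (the filename here is hypothetical):

import librosa

y, sr = librosa.load('Speaker1_0.wav')  # a one-second clip produced by the segmenter
print(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).shape)  # (13, 44) at the default 22050 Hz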
Example 3
import os

import speech_data
from pydub import AudioSegment as audio

# constants
data = '/home/cc/oci_audio/arun-train'
working = '/home/cc/oci_audio/'
new_data = 'whisp-train1'

os.chdir(data)
files = os.listdir(data)
speakers = speech_data.get_speakers(data)
# per-speaker counter used to number the exported clips
num = {s: 0 for s in speakers}
# load every wave file in the directory
waves = [audio.from_wav(f) for f in files]
os.chdir(working + new_data)
# segment the data into consecutive one-second intervals
for f, w in zip(files, waves):
    begin = 0
    end = 1
    while end < int(w.duration_seconds):
        clip = w[begin * 1000:end * 1000]
        clip.export(
            speech_data.speaker(f) + '_' + str(num[speech_data.speaker(f)]) +
            '.wav', 'wav')
        begin += 1
        end += 1
        num[speech_data.speaker(f)] += 1
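Example 3 is Example 1's loop with the segment length hardcoded to one second, so with the segment() helper defined in Example 1 the same run reduces to a single call:

segment('/home/cc/oci_audio/arun-train', '/home/cc/oci_audio/whisp-train1', 1)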
Example 4

import os
import sys

import librosa
import tflearn

import speech_data as data

# training and testing data sets
train_data = sys.argv[1]
test_data = sys.argv[2]

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
audio_files = os.listdir(train_data)
X = []
Y = []
for f in audio_files:
    Y.append(data.one_hot_from_item(data.speaker(f), speakers))
    y, sr = librosa.load(os.path.join(train_data, f))
    X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

# define the network and the model
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
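The snippet stops after building the model, and test_data is read from argv but never used; presumably training and evaluation followed. A minimal sketch of that continuation, assuming the test directory uses the same filename convention as the training directory:

# train on the MFCC features extracted above
model.fit(X, Y, n_epoch=2000, show_metric=True, snapshot_step=100)

# build the test set the same way and report mean accuracy
X_test, Y_test = [], []
for f in os.listdir(test_data):
    Y_test.append(data.one_hot_from_item(data.speaker(f), speakers))
    y, sr = librosa.load(os.path.join(test_data, f))
    X_test.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

print(model.evaluate(X_test, Y_test))  # mean accuracy over the test clips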
Example 5

import os
import shutil

import speech_data

data = '/home/cc/working/data/devclean/'
new_data = '/home/cc/working/data/devclean_mfccs/'
os.chdir(data)
files = os.listdir(data)

# collect the background files, i.e. everything without a speaker name
back = [f for f in files if 'Speaker' not in f]
print(len(back))

# copy each background file under a '<speaker>_back<n>.wav' name
for c, b in enumerate(back):
    shutil.copy(b, new_data + speech_data.speaker(b) + '_back' + str(c) + '.wav')
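To close the loop, classifying a single clip with the trained network amounts to computing its MFCCs and taking the argmax of the softmax output. A hypothetical sketch, assuming the model and speakers from Example 4 are in scope (the filename is made up):

import numpy as np

y, sr = librosa.load('Speaker2_17.wav')  # hypothetical one-second test clip
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
prediction = model.predict([mfcc])  # softmax scores, shape (1, number_classes)
print(speakers[int(np.argmax(prediction))])  # most likely speaker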