import os

import speech_data
from pydub import AudioSegment as audio


def segment(data, seg_location, length):
    """Split every wav file in `data` into `length`-second clips written to `seg_location`."""
    os.chdir(data)
    files = os.listdir(data)
    speakers = speech_data.get_speakers(data)
    # per-speaker counter used to name the exported clips
    num = {}
    for s in speakers:
        num[s] = 0
    # grab all wave files in a list
    waves = []
    for f in files:
        waves.append(audio.from_wav(f))
    os.chdir(seg_location)
    # segment each recording into length-second intervals
    for f, w in zip(files, waves):
        begin = 0
        end = 1
        while (end * length) < int(w.duration_seconds):
            clip = w[begin * 1000 * length:end * 1000 * length]
            clip.export(
                speech_data.speaker(f) + '_' + str(num[speech_data.speaker(f)]) + '.wav',
                'wav')
            # begin/end count segments; the slice above already scales by length
            begin = begin + 1
            end = end + 1
            num[speech_data.speaker(f)] = num[speech_data.speaker(f)] + 1
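
# A minimal usage sketch (the paths below are placeholders, not taken from the
# script above): cut every recording under wav_dir into one-second clips
# written out to seg_dir.
if __name__ == '__main__':
    wav_dir = '/path/to/wav_dir/'   # hypothetical source directory
    seg_dir = '/path/to/seg_dir/'   # hypothetical output directory
    segment(wav_dir, seg_dir, 1)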
import os

import librosa
import numpy as np
import speech_data
import tflearn

# now put all of the mfccs into an array, one 13 x 44 matrix per one-second clip
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(data + f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

# define the network: two hidden layers with dropout, softmax over the speakers
net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

# train the model and log to TensorBoard
model = tflearn.DNN(net, tensorboard_dir='/home/cc/working/tboard/',
                    tensorboard_verbose=3)
model.fit(mfccs, Y, n_epoch=2000, show_metric=True, snapshot_step=100)

os.chdir('/home/cc/working/data/devclean_test/')
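
# A possible evaluation step (a sketch only; it assumes the test directory
# holds one-second clips prepared the same way as the training data): compute
# the MFCCs for each clip and report the most likely speaker.
for f in os.listdir('.'):
    y, sr = librosa.load(f)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    prediction = model.predict([mfcc])
    print(f, speakers[np.argmax(prediction)])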
import os

import speech_data
from pydub import AudioSegment as audio

# constants
data = '/home/cc/oci_audio/arun-train'
working = '/home/cc/oci_audio/'
new_data = 'whisp-train1'

os.chdir(data)
files = os.listdir(data)
speakers = speech_data.get_speakers(data)
# per-speaker counter used to name the exported clips
num = {}
for s in speakers:
    num[s] = 0
# grab all wave files in a list
waves = []
for f in files:
    waves.append(audio.from_wav(f))
os.chdir(working + new_data)
# segment the data into one-second intervals
for f, w in zip(files, waves):
    begin = 0
    end = 1
    while end < int(w.duration_seconds):
        clip = w[begin * 1000:end * 1000]
        clip.export(
            speech_data.speaker(f) + '_' + str(num[speech_data.speaker(f)]) + '.wav',
            'wav')
        begin = begin + 1
        end = end + 1
        num[speech_data.speaker(f)] = num[speech_data.speaker(f)] + 1
import os
import sys

import librosa
import tflearn

import speech_data as data

# training and testing data sets
train_data = sys.argv[1]
test_data = sys.argv[2]

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
audio_files = os.listdir(train_data)
X = []
Y = []
for f in audio_files:
    Y.append(data.one_hot_from_item(data.speaker(f), speakers))
    y, sr = librosa.load(train_data + f)
    X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

# define the network and the model
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
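
# A sketch of the training call, mirroring the earlier devclean listing; the
# epoch count and snapshot interval are carried over from that script rather
# than from this one.
model.fit(X, Y, n_epoch=2000, show_metric=True, snapshot_step=100)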
import os

import speech_data

data = '/home/cc/working/data/devclean/'
new_data = '/home/cc/working/data/devclean_mfccs/'

os.chdir(data)
files = os.listdir(data)
# collect every file that is not a speaker recording (background files)
back = []
for f in files:
    if 'Speaker' in f:
        continue
    back.append(f)
print(len(back))
# copy the background files into the new directory with speaker-style names
c = 0
for b in back:
    os.system('cp ' + b + ' ' + new_data + speech_data.speaker(b) + '_back' + str(c) + '.wav')
    c = c + 1