# Example 1: train a small speaker-classification network from a directory of audio files
import sys
import os
import librosa
import tflearn
import speech_data as data

# training and testing data sets
train_data = sys.argv[1]
test_data = sys.argv[2]

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
audio_files = os.listdir(train_data)
X = []
Y = []
for f in audio_files:
    # one-hot label derived from the speaker id encoded in the file name
    Y.append(data.one_hot_from_item(data.speaker(f), speakers))
    # load the clip and compute a 13-coefficient MFCC matrix
    y, sr = librosa.load(os.path.join(train_data, f))
    X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))
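# Note: librosa.feature.mfcc returns a (13, T) matrix whose frame count T varies
# with clip length, while the network below expects a fixed [None, 13, 44] input.
# A minimal sketch of padding/truncating each matrix to 44 frames (the helper
# name and the list comprehension are illustrative, not part of the original):
import numpy as np

def pad_mfcc(m, frames=44):
    if m.shape[1] < frames:
        return np.pad(m, ((0, 0), (0, frames - m.shape[1])), mode='constant')
    return m[:, :frames]

X = [pad_mfcc(m) for m in X]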

# define the network and the model
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
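# The snippet stops after constructing the model; a minimal training call, assuming
# X and Y hold the fixed-size MFCC matrices and one-hot labels built above:
model.fit(np.array(X), np.array(Y), n_epoch=100, show_metric=True)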
# Example 2: compute MFCCs for a fixed data directory and train a deeper network
import os
import pickle
import tensorflow as tf
import tflearn
import librosa
import librosa.display
import IPython.display
import numpy as np
import speech_data
from pydub import AudioSegment as audio

# now put all of the mfccs into an array
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(data + f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))
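# As in the first example, each MFCC matrix should be padded/truncated to a fixed
# 44 frames (and the list stacked into a single array) before it is passed to model.fit.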

net = tflearn.input_data(shape=[None, 13, 44]) 
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_dir='/home/cc/working/tboard/', tensorboard_verbose=3)
model.fit(mfccs, Y, n_epoch=2000, show_metric=True, snapshot_step=100)
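# Persisting the trained weights so a later run can restore them with model.load()
# (the path here is an illustrative assumption, not from the original snippet):
model.save('/home/cc/working/models/speaker_model.tflearn')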

# move to the held-out test directory for evaluation
os.chdir('/home/cc/working/data/devclean_test/')
# Example 3: load a saved model and evaluate it on a held-out test set.
import numpy as np
import tflearn
import speech_data as data
import fetch  # project helper module for sampling test files and checking labels

# `speakers` and `number_classes` are assumed to have been built from the training
# directory, as in the earlier examples. The original snippet begins mid-way through
# the network definition; only the final learning_rate argument of the
# tflearn.regression(...) call survives, so the optimizer and loss below are
# assumptions added to make the fragment parse:
network = tflearn.regression(network,
                             optimizer='adam',
                             loss='categorical_crossentropy',
                             learning_rate=0.001)

# build the model wrapper; the weights come from a previously trained checkpoint
model = tflearn.DNN(network,
                    checkpoint_path='model_alexnet',
                    max_checkpoints=1,
                    tensorboard_verbose=2)
statistic_array = np.zeros((1, number_classes))
# restore the previously trained weights and point at the .npy test spectra
model.load('./saved_model/augment_model.tflearn')
ts_path = "./new_data_set/simple_test_set/npys/"
v_counter = 0
samples = fetch.random_sample(ts_path, 1)
for sample in samples:
    load_spectrum = np.load(ts_path + sample)
    #demo=np.reshape(load_spectrum,(227,227,1))
    demo = np.array(load_spectrum, dtype=np.float32)
    result1 = model.predict([demo])
    result = data.one_hot_to_item(result1, speakers)
    validity = fetch.check_speaker(result, sample, -3)  #-2or-3
    print("predicted speaker for %s : result = %s validity = %d" %
          (sample, result, validity))
    # ~ 97% correct
    if validity:
        v_counter += 1
    else:
        statistic_array = statistic_array + data.one_hot_from_item(
            fetch.extract(sample, -3), speakers)
# fraction of test samples whose speaker was identified correctly
print(v_counter / len(samples))
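# statistic_array accumulates, per true speaker, how many of that speaker's samples
# were misclassified. A small sketch for inspecting it, assuming `speakers` is an
# ordered list that matches the one-hot encoding (this loop is not in the original):
for spk, misses in zip(speakers, statistic_array[0]):
    if misses:
        print("speaker %s misclassified %d time(s)" % (spk, int(misses)))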