Code example #1
File: __main__.py  Project: povilasb/voice_auth
def make_model(number_classes):

    batch = data.wave_batch_generator(batch_size=1000,
                                      target=data.Target.speaker)
    X, Y = next(batch)

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    net = tflearn.input_data(shape=[None, 3848])  # Two wave chunks

    net = tflearn.fully_connected(net, 128)
    net = tflearn.dropout(net, 0.5)

    net = tflearn.fully_connected(net, 16)
    net = tflearn.dropout(net, 0.8)

    net = tflearn.fully_connected(net, 128)
    net = tflearn.dropout(net, 0.5)

    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    return model
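Note the 3848-wide input here, versus the 8192-sample vectors used by the digit demos further down this page; clips must be truncated or padded to match. A hedged helper sketch (fit_width is a hypothetical name, not part of the project):

import numpy

def fit_width(wave, width=3848):
    # Hypothetical helper: zero-pad or truncate a 1-D wave so it matches
    # the network's input width.
    out = numpy.zeros(width)
    n = min(len(wave), width)
    out[:n] = numpy.asarray(wave, dtype=float)[:n]
    return out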
Code example #2
File: __main__.py  Project: Skomantas/voice_auth
def train(number_classes):
    model = ml.make_model(number_classes)
    batch = data.wave_batch_generator(batch_size=1000,
                                      target=data.Target.speaker)
    X, Y = next(batch)
    model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
    model.save('classifier')
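A hedged follow-up, not part of the excerpt: a tflearn.DNN saved with model.save can be restored with model.load after rebuilding the same graph, so a later session might reload the classifier like this (the smoke test is an assumption):

model = ml.make_model(number_classes)  # rebuild the identical graph first
model.load('classifier')               # restore the weights saved above
print(model.predict(X[:1]))            # smoke-test on one training clip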
Code example #3
import time

import numpy
import tflearn

import speech_data


def main():
    batch = speech_data.wave_batch_generator(10000,
                                             target=speech_data.Target.digits)
    X, Y = next(batch)
    # Pad each 10-way one-hot label with six zeros to match the 16-class
    # output layer of the pre-trained model loaded below.
    Y = [numpy.hstack([y, numpy.array([0, 0, 0, 0, 0, 0])]) for y in Y]
    number_classes = 16  # 10 digits, zero-padded to 16 classes

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64, name='f1')
    net = tflearn.dropout(net, 0.5, name='dp')
    net = tflearn.fully_connected(net,
                                  number_classes,
                                  activation='softmax',
                                  name='f2')
    net = tflearn.regression(net,
                             optimizer='sgd',
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    model.load('pre-trained/model.tflearn.sgd_trained')

    # Overfitting okay for now
    totalTime = 0
    totalAcc = 0
    numTimes = 100
    for i in range(numTimes):
        t = time.time()
        result = model.predict(X)
        print("-------------")

        result = numpy.array([numpy.argmax(r) for r in result])
        answers = numpy.array([numpy.argmax(answer) for answer in Y])

        print(i, ">>>", (result == answers).sum() / float(len(answers)),
              "time: ",
              time.time() - t)
        totalAcc = totalAcc + (result == answers).sum() / float(len(answers))
        totalTime = totalTime + time.time() - t

    print("Avg. Acc. = ", totalAcc / numTimes)
    print("Avg. time = ", totalTime / numTimes)
Code example #4
#!/usr/bin/env python
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data
import numpy
import os
import tensorflow as tf

# Simple spoken digit recognition demo, with 98% accuracy in under a minute

# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

batch = speech_data.wave_batch_generator(10000,
                                         target=speech_data.Target.digits)
X, Y = next(batch)

number_classes = 10  # Digits

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])
#net = tflearn.fully_connected(net, 64)
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)
Code example #5
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on digits sample)

# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
##print("speakers",speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000,
                                  source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # Two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
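The excerpt stops once the DNN is built. A hedged continuation that matches the header comment above (the fit arguments mirror the other snippets on this page; mapping argmax back through speakers assumes the one-hot ordering follows data.get_speakers()):

import numpy

model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)

demo_file = "9_Vicki_260.wav"
demo = data.load_wav_file(data.path + demo_file)
result = model.predict([demo])
print("predicted speaker for %s : result = %s"
      % (demo_file, speakers[int(numpy.argmax(result))]))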
Code example #6
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'
import os

import tensorflow as tf
import tflearn

import speech_data as data
print("You are using tensorflow version " +
      tf.__version__)  #+" tflearn version "+ tflearn.version)
if tf.__version__ >= '0.12' and os.name == 'nt':
    print("sorry, tflearn is not ported to tensorflow 0.12 on windows yet!(?)")
    quit()  # why? works on Mac?

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

batch = data.wave_batch_generator(batch_size=1000,
                                  source=data.Source.DIGIT_WAVES,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # Two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
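A hedged final step, not in the excerpt: train and persist the speaker classifier, following the pattern of Code examples #2 and #10 (the filename is an assumption):

model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
model.save('speaker_classifier')  # filename is an assumption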
Code example #7
#!/usr/local/bin/python
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import os

import tflearn
import speech_data as data
import tensorflow as tf
import numpy

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

batch = data.wave_batch_generator(batch_size=1000, target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 3848])  # Two wave chunks

net = tflearn.fully_connected(net, 128)
net = tflearn.dropout(net, 0.5)

net = tflearn.fully_connected(net, 16)
net = tflearn.dropout(net, 0.8)

net = tflearn.fully_connected(net, 128)
net = tflearn.dropout(net, 0.5)

net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
Code example #8
File: train2.py  Project: llamalle/TERL2
#!/usr/bin/env python
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
#import pyaudio
import speech_data
import numpy
import sys

load = speech_data.wave_batch_generator(2404)
X, Y = next(load)

number_classes = 10  # Digits

tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
# number of CPU cores and fraction of GPU memory to allocate

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.0001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X,
          Y,
          validation_set=0.2,
          n_epoch=200,
          show_metric=True)
Code example #9
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute

# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

batch = speech_data.wave_batch_generator(10000,
                                         target=speech_data.Target.digits)
X, Y = next(batch)

number_classes = 10  # Digits

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)
# Overfitting okay for now

demo_file = "5_Vicki_260.wav"
demo = speech_data.load_wav_file(speech_data.path + demo_file)
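A hedged continuation (the excerpt stops after loading the demo clip): score it and recover the most likely digit with argmax over the softmax output.

result = model.predict([demo])
print("predicted digit for %s : %d" % (demo_file, int(numpy.argmax(result))))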
Code example #10
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on digits sample)

# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000,
                                  source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)


# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # Two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
Code example #11
# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000
# 98% Accuracy on training set in just a minute

# audio = pyaudio.PyAudio()
# # format=pyaudio.paFloat32
# format=pyaudio.paInt8
# # format=audio.get_format_from_width(f.getsampwidth())
# # out_stream = audio.open( format=format,channels = f.getnchannels(), rate=f.getframerate(), output= True)
# out_stream = audio.open( format=format,channels = 1, rate=48000, output= True)
# out_stream.start_stream()
# def play_pcm(data):
#   out_stream.write(data)

batch = speech_data.wave_batch_generator(1000)
X, Y = next(batch)

# Classification
# x = tflearn.input_data(shape=[None, 8192])
# net = tflearn.fully_connected(x, 64)
# net = tflearn.dropout(net, 0.5)
# net = tflearn.fully_connected(net, 10, activation='softmax')
# net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
# y = net.placeholder
# classifier = tflearn.DNN(net)


def model(net):
    # type: (layer.net) -> None
    # net.inputnet_data(shape=[None, 10])
    net.fully_connected(64)
Code example #12
import tflearn
from speech_data import wave_batch_generator, Target, load_wav_file, path
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute

# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

if __name__ == '__main__':
    batch = wave_batch_generator(10000, target=Target.digits)
    X, Y = next(batch)

    number_classes = 10  # Digits

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)
    # Overfitting okay for now

    demo_file = "5_Vicki_260.wav"
Code example #13
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on digits sample)

# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

target = data.Target.speaker
batch = data.wave_batch_generator(batch_size=1000, target=target)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
Code example #14
File: letsGo.py  Project: nafehshoaib/safeSpeak
import speech_data
import tflearn  # used below but missing from the original excerpt

import numpy as np

from constants import POSITIVE_PATH, NEGATIVE_PATH

learning_rate = 0.03
training_iters = 10  # steps
batch_size = 2

width = 1
height = 8192  # (max) length of utterance
classes = 2  # positive vs. negative

positive_batch = word_batch = speech_data.wave_batch_generator(
    batch_size, POSITIVE_PATH, [1.0, 0.0])
negative_batch = word_batch = speech_data.wave_batch_generator(
    batch_size, NEGATIVE_PATH, [0.0, 1.0])
#positive_batch = np.reshape(positive_batch, (1, 10, 8192))
#negative_batch = np.reshape(negative_batch, (1, 10, 8192))

# Network building
'''net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
'''

net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128 * 4, dropout=0.2)
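The excerpt ends at the LSTM layer. A hedged completion following the usual tflearn pattern seen throughout this page (everything past the lstm call is an assumption):

net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam',
                         learning_rate=learning_rate,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)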
Code example #15
# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000
# 98% Accuracy on training set in just a minute

# audio = pyaudio.PyAudio()
# # format=pyaudio.paFloat32
# format=pyaudio.paInt8
# # format=audio.get_format_from_width(f.getsampwidth())
# # out_stream = audio.open( format=format,channels = f.getnchannels(), rate=f.getframerate(), output= True)
# out_stream = audio.open( format=format,channels = 1, rate=48000, output= True)
# out_stream.start_stream()
# def play_pcm(data):
#   out_stream.write(data)

batch = speech_data.wave_batch_generator(1000)
X, Y = next(batch)

# Classification
# x = tflearn.input_data(shape=[None, 8192])
# net = tflearn.fully_connected(x, 64)
# net = tflearn.dropout(net, 0.5)
# net = tflearn.fully_connected(net, 10, activation='softmax')
# net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
# y = net.placeholder
# classifier = tflearn.DNN(net)

def model(net):
    # type: (layer.net) -> None
    # net.inputnet_data(shape=[None, 10])
    net.fully_connected(64)
Code example #16
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import os
import speech_data as data

# training and testing data sets
train_data = '/home/cc/Data/small-clean-train/'
test_data = '/home/cc/Data/small-clean-test/'

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
batch = data.wave_batch_generator(batch_size=1000,
                                  source=train_data,
                                  target=data.Target.speaker,
                                  speakers=speakers)
X, Y = next(batch)


# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # Two wave chunks
net = tflearn.fully_connected(net, 64)
# seems like a higher dropout rate works better -- why is this??
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
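A hedged follow-up, not in the excerpt: test_data is defined above but never used, so evaluation would presumably mirror the training pipeline (the wave_batch_generator arguments assume this fork's signature):

test_batch = data.wave_batch_generator(batch_size=1000,
                                       source=test_data,
                                       target=data.Target.speaker,
                                       speakers=speakers)
X_test, Y_test = next(test_batch)
print("test accuracy:", model.evaluate(X_test, Y_test))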