Example #1
    def neural_network(self):
        # Hyperparameters
        learning_rate = 0.0001  # lower learning rate = more accurate (but slower) training
        # For testing and NEA reasons this is set to a lower value so that training can complete; in practice this would be 300000 steps.
        training_iters = 150  # training steps
        batch_size = 64

        width = 20  # mfcc features
        height = 80  # (max) length of utterance
        classes = 10  # number of digits

        batch = word_batch = speech_data.mfcc_batch_generator(batch_size)
        X, Y = next(batch)
        trainX, trainY = X, Y
        testX, testY = X, Y  # overfit for now

        # Network building
        net = tflearn.input_data([None, width, height])
        net = tflearn.lstm(net, 128, dropout=0.8)
        net = tflearn.fully_connected(net, classes, activation='softmax')
        net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy')

        # Training
        # tflearn_logs is the folder location
        model = tflearn.DNN(net, tensorboard_verbose=0)
        for iters in range(training_iters):  # training_iters
            model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY), show_metric=True,
                      batch_size=batch_size)  # n_epoch is the number of epochs run per fit call
            _y = model.predict(X)
        model.save("/home/mitchell/Documents/speech_data_files/models/my_model")
        # save_model(model,"/home/mitchell/Documents/speech_data_files/models/my_model", overwrite=True, include_optimizer=True)
        print(_y)
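The example above evaluates on the same batch it trains on ("overfit for now"). A minimal sketch of a held-out split instead, in the spirit of the train_test_split usage further down this page, assuming scikit-learn is installed and reusing the X, Y batch, model and batch_size names from the example:

# Sketch only: hold out 20% of the batch for testing instead of reusing the training data.
import numpy as np
from sklearn.model_selection import train_test_split

X, Y = np.array(X), np.array(Y)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=0)
model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),
          show_metric=True, batch_size=batch_size)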
Example #2
def main():
    #Hyperparameters
    learning_rate = 0.0001  # lower learning rate = more accurate (but slower) training
    # For testing and NEA reasons this is set to a lower value so that training can complete; in practice this would be 300000 steps.
    training_iters = 500  #training steps
    batch_size = 64

    width = 20  # mfcc features
    height = 80  # (max) length of utterance
    classes = 10  # number of digits

    batch = word_batch = speech_data.mfcc_batch_generator(batch_size)
    X, Y = next(batch)
    trainX, trainY = X, Y
    testX, testY = X, Y #overfit for now

    # Network building
    net = tflearn.input_data([None, width, height])
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy')

    # Training
    #tflearn_logs is the folder location
    model = tflearn.DNN(net, tensorboard_verbose=0)
    for iters in range(training_iters): #training_iters
      model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY), show_metric=True,
              batch_size=batch_size)  # n_epoch is the number of epochs run per fit call
      _y=model.predict(X)
    model.save("/home/mitchell/Documents/speech_data_files/models/my_model")
    #save_model(model,"/home/mitchell/Documents/speech_data_files/models/my_model", overwrite=True, include_optimizer=True)
    print(_y)

#ai = SpeechRecognition()
#ai.neural_network()
#model = ai.load_model()
#ai.audio_to_spectrogram()
#ai.predict_model(model)

#ai.commands("open", "google", "", "", "")
#ai.commands("close", "google")
#ai.commands("use", "google", "youtube")
#ai.commands("search", "","commands_list.txt", "/home/mitchell/Documents/", "/home/mitchell/")
#ai.commands("play","","youtube nyan cat","","")
#cmd = Commands()

#speak_words("hello, i am Iris")
#speak_words("yeet")





#main()
#model = summ()
#graph_spectrogram(audiopath)
#predict_model(model, imgpath)
Example #3
gradients, variables = zip(*optimizer.compute_gradients(model_loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
model_optimizer = optimizer.apply_gradients(zip(gradients, variables))

#learning_rate = 0.001
#model_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(model_loss)

model_predict = tf.to_int32(
    tf.nn.ctc_beam_search_decoder(model_logits3d,
                                  model_seq_lengths,
                                  merge_repeated=False)[0][0])
model_predict_dense = tf.sparse_to_dense(model_predict.indices,
                                         model_predict.dense_shape,
                                         model_predict.values)

batch = speech_data.mfcc_batch_generator(batch_size,
                                         target=speech_data.Target.dense)
X, Y, batch_no = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y  #overfit for now

session = tf.Session()
session.run(tf.global_variables_initializer())


def dense_to_sparse(dense):
    idx = []
    vals = []
    shape = np.array(dense).shape
    lens = []
    for x in np.ndenumerate(dense):
        idx.append(x[0])
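The dense_to_sparse helper above is cut off at the example boundary; below is a minimal sketch of how such a converter is commonly written for the CTC ops used earlier in this example, assuming the indices/values/dense_shape triple expected by tf.SparseTensor (not the original author's implementation):

import numpy as np
import tensorflow as tf

def dense_to_sparse_sketch(dense):
    # Record the index and value of every entry of the dense label matrix.
    indices, values = [], []
    for index, value in np.ndenumerate(dense):
        indices.append(index)
        values.append(value)
    shape = np.array(dense).shape
    return tf.SparseTensorValue(indices=indices, values=values, dense_shape=shape)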
Example #4
import tflearn
import speech_data
import tensorflow as tf

# Define and preprocess the input data
learning_rate = 0.0001
training_iters = 300000  # number of iterations
batch_size = 64

width = 20  # MFCC features
height = 80  # maximum utterance length
classes = 10  # digit classes

# After framing the speech, taking logarithms, applying the inverse transform and so on, the resulting MFCCs represent the features of this utterance
batch = word_batch = speech_data.mfcc_batch_generator(
    batch_size)  # generate each batch of MFCC speech samples
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y

# Define the network model
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=learning_rate,
                         loss='categorical_crossentropy')

# Train the model
model = tflearn.DNN(net, tensorboard_verbose=0)
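The snippet above stops right after wrapping the network in tflearn.DNN; a training and save step in the same style as the other examples on this page might look like the following (the save path is just a placeholder):

model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),
          show_metric=True, batch_size=batch_size)
model.save("tflearn.lstm.model")  # placeholder path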
Example #5
from __future__ import division, print_function, absolute_import
import tflearn  # high-level library built on top of TensorFlow
import speech_data  # file that fetches data from the web
import tensorflow as tf  # Google's framework for machine learning

# hyperparameters
learning_rate = 0.0001  # the higher the learning rate, the faster the network trains; the lower, the more accurate the results
training_iters = 300000  # steps
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits

batch = word_batch = speech_data.mfcc_batch_generator(
    batch_size
)  # this function will download a set of wav files, each a recording of a different spoken digit, and returns the labelled speech files as a batch
X, Y = next(
    batch
)  # we split the batch into training and testing data with the next() function
trainX, trainY = X, Y  # we are using the same data for testing, so it will be able to recognize the speaker it was trained on, but not other speakers
testX, testY = X, Y  # overfit for now

# Network building
net = tflearn.input_data(
    [None, width, height]
)  # gateway for the data to be fed into the network; the parameter defines the shape of the input data
net = tflearn.lstm(
    net, 128, dropout=0.8
)  # building the next layer (128 is the number of neurons): too few - bad predictions, too many - overtraining
# dropout helps prevent overfitting by randomly turning off some of the neurons during training, so data is forced to find new paths through the network, allowing a more generalized model
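This example is truncated after the LSTM layer; following the pattern of the other snippets on this page, the remaining layers and the training call would plausibly be:

# Sketch of the rest of the network, mirroring the other examples here.
net = tflearn.fully_connected(net, classes, activation='softmax')  # softmax turns the outputs into class probabilities
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),
          show_metric=True, batch_size=batch_size)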
Example #6
def score_model(X, y):
    y_predicted = np.array(model.predict(X))
    bool_arr = np.argmax(y_predicted, axis=1) == np.argmax(np.array(y), axis=1)
    bool_sum = np.sum(bool_arr)
    return ('model accuracy: {}'.format(
        round(float(bool_sum) / bool_arr.shape[0], 2)))


LEARNING_RATE = 0.0001
BATCH_SIZE = 64
WIDTH = 20  # mfcc features
HEIGHT = 80  # (max) length of utterance
CLASSES = 10  # digits

data_set = speech_data.mfcc_batch_generator(2400)
X, Y = next(data_set)
X, Y = np.array(X), np.array(Y)

# get train, test, validation split
X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                  y_train_val,
                                                  test_size=0.2,
                                                  random_state=0)
# Network building
net = tf.input_data([None, WIDTH, HEIGHT])
net = tf.lstm(net, 128, dropout=0.8)
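Example #6 breaks off after the LSTM layer (its tf.input_data / tf.lstm calls suggest tflearn was imported under the alias tf); a sketch of how the network could be finished and scored with the score_model helper defined above:

# Sketch only; assumes the tf alias above refers to tflearn.
import tflearn

net = tflearn.fully_connected(net, CLASSES, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=LEARNING_RATE,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, y_train, n_epoch=10, validation_set=(X_val, y_val),
          show_metric=True, batch_size=BATCH_SIZE)
print(score_model(X_test, y_test))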
Example #7
# install dependencies
# pip install tensorflow
# pip install tflearn
# pip install future

# load libraries/packages
import tflearn
import speech_data

# define learning rate and number of training iterations: a tradeoff between speed and accuracy of learning
learning_rate = 0.0001
training_iterations = 300000

# use WAV data for speech sounds
batch = word_batch = speech_data.mfcc_batch_generator(64)

# break up into training and test data
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y

# create a multi-layer recurrent neural net, since speech is a sequence of sounds
# first layer -> built with tflearn; it takes the width of the data (number of MFCC features) and the height (max utterance length)
nnet = tflearn.input_data([None, 20, 80])
# second layer -> defines how many LSTM units and the dropout rate (dropout prevents overfitting by randomly dropping units during training)
nnet = tflearn.lstm(nnet, 128, dropout=.80)
# third layer -> a fully connected layer that maps to the 10 digit classes, with softmax to convert the outputs into probabilities
net = tflearn.fully_connected(nnet, 10, activation='softmax')
# fourth layer -> use regression to make a single prediction per utterance
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                         loss='categorical_crossentropy')
Example #8
test_step = 10
save_step = 100
learning_rate = 0.0001
# 0.0001 Step 300 Loss= 1.976625 Accuracy= 0.250 Time= 303s
# Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s takes time but works

training_iters = 300000 #steps
batch_size = 64

width=features=20 # mfcc features
height=max_length=80 # (max) length of utterance
classes=10 # digits

keep_prob=dropout=0.7

batch = speech_data.mfcc_batch_generator(batch_size,target=Target.digits) #
X,Y=next(batch)
# print(Y)
print(np.array(Y).shape)

# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x=inputs=tf.placeholder(tf.float32, shape=(batch_size,features,max_length))
# inputs = tf.transpose(inputs, [0, 2, 1]) #  inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
inputs = tf.transpose(inputs, [2, 0, 1]) # [max_time, batch_size, features] to split:
# Split data because rnn cell needs a list of inputs for the RNN inner loop
inputs = tf.split(axis=0, num_or_size_splits=max_length, value=inputs)  # n_steps * (batch_size, features)

num_hidden = 100 #features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
# rnn=tf.nn.rnn(cell,inputs)
# rnn=tf.nn.dynamic_rnn(cell,inputs)
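The RNN call itself is left commented out above; one way to wire the split inputs into the LSTM cell with the TF 1.x static API and read out a digit prediction (a sketch, not the original author's code; W, b and logits are new names):

# Each element of `inputs` has shape (1, batch_size, features), so squeeze the leading axis.
outputs, state = tf.nn.static_rnn(cell,
                                  [tf.squeeze(step, axis=0) for step in inputs],
                                  dtype=tf.float32)
# Classify the spoken digit from the last timestep's output.
W = tf.Variable(tf.random_normal([num_hidden, classes]))
b = tf.Variable(tf.random_normal([classes]))
logits = tf.matmul(outputs[-1], W) + b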
Example #9
  smodel_predicts_remove_others = []
  for predicts in smodel_predicts:
    digits = tf.split(predicts, nclasses, axis=1)
    remove_others = []
    for i in range(len(digits)):
      others = digits[:i] + digits[i+1:]
      remove_others.append(digits[i] - tf.add_n(others))
    smodel_predicts_remove_others.append(remove_others)
  smodel_predicts_remove_others = [ tf.concat(predicts, 1) for predicts in smodel_predicts_remove_others ]
  smodel_predicts = smodel_predicts_remove_others

model_loss = tf.losses.softmax_cross_entropy(model_output, model_logits)
opt = tf.train.AdamOptimizer(learning_rate)
model_train = opt.minimize(model_loss)

batch = speech_data.mfcc_batch_generator(batch_size, generate_separator = True)
#batch = speech_data.mfcc_sequence_batch_generator(batch_size, target=speech_data.Target.dense)
X, Y, batch_no = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y #overfit for now

session = tf.Session()
session.run(tf.global_variables_initializer())
epoch = 0
epochs = 20
while epoch < epochs:
  epoch += 1
  print("epoch {0}".format(epoch))
  num_batches = int(len(trainX) / batch_size)
  batch_no = 1  # set to get in the loop
  while batch_no > 0:
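    # The loop body is cut off at the example boundary; a typical training step, sketched here,
    # would run the train op defined above (model_input and model_output are placeholder names
    # assumed to be defined earlier in the original file):
    batch_x = trainX[(batch_no - 1) * batch_size:batch_no * batch_size]
    batch_y = trainY[(batch_no - 1) * batch_size:batch_no * batch_size]
    _, loss_value = session.run([model_train, model_loss],
                                feed_dict={model_input: batch_x, model_output: batch_y})
    batch_no = batch_no + 1 if batch_no < num_batches else 0  # exit after the last batch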
Example #10
#!/usr/bin/env python
import tensorflow as tf
import tflearn

import speech_data

learning_rate = 0.0001
training_iters = 300000  # steps
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits

batch = word_batch = speech_data.mfcc_batch_generator(batch_size)

# Network building
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128*4, dropout=0.5)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)

## add this "fix" for tensorflow version errors
for x in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): tf.add_to_collection(tf.GraphKeys.VARIABLES, x )

# Training

while training_iters > 0:
	training_iters -= 1
	trainX, trainY = next(batch)
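	# The loop body is truncated here; in the style of the other examples on this page, each pass
	# would plausibly fit the model on the freshly drawn batch (sketch only):
	model.fit(trainX, trainY, n_epoch=10, show_metric=True, batch_size=batch_size)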
Example #11

# hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)
# curses is not supported on this machine (please install/reinstall curses for an optimal experience)

# learning rate: the higher the learning rate, the faster the network trains;
# the lower the learning rate, the slower the network trains, but the more accurate it is
learning_rate = 0.0001
training_iters = 300000  # steps we want to train for
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits (the number of digits that we are training on: 0-9)

batch = word_batch = speech_data.mfcc_batch_generator(batch_size)  # downloads the .wav files, each a recording of a different spoken number
X, Y = next(batch)  # labeled speech files
trainX, trainY = X, Y
testX, testY = X, Y #overfit for now

# uses Recurrent Neural Network (RNN)
#tensor is a multi dimensional array of data
# Network building

# width is the number of features extracted from the utterances by our speech helper class
# height is the max length of each utterance
net = tflearn.input_data([None, width, height])

#128 is the number of neurons
# dropout helps prevent overfitting by turning off neurons during training, which allows for a more generalized model
net = tflearn.lstm(net, 128, dropout=0.8)  # LSTM is a type of network that remembers what it has learned; used for state-of-the-art speech recognition
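The network here also stops at the LSTM layer; once it is completed and trained as in the other snippets, a single prediction is usually decoded with argmax, for example:

# Sketch only: decode one utterance's prediction into a digit (assumes a trained tflearn model
# named `model`, built as in the other examples on this page).
import numpy as np
pred = model.predict([testX[0]])  # list with one vector of class probabilities
print("predicted digit:", np.argmax(pred))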
Example #12
from speech_data import Source,Target

#  LESS IS MORE! :
# 0.001  Step 1000 Loss= 2.292103 Accuracy= 0.100 Time= 163s 			Test Accuracy:  0.1 too high vs
# 0.0001  Step 1420 Loss= 1.794861 Accuracy= 0.600 Time= 231
# 0.00001 Step 1700 Loss= 0.575172 Accuracy= 1.000 Time= 274s 			Test Accuracy:  0.8
learning_rate = 0.00001
training_iters = 300000 #steps
batch_size = 64


height=20 # mfcc features
width=80 # (max) length of utterance
classes=10 # digits

batch = word_batch = speech_data.mfcc_batch_generator(batch_size, source=Source.DIGIT_WAVES, target=Target.digits)
X, Y = next(batch)
print("batch shape " + str(np.array(X).shape))

shape=[-1, height, width, 1]
# shape=[-1, width,height, 1]

# BASELINE toy net
def simple_dense(net): # best with lr ~0.001
	# type: (layer.net) -> None
	# net.dense(hidden=200,depth=8,dropout=False) # BETTER!!
	# net.reshape(shape)  # Reshape input picture
	net.dense(400, activation=tf.nn.tanh)# 0.99 YAY
	# net.denseNet(40, depth=4)
	# net.classifier() # auto classes from labels
	return
Example #13
def lstm_model_kfold(name, lstm1_n, lstm2_n, fc3_n):

    # Results file manipulation:
    # name = input("Name of the model for future saving: ")
    file = open(name + "_results.txt", "w")
    file.write(
        "Long Short-Therm Memory Recurrent Neural Network for speech recognition.\n"
    )
    file.write("Training with K-Fold.\n")
    file.write("Network name: " + name + ".\n")
    file.write("Owner: Gabriel Furtado Lins Melo.\n\n")

    print("Importing libraries...")
    import numpy as np
    import tflearn
    import speech_data
    import os
    import time
    import datetime
    print("Imported libraries!")

    # Hyperparameters:
    learning_rate = 0.001
    lstm1_neurons = lstm1_n
    lstm2_neurons = lstm2_n
    fc3_neurons = fc3_n
    # lstm_neurons = 128
    dropout = 0.8
    activation = 'softmax'
    optimizer = 'adam'
    loss_func = 'categorical_crossentropy'

    # Training Features:
    kfold_k = 5
    training_iters = 100  # Multiply by 10 to get the total number of epochs

    # Dataset Features:
    dataset = 2400
    split_p = 0.9
    batchsize = int(split_p * dataset)  # 2160
    val_sets = int(batchsize / kfold_k)  # 432
    classes = 10
    width = 20
    height = 80
    print("Hyperparameters were set!")

    # Loading batchs and defining dataset / testing set:
    print("Loading batch...")
    batch = speech_data.mfcc_batch_generator(dataset)
    wav_files, wav_labels = next(batch)
    print("Loading training and testing sets...")
    testX, testY = wav_files[batchsize:], wav_labels[batchsize:]  # Testing set.
    datasetX, datasetY = wav_files[:batchsize], wav_labels[:batchsize]  # Dataset.

    # Loading or building model:
    print("Building/Loading neural network structures...")
    net = tflearn.input_data([None, width, height])
    net = tflearn.lstm(net, lstm1_neurons, dropout=dropout, return_seq=True)
    net = tflearn.lstm(net, lstm2_neurons)
    net = tflearn.fully_connected(net, fc3_neurons)
    net = tflearn.fully_connected(net, classes,
                                  activation=activation)  # Output layer
    net = tflearn.regression(net,
                             optimizer=optimizer,
                             learning_rate=learning_rate,
                             loss=loss_func)
    model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='Graphs')
    print("Built net!")

    # Validation with K-Fold:
    trainX = []
    trainY = []
    validationX = []
    validationY = []
    validation_accuracy = 0

    # Saving the initial weights for restarting them in k-fold
    model.save('tflearn.lstm.kfold')

    # Results file manipulation:
    fprintline(file)
    file.write("\nNetwork Layers:\n\n")
    file.write("	Input data (size: " + str(width) + " x " + str(height) +
               ")\n")
    file.write("	Lstm layer 1 (neurons: " + str(lstm1_neurons) +
               "), (dropout: " + str(dropout) + ")\n")
    file.write("	Lstm layer 2 (neurons: " + str(lstm2_neurons) +
               "), (dropout: None)\n")
    file.write("	Fully connected 1 (neurons: " + str(fc3_neurons) + ")\n")
    file.write("	Fully connected 2 (output neurons: " + str(classes) +
               "), (activation: " + str(activation) + ")\n")
    file.write("	Regression layer (optimizer: " + str(optimizer) +
               "), (loss function: " + str(loss_func) + "), (learning rate: " +
               str(learning_rate) + ")\n")
    file.write("\nDataset Features:\n\n")
    file.write("	Dataset: " + str(dataset) + "\n")
    file.write("	Batch size used in training: " + str(batchsize) + "\n")
    file.write("	Amount of test files: " + str(dataset - batchsize) + "\n")
    file.write("\nTraining Features:\n\n")
    file.write("	K-Fold \"K\": " + str(kfold_k) + "\n")
    file.write("	Epochs: " + str(training_iters * 10) + "\n\n")
    fprintline(file)

    # K-Fold training:
    file.write("\nK-Fold results:\n\n")
    printline()
    print("\nInitiating " + str(kfold_k) + "-Fold training.\n")
    printline()
    kname = "K-Fold_" + name
    start_time = time.time()
    for i in range(kfold_k):
        # Fixing sets
        validationX, validationY = datasetX[(i * val_sets):(
            (i + 1) * val_sets)], datasetY[(i * val_sets):((i + 1) * val_sets)]
        trainX, trainY = datasetX.copy(), datasetY.copy()
        trainX[(i * val_sets):((i + 1) * val_sets)] = []
        trainY[(i * val_sets):((i + 1) * val_sets)] = []
        printline()
        # Fold number printing:
        lista = []
        for j in range(kfold_k):
            lista.append('*VAL.*') if j == i else lista.append('TRAIN')
        print("\nK-Fold \"K\":", (i + 1))
        print("Dataset:", lista)
        print()
        printline()
        # Actual training:
        minibatch = batchsize - val_sets  # 1728
        for j in range(int(training_iters)):  # Each iteration runs 10 training epochs.
            model.fit(trainX,
                      trainY,
                      n_epoch=10,
                      validation_set=None,
                      show_metric=False,
                      batch_size=minibatch,
                      run_id=kname)
        # Printing validation accuracy for each fold:
        accuracy = 0
        preds = model.predict(validationX)
        for j in range(len(validationX)):
            if (preds[j].tolist().index(max(
                    preds[j])) == validationY[j].tolist().index(
                        max(validationY[j]))):
                accuracy += 1
        accuracy /= len(validationX)
        printline()
        print("Fold " + str(i + 1) + " accuracy: %0.1f%%" % (accuracy * 100))
        printline()
        file.write("Fold " + str(i + 1) + " accuracy: %0.1f%%\n" %
                   (accuracy * 100))
        validation_accuracy += accuracy
        model.load('tflearn.lstm.kfold')
    # Printing mean validation accuracy:
    validation_accuracy /= kfold_k
    printline()
    print("\nValidation phase done!")
    print("Mean validation accuracy: %0.1f%%\n" % (validation_accuracy * 100))
    file.write("\nK-Fold validation phase mean accuracy: %0.1f%%\n\n" %
               (validation_accuracy * 100))
    fprintline(file)
    printline()
    print()

    # Final Training phase:
    printline()
    print("\nInitiating final training phase.\n")
    printline()
    file.write("\nFinal training phase results:\n\n")
    # After K-Fold training phase (Using all dataset):
    for j in range(int(training_iters)):  # Each iteration runs 10 training epochs.
        # model.fit(datasetX, datasetY, n_epoch=10, validation_set=(testX,testY), show_metric=True, batch_size=batchsize, run_id=kname)
        model.fit(datasetX,
                  datasetY,
                  n_epoch=10,
                  validation_set=None,
                  show_metric=False,
                  batch_size=batchsize,
                  run_id=kname)
    # Printing predictions:
    printline()
    _Y = model.predict(testX)
    print("\nPredictions using testing set:\n")
    printline()
    hits = 0
    pred_matrix = []
    for i in range(len(testX)):
        prediction = []
        target = []
        for j in range(len(_Y[i])):
            prediction.append(str(round(_Y[i][j] * 100, 1)) +
                              "%")  # Making predictions readable
            target.append('Nope' if testY[i][j] == 0 else
                          ' Yes ')  # Making targets readable
        if (_Y[i].tolist().index(max(_Y[i])) == testY[i].tolist().index(
                max(testY[i]))):
            hits += 1
        if ((i + 1) % 10 == 0):
            print(
                "Prediction " + str(i + 1) + ":", prediction
            )  # Prediction of the trained network (each list contains the probability of each spoken digit (class))
            print("Target " + str(i + 1) + "    :", target)  # Targets
            printline()
    accuracy = hits / len(testX)
    print("Testing set size: %d" % len(testX))
    print("Hits (right predictions): %d" % hits)
    print("Testing accuracy: %0.1f%%" % (accuracy * 100))
    file.write("Testing set size: %d\n" % len(testX))
    file.write("Hits (right predictions): %d\n" % hits)
    file.write("Testing accuracy: %0.1f%%\n\n" % (accuracy * 100))
    fprintline(file)
    printline()
    end_time = time.time()
    total_time = end_time - start_time
    string_time = time.strftime("%Hh%Mm%Ss", time.gmtime(total_time))
    file.write("\nTotal training time:\n")
    file.write(string_time)

    # Saving:
    file.close()
    model.save('tflearn.lstm.model_' + name)
    print("Model saved with name: tflearn.lstm.model_" + name)
    print("Results saved as: " + name + "_results.txt")
Example #14
else:
    model_fc_w = tf.get_variable("fc_w", shape=(height * 128, 10))
model_fc_b = tf.get_variable("fc_b", shape=(10))
model_logits = tf.matmul(rnn_output, model_fc_w) + model_fc_b

model_predict = tf.nn.softmax(model_logits)
model_l2_loss = tf.nn.l2_loss(model_fc_w) + tf.nn.l2_loss(model_fc_b)
model_loss = tf.losses.softmax_cross_entropy(model_output,
                                             model_logits) + model_l2_loss
#opt = tf.train.AdamOptimizer(learning_rate)
opt = tf.train.GradientDescentOptimizer(learning_rate)
model_train = opt.minimize(model_loss)

# Training

batch = speech_data.mfcc_batch_generator(batch_size, n_mfcc=width)
X, Y, batch_no = next(batch)
trainX, trainY = X, Y


class Persistance:

    MODEL_NAME = 'tf_reverse'

    def __init__(self):
        self.saver = tf.train.Saver()
        self.checkpoint_path = "./saves/{0}.ckpt".format(self.MODEL_NAME)
        self.input_graph_path = "./saves/{0}.pbtxt".format(self.MODEL_NAME)
        self.pickle_file = "./saves/pickle_file"

    def load_graph(self, session):
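        # The body of load_graph is truncated at the example boundary; a restore call like the
        # line below is one plausible implementation (a sketch, not the original code):
        self.saver.restore(session, self.checkpoint_path)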
Example #15
# Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s takes time but works

training_iters = 300000  # steps
batch_size = 64

width = features = 20  # mfcc input features
height = max_input_length = 80  # (max) length of input utterance (mfcc slices)
classes = num_characters = 32
max_word_length = 20  # max length of output (characters per word)
# classes=10 # digits

keep_prob = dropout = 0.7

# batch = speech_data.mfcc_batch_generator(batch_size, target=Target.word)
batch = speech_data.mfcc_batch_generator(batch_size,
                                         source=Source.WORD_WAVES,
                                         target=Target.hotword)
X, Y = next(batch)
print("lable shape", np.array(Y).shape)

# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = inputX = inputs = tf.placeholder(tf.float32,
                                     shape=(batch_size, features,
                                            max_input_length))
# inputs = tf.transpose(inputs, [0, 2, 1]) #  inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
inputs = tf.transpose(inputs,
                      [2, 0, 1])  # [max_time, batch_size, features] to split:
# Split data because rnn cell needs a list of inputs for the RNN inner loop
inputs = tf.split(axis=0, num_or_size_splits=max_input_length,
                  value=inputs)  # n_steps * (batch_size, features)
Example #16
import tflearn
import speech_data
import tensorflow as tf
import numpy as np

data_path = r"D:\python_code\tensorflow_works\testx\pre"
learning_rate = 0.001
batch_size = 1

width = 20  # mfcc features
height = 29  # (max) length of utterance
classes = 10  # digits
batch = word_batch = speech_data.mfcc_batch_generator(
    batch_size, height, data_path)  # pass in the batch size and get back a generator; it is evaluated lazily, producing one batch each time it is used.
#print('batch =',batch)
#X, Y = next(batch)

# Network building
net = tflearn.input_data([None, width, height])

#net = tflearn.lstm(net,256, dropout=0.8)
net = tflearn.lstm(net, 2048)
#jie bi jie bi
net = tflearn.fully_connected(net, classes, activation='softmax')

net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=learning_rate,
                         loss='categorical_crossentropy')
# Training
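The training code after the comment above is missing; in the style of the other tflearn examples on this page it would plausibly be:

# Sketch only, mirroring the other examples here.
model = tflearn.DNN(net, tensorboard_verbose=0)
X, Y = next(batch)
model.fit(X, Y, n_epoch=10, show_metric=True, batch_size=batch_size)
model.save("tflearn.lstm.model")  # placeholder path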
Example #17
# 0.0001 Step 300 Loss= 1.976625 Accuracy= 0.250 Time= 303s
# Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s takes time but works

training_iters = 300000  # steps
batch_size = 64

width = features = 20  # mfcc input features
height = max_input_length = 80  # (max) length of input utterance (mfcc slices)
classes = num_characters = 32
max_word_length = 20  # max length of output (characters per word)
# classes=10 # digits

keep_prob = dropout = 0.7

# batch = speech_data.mfcc_batch_generator(batch_size, target=Target.word)
batch = speech_data.mfcc_batch_generator(batch_size, source=Source.WORD_WAVES, target=Target.hotword)
X, Y = next(batch)
print("lable shape", np.array(Y).shape)

# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = inputX = inputs = tf.placeholder(tf.float32, shape=(batch_size, features, max_input_length))
# inputs = tf.transpose(inputs, [0, 2, 1]) #  inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
inputs = tf.transpose(inputs, [2, 0, 1])  # [max_time, batch_size, features] to split:
# Split data because rnn cell needs a list of inputs for the RNN inner loop
inputs = tf.split(axis=0, num_or_size_splits=max_input_length, value=inputs)  # n_steps * (batch_size, features)

num_hidden = 100  # features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
# cell = tf.nn.rnn_cell.EmbeddingWrapper(num_hidden, state_is_tuple=True)

# in many cases it may be more efficient to not use this wrapper,
Example #18
test_step = 10
save_step = 100
learning_rate = 0.0001
# 0.0001 Step 300 Loss= 1.976625 Accuracy= 0.250 Time= 303s
# Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s takes time but works

training_iters = 300000  #steps
batch_size = 64

width = features = 20  # mfcc features
height = max_length = 80  # (max) length of utterance
classes = 10  # digits

keep_prob = dropout = 0.7

batch = speech_data.mfcc_batch_generator(batch_size, target=Target.digits)  #
X, Y = next(batch)
# print(Y)
print(np.array(Y).shape)

# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = inputs = tf.placeholder(tf.float32,
                            shape=(batch_size, features, max_length))
# inputs = tf.transpose(inputs, [0, 2, 1]) #  inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
inputs = tf.transpose(inputs,
                      [2, 0, 1])  # [max_time, batch_size, features] to split:
# Split data because rnn cell needs a list of inputs for the RNN inner loop
inputs = tf.split(0, max_length, inputs)  # n_steps * (batch_size, features)

num_hidden = 100  #features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
Example #19
from speech_data import Source,Target

#  LESS IS MORE! :
# 0.001  Step 1000 Loss= 2.292103 Accuracy= 0.100 Time= 163s 			Test Accuracy:  0.1 too high vs
# 0.0001  Step 1420 Loss= 1.794861 Accuracy= 0.600 Time= 231
# 0.00001 Step 1700 Loss= 0.575172 Accuracy= 1.000 Time= 274s 			Test Accuracy:  0.8
learning_rate = 0.00001
training_iters = 300000 #steps
batch_size = 64


height=20 # mfcc features
width=80 # (max) length of utterance
classes=10 # digits

batch = word_batch = speech_data.mfcc_batch_generator(batch_size, source=Source.DIGIT_WAVES, target=Target.digits)
X, Y = next(batch)
print("batch shape " + str(np.array(X).shape))

shape=[-1, height, width, 1]
# shape=[-1, width,height, 1]

# BASELINE toy net
def simple_dense(net): # best with lr ~0.001
	# type: (layer.net) -> None
	# net.dense(hidden=200,depth=8,dropout=False) # BETTER!!
	# net.reshape(shape)  # Reshape input picture
	net.dense(400, activation=tf.nn.tanh)# 0.99 YAY
	# net.denseNet(40, depth=4)
	# net.classifier() # auto classes from labels
	return
Example #20
def lstm_model(existing_model=False,
               model_name="",
               usage="train",
               lstm1_n=100,
               lstm2_n=100):

    if (usage == 'train' and existing_model == False):
        name = input("Name of the model: ")
    else:
        name = model_name

    print("Importing libraries...")
    import numpy as np
    import tflearn
    # TFlearn is a modular and transparent deep learning library built on top of Tensorflow.
    # It was designed to provide a higher-level API to TensorFlow in order to facilitate and
    # speed-up experimentations, while remaining fully transparent and compatible with it.
    import speech_data  # speech_data will fetch data from web and format it for us.
    import matplotlib.pyplot as plt
    print("Imported libraries!")

    # Hyperparameters:
    # are the variables which determines the network structure (Eg: Number of Hidden Units)
    # and the variables which determine how the network is trained (Eg: Learning Rate).
    learning_rate = 0.001  # Original is 0.0001
    # Learning rate is a hyper-parameter that controls how much we are adjusting the weights
    # of our network with respect the loss gradient. The greater the learning rate the faster
    # our network trains, the lower the learning rate the more accurate our network predicts.
    training_iters = 100  # (Original is 30000)
    # Since spoken digits are a sequence of sound waves, we should use a recurrent neural network
    # because of its ability to process sequences. Let's build it below:
    width = 20  # mfcc features
    height = 80  # (max) length of utterance
    classes = 10  # number of target classes used in the layers
    split_p = 0.9  # split percentage (size of training set). Testing set will be 1 - split_p
    dataset = 2400  # Size of dataset
    batchsize = int(split_p * dataset)  # Batch used for training
    lstm1_neurons = lstm1_n  # Number of lstm neurons
    lstm2_neurons = lstm2_n
    dropout = 0.8  # amount of dropout (disabling neurons during training)
    print("Hyperparameters were set!")

    print("Loading batch...")
    if (usage == 'train'):
        batch = speech_data.mfcc_batch_generator(dataset)
        # This function (mfcc_batch_generator(batch_size)) will download (if needed)
        # a set of WAV files with recordings of spoken digits and a label with each digit. Having
        # the files, it will randomly load the batches (with .wav files and their respective labels)
        # Original batch_size: 64
        print("Loading training and testing sets...")
        wav_files, wav_labels = next(batch)  # Splitting the files and their labels with Python's built-in next() function.
        trainX, trainY = wav_files[:batchsize], wav_labels[:batchsize]  # Training set gets the first 90% of the dataset
        testX, testY = wav_files[batchsize:], wav_labels[batchsize:]  # Testing set gets the last 10% of the dataset
        print("Training and testing sets were loaded!")
    elif (usage == 'test'):
        batch = speech_data.mfcc_batch_generator(200)
        testX, testY = next(batch)

    # Overfitting refers to a model that models the “training data” too well. Overfitting happens
    # when a model learns the detail and noise in the training data to the extent that it
    # negatively impacts the performance of the model on new data.

    # Loading or building model:
    print("Building/Loading neural network structures...")
    net = tflearn.input_data([None, width, height])
    # The input_data is a layer that will be used as the input layer.
    # For example, if the network wants an input with the shape [None, img_size,img_size,1]
    # meaning in human language:
    # None - many or a number of or how many images of.(batch size)
    # img_size X img_size - dimensions of the image.
    # 1 - with one color channel.
    net = tflearn.lstm(
        net, lstm1_neurons, dropout=dropout,
        return_seq=True)  # First parameter is net1, since we are feeding
    # tensors from one layer to the next. 128 means the number of neurons, too few would lead to
    # bad predictions, and too many would overfit the net. The third parameter, dropout, says how
    # much dropout we want. Dropout helps prevent overfitting by randomly turning off some
    # neurons during training, so data is forced to find new paths between layers, allowing for
    # a more generalized model.
    # lstm is a type of RNN that can remember everything that is fed, outperforming regular
    # recurrent neural networks.
    net = tflearn.lstm(net, lstm2_neurons)
    net = tflearn.fully_connected(
        net, classes, activation='softmax')  # The activation function
    # softmax will convert numerical data into probabilities.
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=learning_rate,
                             loss='categorical_crossentropy')
    # The output layer is a regression, which will output a single predicted number for our utterance.
    # The adam optimizer minimize the categorical cross entropy loss function over time.
    model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='Graphs')

    # if(existing_model==True): model.load('tflearn.lstm.model_' + model_name) # Load weights, if wanted.
    print("Built net!")

    # Training and saving model:
    if (usage == "train"):
        print("\nStarting the training!")
        for i in range(int(training_iters)):  # Each iteration runs 10 training epochs.
            treino = model.fit(trainX,
                               trainY,
                               n_epoch=10,
                               validation_set=(testX, testY),
                               show_metric=True,
                               batch_size=batchsize,
                               run_id=name)
        print("Network has been successfully trained!")

        model.save('tflearn.lstm.model_' + name)
        print("Model saved with name: tflearn.lstm.model_" + name)

    # Printing predictions:
    if (usage == "test"):
        _Y = model.predict(testX)
        print("\nPredictions:")
        printline()
        accuracy = 0
        for i in range(len(testX)):
            # if(i < int(len(validation_labels)*split_p)):
            # 	print("Training sample")
            # else: print("Validation sample")
            prediction = []
            target = []
            for j in range(len(_Y[i])):
                prediction.append(str(round(_Y[i][j] * 100, 1)) +
                                  "%")  # Making predictions readable
                target.append('Nope' if testY[i][j] == 0 else
                              ' Yes ')  # Making targets readable
            if (_Y[i].tolist().index(max(_Y[i])) == testY[i].tolist().index(
                    max(testY[i]))):
                accuracy += 1
            print(
                "Prediction " + str(i + 1) + ":", prediction
            )  # Prediction of the trained network (each list contains the probability of each spoken digit (class))
            print("Target " + str(i + 1) + "    :", target)  # Targets
            printline()
        accuracy /= len(testX)
        print("TEST ACCURACY: %.1f%%" % (accuracy * 100))
        printline()
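A hypothetical training call, with placeholder layer sizes (the function prompts for a model name when training from scratch):

lstm_model(usage="train", lstm1_n=128, lstm2_n=128)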
Example #21
#if tf.__version__ >= '0.12' and os.name == 'nt':
#	print("sorry, tflearn is not ported to tensorflow 0.12 on windows yet!(?)")
#	quit() # why? works on Mac?

speakers = data.get_speakers()

number_classes = len(speakers)
#print("speakers",speakers)

file = open('spkrs_list.txt', 'w')
for line in speakers:
    file.write(line + '\n')
file.close()

batch = data.mfcc_batch_generator(batch_size=2000,
                                  source=data.Source.DIGIT_WAVES,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

input_layer = tflearn.input_data(shape=[
    20, 640
])  #Two wave chunks, the time dimension of the wav file has been modified

fc1 = tflearn.fully_connected(input_layer, 128, name='fc1')
bn1 = tflearn.batch_normalization(fc1, name='bn1')
dp1 = tflearn.dropout(bn1, 0.5, name='dp1')
ac1 = tflearn.activation(bn1, activation='softmax', name='ac1')
#net = tflearn.fully_connected(net, 400, activation='softmax')
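This speaker-classification snippet stops mid-network (note that the dropout layer dp1 is defined but not wired into the graph as written); a sketch of how it might be finished in the same style, taking the output size from number_classes above (learning rate and batch size are assumptions):

out = tflearn.fully_connected(ac1, number_classes, activation='softmax')
out = tflearn.regression(out, optimizer='adam', learning_rate=0.0001,
                         loss='categorical_crossentropy')
model = tflearn.DNN(out, tensorboard_verbose=0)
model.fit(X, Y, n_epoch=10, show_metric=True, batch_size=64)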
Example #22
from __future__ import division, print_function, absolute_import
import tflearn
import speech_data
import tensorflow as tf

learning_rate = 0.0001
training_iters = 30  # steps
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits

batch = word_batch = speech_data.mfcc_batch_generator(batch_size)
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y  #overfit for now

# Network building
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=learning_rate,
                         loss='categorical_crossentropy')
# Training

### add this "fix" for tensorflow version errors
col = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
for x in col:
    tf.add_to_collection(tf.GraphKeys.VARIABLES, x)
Example #23
#

"""util  tflearn"""
import tflearn
import speech_data
import tensorflow as tf

learning_rate = 0.0001
training_iters = 300000  # number of iterations
batch_size = 64

width = 20  # MFCC features
height = 80  # max length of utterance
classes = 10  # classes of spoken numbers

batch = word_batch = speech_data.mfcc_batch_generator(batch_size) #generate each batch of MFCC
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y

#define LSTM model
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                         loss='categorical_crossentropy')

#train model and save it
model = tflearn.DNN(net, tensorboard_verbose=0)
while 1:  #training_iters
    model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),