def neural_network(
        self,
        learning_rate=0.0001,
        training_iters=150,
        batch_size=64,
        model_path="/home/mitchell/Documents/speech_data_files/models/my_model"):
    """Train the spoken-digit LSTM classifier and save it to ``model_path``.

    A single batch is drawn from ``speech_data.mfcc_batch_generator`` and is
    reused as both the training and the validation set, so the metric shown
    during training only tells how well the network fits that one batch.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        training_iters: Number of ``model.fit`` rounds; each round runs 10
            epochs.  The default is deliberately low so a run completes; the
            original note says real training would use 300000 steps.
        batch_size: Number of samples requested from the generator, also used
            as the mini-batch size for ``model.fit``.
        model_path: Location passed to ``model.save``.
    """
    width = 20  # mfcc features
    height = 80  # (max) length of utterance
    classes = 10  # amount of digits

    batch = speech_data.mfcc_batch_generator(batch_size)
    X, Y = next(batch)
    trainX, trainY = X, Y
    testX, testY = X, Y  # same data on purpose: overfit for now

    # Network building
    net = tflearn.input_data([None, width, height])
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, classes, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=learning_rate,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    for _ in range(training_iters):
        # n_epoch is the number of epochs run per loop iteration.
        model.fit(trainX,
                  trainY,
                  n_epoch=10,
                  validation_set=(testX, testY),
                  show_metric=True,
                  batch_size=batch_size)

    # Predict once, after training, so `_y` exists even when
    # training_iters is 0.
    _y = model.predict(X)
    model.save(model_path)
    print(_y)
def main(learning_rate=0.0001,
         training_iters=500,
         batch_size=64,
         model_path="/home/mitchell/Documents/speech_data_files/models/my_model"):
    """Train the spoken-digit LSTM classifier and save it to ``model_path``.

    A single batch is drawn from ``speech_data.mfcc_batch_generator`` and is
    reused as both the training and the validation set, so the metric shown
    during training only tells how well the network fits that one batch.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        training_iters: Number of ``model.fit`` rounds; each round runs 10
            epochs.  The default is deliberately low so a run completes; the
            original note says real training would use 300000 steps.
        batch_size: Number of samples requested from the generator, also used
            as the mini-batch size for ``model.fit``.
        model_path: Location passed to ``model.save``.
    """
    width = 20  # mfcc features
    height = 80  # (max) length of utterance
    classes = 10  # amount of digits

    batch = speech_data.mfcc_batch_generator(batch_size)
    X, Y = next(batch)
    trainX, trainY = X, Y
    testX, testY = X, Y  # same data on purpose: overfit for now

    # Network building
    net = tflearn.input_data([None, width, height])
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, classes, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=learning_rate,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    for _ in range(training_iters):
        # n_epoch is the number of epochs run per loop iteration.
        model.fit(trainX,
                  trainY,
                  n_epoch=10,
                  validation_set=(testX, testY),
                  show_metric=True,
                  batch_size=batch_size)

    # Predict once, after training, so `_y` exists even when
    # training_iters is 0.
    _y = model.predict(X)
    model.save(model_path)
    print(_y)


# Scratch calls kept (disabled) from the original file:
#ai = SpeechRecognition()
#ai.neural_network()
#model = ai.load_model()
#ai.audio_to_spectrogram()
#ai.predict_model(model)
#ai.commands("open", "google", "", "", "")
#ai.commands("close", "google")
#ai.commands("use", "google", "youtube")
#ai.commands("search", "","commands_list.txt", "/home/mitchell/Documents/", "/home/mitchell/")
#ai.commands("play","","youtube nyan cat","","")
#cmd = Commands()
#speak_words("hello, i am Iris")
#speak_words("yeet")
#main()
#model = summ()
#graph_spectrogram(audiopath)
#predict_model(model, imgpath)
gradients, variables = zip(*optimizer.compute_gradients(model_loss)) gradients, _ = tf.clip_by_global_norm(gradients, 5.0) model_optimizer = optimizer.apply_gradients(zip(gradients, variables)) #learning_rate = 0.001 #model_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(model_loss) model_predict = tf.to_int32( tf.nn.ctc_beam_search_decoder(model_logits3d, model_seq_lengths, merge_repeated=False)[0][0]) model_predict_dense = tf.sparse_to_dense(model_predict.indices, model_predict.dense_shape, model_predict.values) batch = speech_data.mfcc_batch_generator(batch_size, target=speech_data.Target.dense) X, Y, batch_no = next(batch) trainX, trainY = X, Y testX, testY = X, Y #overfit for now session = tf.Session() session.run(tf.global_variables_initializer()) def dense_to_sparse(dense): idx = [] vals = [] shape = np.array(dense).shape lens = [] for x in np.ndenumerate(dense): idx.append(x[0])
import tflearn
import speech_data
import tensorflow as tf

# Input definition and preprocessing settings.
learning_rate = 0.0001
training_iters = 300000  # number of iterations
batch_size = 64

width = 20  # MFCC features
height = 80  # maximum utterance length
classes = 10  # number of digit classes

# Original note: after framing the speech, taking logarithms, inverting
# matrices and similar steps, the resulting MFCC represents the features of
# the utterance.
# The generator yields one batch of MFCC data per `next()` call.
word_batch = speech_data.mfcc_batch_generator(batch_size)
batch = word_batch
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y  # the same batch doubles as the test set

# Network definition: input -> LSTM -> softmax classifier -> regression op.
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(
    net,
    optimizer='adam',
    learning_rate=learning_rate,
    loss='categorical_crossentropy',
)

# Model used for training.
model = tflearn.DNN(net, tensorboard_verbose=0)
from __future__ import division, print_function, absolute_import

import tflearn  # high-level library built on top of TensorFlow
import speech_data  # helper module that provides the speech batches
import tensorflow as tf  # machine-learning framework

# Hyperparameters.
# Per the original note: a higher learning rate trains faster, a smaller one
# gives more accurate results.
learning_rate = 0.0001
training_iters = 300000  # steps
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits

# Per the original note, the generator downloads a set of wav files (each a
# recording of a spoken digit) and yields the labeled speech data in batches.
word_batch = speech_data.mfcc_batch_generator(batch_size)
batch = word_batch

# Take one batch and use it for training AND testing.  As the original note
# warns, the model can then recognize the speaker it was trained on but not
# other speakers.
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y  # overfit for now

# Network building.
# The input layer is the gateway into the network; its argument defines the
# shape of the input data.
net = tflearn.input_data([None, width, height])

# Next layer: 128 units.  Original note: too few units give bad predictions,
# too many lead to overtraining.  Dropout counters overfitting by randomly
# switching off units during training, which forces the data to find new
# paths through the network and yields a more general model.
net = tflearn.lstm(net, 128, dropout=0.8)
def score_model(X, y):
    """Return a ``'model accuracy: <value>'`` string for the global ``model``.

    The predicted class of each sample (argmax over ``model.predict(X)``) is
    compared with the argmax of the matching row of ``y``; the share of
    matches is rounded to two decimals.
    """
    predicted_classes = np.argmax(np.array(model.predict(X)), axis=1)
    expected_classes = np.argmax(np.array(y), axis=1)
    matches = predicted_classes == expected_classes
    hit_count = np.sum(matches)
    accuracy = round(float(hit_count) / matches.shape[0], 2)
    return 'model accuracy: {}'.format(accuracy)


LEARNING_RATE = 0.0001
BATCH_SIZE = 64
WIDTH = 20  # mfcc features
HEIGHT = 80  # (max) length of utterance
CLASSES = 10  # digits

# Pull one batch of 2400 samples and turn it into arrays.
data_set = speech_data.mfcc_batch_generator(2400)
X, Y = next(data_set)
X = np.array(X)
Y = np.array(Y)

# Train / validation / test split: 20% is held out for testing first, then
# 20% of the remainder is held out for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=0)

# Network building.
# NOTE(review): `input_data` and `lstm` are tflearn layer names; this only
# works if `tf` is bound to tflearn in the unseen import block - confirm.
net = tf.input_data([None, WIDTH, HEIGHT])
net = tf.lstm(net, 128, dropout=0.8)
# install dependencies # pip install tensorflow # pip install tflearn # pip install future # load libraries/packages import tflearn import speech_data # define learning rate and number of trainings = tradeoff between speed and accuracy of learning learning_rate = 0.0001 training_iterations = 300000 # use help of waive data for speech sounds batch = word_batch = speech_data.mfcc_batch_generator(64) # break up into training and test data X, Y = next(batch) trainX, trainY = X, Y testX, testY = X, Y # create multi-layer reccurent neural net since speech is a train of sounds # first layer -> use tflearn and it takes two inputs width of data (number of utterances) and height nnet = tflearn.input_data([None, 20, 80]) # second layer -> defining how many nets and dropout rate (prevent overfitting by dumping that which doesnt make cuttoff) nnet = tflearn.lstm(nnet, 128, dropout=.80) # third layer -> making all layers fully connecteed with each other and only recognize 10 digits and softmax to convert numerical data into probabilities net = tflearn.fully_connected(nnet, 10, activation='softmax') # fourth layer -> use regression to make single predition per utterance net = tflearn.regression(nnet,
test_step = 10
save_step = 100

# Logged runs from the original author:
#   0.0001 Step 300   Loss= 1.976625 Accuracy= 0.250 Time= 303s
#          Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s  takes time but works
learning_rate = 0.0001
training_iters = 300000  # steps
batch_size = 64

features = 20  # mfcc features
width = features
max_length = 80  # (max) length of utterance
height = max_length
classes = 10  # digits
dropout = 0.7
keep_prob = dropout

batch = speech_data.mfcc_batch_generator(batch_size, target=Target.digits)
X, Y = next(batch)
# print(Y)
print(np.array(Y).shape)

# Alternative layout that was tried:
# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = tf.placeholder(tf.float32, shape=(batch_size, features, max_length))

# inputs = tf.transpose(inputs, [0, 2, 1])  # inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
# Reorder to [max_time, batch_size, features] so the time axis can be split.
inputs = tf.transpose(x, [2, 0, 1])

# The RNN inner loop needs a list of inputs, one entry per time step
# (original note: n_steps * (batch_size, features)).
inputs = tf.split(axis=0, num_or_size_splits=max_length, value=inputs)

num_hidden = 100  # features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
# rnn=tf.nn.rnn(cell,inputs)
# rnn=tf.nn.dynamic_rnn(cell,inputs)
smodel_predicts_remove_others = [] for predicts in smodel_predicts: digits = tf.split(predicts, nclasses, axis=1) remove_others = [] for i in range(len(digits)): others = digits[:i] + digits[i+1:] remove_others.append(digits[i] - tf.add_n(others)) smodel_predicts_remove_others.append(remove_others) smodel_predicts_remove_others = [ tf.concat(predicts, 1) for predicts in smodel_predicts_remove_others ] smodel_predicts = smodel_predicts_remove_others model_loss = tf.losses.softmax_cross_entropy(model_output, model_logits) opt = tf.train.AdamOptimizer(learning_rate) model_train = opt.minimize(model_loss) batch = speech_data.mfcc_batch_generator(batch_size, generate_separator = True) #batch = speech_data.mfcc_sequence_batch_generator(batch_size, target=speech_data.Target.dense) X, Y, batch_no = next(batch) trainX, trainY = X, Y testX, testY = X, Y #overfit for now session = tf.Session() session.run(tf.global_variables_initializer()) epoch = 0 epochs = 20 while epoch < epochs: epoch += 1 print("epoch {0}".format(epoch)) num_batches = int(len(trainX) / batch_size) batch_no = 1 # set to get in the loop while batch_no > 0:
#!/usr/bin/env python #!/usr/bin/env python import tensorflow as tf import tflearn import speech_data learning_rate = 0.0001 training_iters = 300000 # steps batch_size = 64 width = 20 # mfcc features height = 80 # (max) length of utterance classes = 10 # digits batch = word_batch = speech_data.mfcc_batch_generator(batch_size) # Network building net = tflearn.input_data([None, width, height]) net = tflearn.lstm(net, 128*4, dropout=0.5) net = tflearn.fully_connected(net, classes, activation='softmax') net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy') model = tflearn.DNN(net, tensorboard_verbose=0) ## add this "fix" for tensorflow version errors for x in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): tf.add_to_collection(tf.GraphKeys.VARIABLES, x ) # Training while --training_iters > 0: trainX, trainY = next(batch)
# Console warnings seen on the original author's machine:
#   hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)
#   curses is not supported on this machine (please install/reinstall curses for an optimal experience)

# Learning rate.  Original note: the higher it is, the faster the network
# trains; the lower it is, the slower the training but the more accurate the
# result.
learning_rate = 0.0001
training_iters = 300000  # steps we want to train for
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits (number of digit classes being trained)

# Per the original note, this downloads the .wav files holding recordings of
# different spoken numbers.
word_batch = speech_data.mfcc_batch_generator(batch_size)
batch = word_batch

X, Y = next(batch)  # labeled speech data
trainX, trainY = X, Y
testX, testY = X, Y  # overfit for now

# Network building - a Recurrent Neural Network (RNN).
# A tensor is a multi-dimensional array of data.
# `width` is the number of features extracted from the utterances by the
# speech helper module; `height` is the max length of each utterance.
net = tflearn.input_data([None, width, height])

# 128 is the number of units.  Dropout helps prevent overfitting by turning
# off units during training, which allows for a more generalized model.
# Original note: LSTM is a network that remembers what it has learned and is
# used for state-of-the-art speech recognition.
net = tflearn.lstm(net, 128, dropout=0.8)
from speech_data import Source, Target

# LESS IS MORE! Logged runs per learning rate:
#   0.001   Step 1000 Loss= 2.292103 Accuracy= 0.100 Time= 163s  Test Accuracy: 0.1  too high
#   0.0001  Step 1420 Loss= 1.794861 Accuracy= 0.600 Time= 231
#   0.00001 Step 1700 Loss= 0.575172 Accuracy= 1.000 Time= 274s  Test Accuracy: 0.8
learning_rate = 0.00001
training_iters = 300000  # steps
batch_size = 64

height = 20  # mfcc features
width = 80  # (max) length of utterance
classes = 10  # digits

word_batch = speech_data.mfcc_batch_generator(
    batch_size,
    source=Source.DIGIT_WAVES,
    target=Target.digits,
)
batch = word_batch
X, Y = next(batch)
print("batch shape " + str(np.array(X).shape))

shape = [-1, height, width, 1]
# shape=[-1, width,height, 1]


# BASELINE toy net
def simple_dense(net):  # best with lr ~0.001
    # type: (layer.net) -> None
    """Add one 400-unit tanh dense layer to ``net`` (baseline toy network)."""
    # Variants that were tried and left disabled:
    # net.dense(hidden=200,depth=8,dropout=False) # BETTER!!
    # net.reshape(shape)  # Reshape input picture
    net.dense(400, activation=tf.nn.tanh)  # 0.99 YAY
    # net.denseNet(40, depth=4)
    # net.classifier() # auto classes from labels
def _format_elapsed(seconds):
    """Format a duration in seconds as ``HHhMMmSSs`` without wrapping at 24h."""
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return "%02dh%02dm%02ds" % (hours, minutes, secs)


def lstm_model_kfold(name, lstm1_n, lstm2_n, fc3_n):
    """Train a two-layer LSTM digit classifier with K-Fold validation.

    The function loads 2400 samples from ``speech_data``, keeps the last 10%
    as a testing set and runs a 5-fold validation on the rest, reloading the
    initial weights after every fold.  It then trains on the whole dataset,
    evaluates on the testing set, writes a report to ``<name>_results.txt``
    and saves the model as ``tflearn.lstm.model_<name>``.

    Args:
        name: Model name; used for the report file, the tensorboard run id
            and the saved model file.
        lstm1_n: Number of units of the first LSTM layer.
        lstm2_n: Number of units of the second LSTM layer.
        fc3_n: Number of units of the hidden fully connected layer.

    Relies on ``printline`` and ``fprintline`` from the enclosing module.
    """
    # The context manager guarantees the report is flushed and closed even
    # when loading data or training raises.
    with open(name + "_results.txt", "w") as file:
        file.write(
            "Long Short-Therm Memory Recurrent Neural Network for speech recognition.\n"
        )
        file.write("Training with K-Fold.\n")
        file.write("Network name: " + name + ".\n")
        file.write("Owner: Gabriel Furtado Lins Melo.\n\n")

        print("Importing libraries...")
        import time

        import numpy as np
        import tflearn
        import speech_data
        print("Imported libraries!")

        # Hyperparameters:
        learning_rate = 0.001
        lstm1_neurons = lstm1_n
        lstm2_neurons = lstm2_n
        fc3_neurons = fc3_n
        dropout = 0.8
        activation = 'softmax'
        optimizer = 'adam'
        loss_func = 'categorical_crossentropy'

        # Training features:
        kfold_k = 5
        training_iters = 100  # Multiply by 10 to know how many epochs

        # Dataset features:
        dataset = 2400
        split_p = 0.9
        batchsize = int(split_p * dataset)  # 2160
        val_sets = int(batchsize / kfold_k)  # 432
        classes = 10
        width = 20
        height = 80
        print("Hyperparameters were set!")

        # Loading the batch and defining dataset / testing set:
        print("Loading batch...")
        batch = speech_data.mfcc_batch_generator(dataset)
        wav_files, wav_labels = next(batch)
        print("Loading training and testing sets...")
        testX, testY = wav_files[batchsize:], wav_labels[batchsize:]
        datasetX, datasetY = wav_files[:batchsize], wav_labels[:batchsize]

        # Building the model:
        print("Building/Loading neural network structures...")
        net = tflearn.input_data([None, width, height])
        net = tflearn.lstm(net,
                           lstm1_neurons,
                           dropout=dropout,
                           return_seq=True)
        net = tflearn.lstm(net, lstm2_neurons)
        net = tflearn.fully_connected(net, fc3_neurons)
        net = tflearn.fully_connected(net, classes,
                                      activation=activation)  # Output layer
        net = tflearn.regression(net,
                                 optimizer=optimizer,
                                 learning_rate=learning_rate,
                                 loss=loss_func)
        model = tflearn.DNN(net,
                            tensorboard_verbose=3,
                            tensorboard_dir='Graphs')
        print("Built net!")

        # Save the initial weights so every fold restarts from them.
        model.save('tflearn.lstm.kfold')

        # Report: network, dataset and training description.
        fprintline(file)
        file.write("\nNetwork Layers:\n\n")
        file.write(" Input data (size: " + str(width) + " x " + str(height) +
                   ")\n")
        file.write(" Lstm layer 1 (neurons: " + str(lstm1_neurons) +
                   "), (dropout: " + str(dropout) + ")\n")
        file.write(" Lstm layer 2 (neurons: " + str(lstm2_neurons) +
                   "), (dropout: None)\n")
        file.write(" Fully connected 1 (neurons: " + str(fc3_neurons) + ")\n")
        file.write(" Fully connected 2 (output neurons: " + str(classes) +
                   "), (activation: " + str(activation) + ")\n")
        file.write(" Regression layer (optimizer: " + str(optimizer) +
                   "), (loss function: " + str(loss_func) +
                   "), (learning rate: " + str(learning_rate) + ")\n")
        file.write("\nDataset Features:\n\n")
        file.write(" Dataset: " + str(dataset) + "\n")
        file.write(" Batch size used in training: " + str(batchsize) + "\n")
        file.write(" Amount of test files: " + str(dataset - batchsize) +
                   "\n")
        file.write("\nTraining Features:\n\n")
        file.write(" K-Fold \"K\": " + str(kfold_k) + "\n")
        file.write(" Epochs: " + str(training_iters * 10) + "\n\n")
        fprintline(file)

        # K-Fold training:
        file.write("\nK-Fold results:\n\n")
        printline()
        print("\nInitiating " + str(kfold_k) + "-Fold training.\n")
        printline()
        kname = "K-Fold_" + name
        start_time = time.time()
        validation_accuracy = 0
        minibatch = batchsize - val_sets  # 1728

        for i in range(kfold_k):
            # Fold i is held out for validation, the rest is trained on.
            lo, hi = i * val_sets, (i + 1) * val_sets
            validationX, validationY = datasetX[lo:hi], datasetY[lo:hi]
            trainX = list(datasetX[:lo]) + list(datasetX[hi:])
            trainY = list(datasetY[:lo]) + list(datasetY[hi:])

            printline()
            lista = ['*VAL.*' if j == i else 'TRAIN' for j in range(kfold_k)]
            print("\nK-Fold \"K\":", (i + 1))
            print("Dataset:", lista)
            print()
            printline()

            # Each iteration runs 10 training epochs.
            for _ in range(int(training_iters)):
                model.fit(trainX,
                          trainY,
                          n_epoch=10,
                          validation_set=None,
                          show_metric=False,
                          batch_size=minibatch,
                          run_id=kname)

            # Validation accuracy of this fold: share of samples whose
            # most probable class matches the one-hot label.
            preds = model.predict(validationX)
            correct = 0
            for pred, label in zip(preds, validationY):
                if np.argmax(pred) == np.argmax(label):
                    correct += 1
            accuracy = correct / len(validationX)

            printline()
            print("Fold " + str(i + 1) + " accuracy: %0.1f%%" %
                  (accuracy * 100))
            printline()
            file.write("Fold " + str(i + 1) + " accuracy: %0.1f%%\n" %
                       (accuracy * 100))
            validation_accuracy += accuracy

            # Back to the initial weights for the next fold / final training.
            model.load('tflearn.lstm.kfold')

        # Mean validation accuracy:
        validation_accuracy /= kfold_k
        printline()
        print("\nValidation phase done!")
        print("Mean validation accuracy: %0.1f%%\n" %
              (validation_accuracy * 100))
        file.write("\nK-Fold validation phase mean accuracy: %0.1f%%\n\n" %
                   (validation_accuracy * 100))
        fprintline(file)
        printline()
        print()

        # Final training phase (whole dataset):
        printline()
        print("\nInitiating final training phase.\n")
        printline()
        file.write("\nFinal training phase results:\n\n")
        for _ in range(int(training_iters)):
            # Each iteration runs 10 training epochs.
            model.fit(datasetX,
                      datasetY,
                      n_epoch=10,
                      validation_set=None,
                      show_metric=False,
                      batch_size=batchsize,
                      run_id=kname)

        # Predictions on the testing set:
        printline()
        _Y = model.predict(testX)
        print("\nPredictions using testing set:\n")
        printline()
        hits = 0
        for i in range(len(testX)):
            if np.argmax(_Y[i]) == np.argmax(testY[i]):
                hits += 1
            # NOTE(review): assumes the two prints and the separator all
            # belong to the "every 10th sample" branch - confirm against
            # the original indentation.
            if (i + 1) % 10 == 0:
                # Per-class probabilities, made readable.
                prediction = [
                    str(round(prob * 100, 1)) + "%" for prob in _Y[i]
                ]
                # One-hot target, made readable.
                target = [
                    'Nope' if testY[i][j] == 0 else ' Yes '
                    for j in range(len(_Y[i]))
                ]
                print("Prediction " + str(i + 1) + ":", prediction)
                print("Target " + str(i + 1) + " :", target)
                printline()

        accuracy = hits / len(testX)
        print("Testing set size: %d" % len(testX))
        print("Hits (right predictions): %d" % hits)
        print("Testing accuracy: %0.1f%%" % (accuracy * 100))
        file.write("Testing set size: %d\n" % len(testX))
        file.write("Hits (right predictions): %d\n" % hits)
        file.write("Testing accuracy: %0.1f%%\n\n" % (accuracy * 100))
        fprintline(file)
        printline()

        # strftime/gmtime would wrap after 24 hours; format by hand instead.
        total_time = time.time() - start_time
        file.write("\nTotal training time:\n")
        file.write(_format_elapsed(total_time))

    # Saving:
    model.save('tflearn.lstm.model_' + name)
    print("Model saved with name: tflearn.lstm.model_" + name)
    print("Results saved as: " + name + "_results.txt")
else: model_fc_w = tf.get_variable("fc_w", shape=(height * 128, 10)) model_fc_b = tf.get_variable("fc_b", shape=(10)) model_logits = tf.matmul(rnn_output, model_fc_w) + model_fc_b model_predict = tf.nn.softmax(model_logits) model_l2_loss = tf.nn.l2_loss(model_fc_w) + tf.nn.l2_loss(model_fc_b) model_loss = tf.losses.softmax_cross_entropy(model_output, model_logits) + model_l2_loss #opt = tf.train.AdamOptimizer(learning_rate) opt = tf.train.GradientDescentOptimizer(learning_rate) model_train = opt.minimize(model_loss) # Training batch = speech_data.mfcc_batch_generator(batch_size, n_mfcc=width) X, Y, batch_no = next(batch) trainX, trainY = X, Y class Persistance: MODEL_NAME = 'tf_reverse' def __init__(self): self.saver = tf.train.Saver() self.checkpoint_path = "./saves/{0}.ckpt".format(self.MODEL_NAME) self.input_graph_path = "./saves/{0}.pbtxt".format(self.MODEL_NAME) self.pickle_file = "./saves/pickle_file" def load_graph(self, session):
# Logged run: Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s  takes time but works
training_iters = 300000  # steps
batch_size = 64

features = 20  # mfcc input features
width = features
max_input_length = 80  # (max) length of input utterance (mfcc slices)
height = max_input_length
num_characters = 32
classes = num_characters
max_word_length = 20  # max length of output (characters per word)
# classes=10 # digits
dropout = 0.7
keep_prob = dropout

# batch = speech_data.mfcc_batch_generator(batch_size, target=Target.word)
batch = speech_data.mfcc_batch_generator(
    batch_size,
    source=Source.WORD_WAVES,
    target=Target.hotword,
)
X, Y = next(batch)
print("lable shape", np.array(Y).shape)

# Alternative layout that was tried:
# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = tf.placeholder(tf.float32,
                   shape=(batch_size, features, max_input_length))
inputX = x

# inputs = tf.transpose(inputs, [0, 2, 1])  # inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
# Reorder to [max_time, batch_size, features] so the time axis can be split.
inputs = tf.transpose(x, [2, 0, 1])

# The RNN inner loop needs a list of inputs, one entry per time step
# (original note: n_steps * (batch_size, features)).
inputs = tf.split(axis=0, num_or_size_splits=max_input_length, value=inputs)
import tflearn
import speech_data
import tensorflow as tf
import numpy as np

data_path = r"D:\python_code\tensorflow_works\testx\pre"

learning_rate = 0.001
batch_size = 1

width = 20  # mfcc features
height = 29  # (max) length of utterance
classes = 10  # digits

# Original note (translated): pass in the batch size (64 in the note) and get
# back a generator `batch`; it only runs once per request and computes one
# value each time it is used.
word_batch = speech_data.mfcc_batch_generator(batch_size, height, data_path)
batch = word_batch
#print('batch =',batch)
#X, Y = next(batch)

# Network building
net = tflearn.input_data([None, width, height])
#net = tflearn.lstm(net,256, dropout=0.8)
net = tflearn.lstm(net, 2048)  # original note: "jie bi jie bi"
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(
    net,
    optimizer='adam',
    learning_rate=learning_rate,
    loss='categorical_crossentropy',
)

# Training
# Logged runs from the original author:
#   0.0001 Step 300   Loss= 1.976625 Accuracy= 0.250 Time= 303s
#          Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s  takes time but works
training_iters = 300000  # steps
batch_size = 64

features = 20  # mfcc input features
width = features
max_input_length = 80  # (max) length of input utterance (mfcc slices)
height = max_input_length
num_characters = 32
classes = num_characters
max_word_length = 20  # max length of output (characters per word)
# classes=10 # digits
dropout = 0.7
keep_prob = dropout

# batch = speech_data.mfcc_batch_generator(batch_size, target=Target.word)
batch = speech_data.mfcc_batch_generator(
    batch_size,
    source=Source.WORD_WAVES,
    target=Target.hotword,
)
X, Y = next(batch)
print("lable shape", np.array(Y).shape)

# Alternative layout that was tried:
# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = tf.placeholder(tf.float32,
                   shape=(batch_size, features, max_input_length))
inputX = x

# inputs = tf.transpose(inputs, [0, 2, 1])  # inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
# Reorder to [max_time, batch_size, features] so the time axis can be split.
inputs = tf.transpose(x, [2, 0, 1])

# The RNN inner loop needs a list of inputs, one entry per time step
# (original note: n_steps * (batch_size, features)).
inputs = tf.split(axis=0, num_or_size_splits=max_input_length, value=inputs)

num_hidden = 100  # features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
# cell = tf.nn.rnn_cell.EmbeddingWrapper(num_hidden, state_is_tuple=True)
# in many cases it may be more efficient to not use this wrapper,
test_step = 10
save_step = 100

# Logged runs from the original author:
#   0.0001 Step 300   Loss= 1.976625 Accuracy= 0.250 Time= 303s
#          Step 24261 Loss= 0.011786 Accuracy= 1.000 Time= 33762s  takes time but works
learning_rate = 0.0001
training_iters = 300000  # steps
batch_size = 64

width = features = 20  # mfcc features
height = max_length = 80  # (max) length of utterance
classes = 10  # digits
keep_prob = dropout = 0.7

batch = speech_data.mfcc_batch_generator(batch_size, target=Target.digits)
X, Y = next(batch)
# print(Y)
print(np.array(Y).shape)

# Alternative layout that was tried:
# inputs=tf.placeholder(tf.float32, shape=(batch_size,max_length,features))
x = inputs = tf.placeholder(tf.float32,
                            shape=(batch_size, features, max_length))

# inputs = tf.transpose(inputs, [0, 2, 1])  # inputs must be a `Tensor` of shape: `[batch_size, max_time, ...]`
# Reorder to [max_time, batch_size, features] so the time axis can be split.
inputs = tf.transpose(inputs, [2, 0, 1])

# The RNN inner loop needs a list of inputs, one entry per time step
# (original note: n_steps * (batch_size, features)).
# Keyword arguments are required here: the old positional order
# (split_dim, num_split, value) is read as (value, num_or_size_splits, axis)
# by TensorFlow >= 1.0 and would try to split the constant 0.
inputs = tf.split(axis=0, num_or_size_splits=max_length, value=inputs)

num_hidden = 100  # features
cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
def lstm_model(existing_model=False,
               model_name="",
               usage="train",
               lstm1_n=100,
               lstm2_n=100):
    """Train or test a two-layer LSTM classifier for spoken digits.

    Args:
        existing_model: When true, weights saved as
            ``tflearn.lstm.model_<model_name>`` are loaded before use and the
            model keeps that name; when false and ``usage`` is ``"train"``
            the name is asked for interactively.
        model_name: Name of the saved model to load / continue.
        usage: ``"train"`` trains on 90% of a 2400-sample dataset, validates
            on the remaining 10% and saves the model; ``"test"`` loads the
            saved model, predicts a fresh 200-sample batch and prints the
            predictions with the overall accuracy.
        lstm1_n: Number of units of the first LSTM layer.
        lstm2_n: Number of units of the second LSTM layer.

    Relies on ``printline`` from the enclosing module.
    """
    if usage == 'train' and not existing_model:
        name = input("Name of the model: ")
    else:
        name = model_name

    print("Importing libraries...")
    import numpy as np
    # TFlearn is a modular deep learning library built on top of Tensorflow
    # that offers a higher-level API to speed up experimentation.
    import tflearn
    # Per the original note, speech_data fetches the data from the web and
    # formats it.
    import speech_data
    print("Imported libraries!")

    # Hyperparameters: the values that determine the network structure (e.g.
    # number of hidden units) and how it is trained (e.g. learning rate).

    # The learning rate controls how much the weights are adjusted with
    # respect to the loss gradient: higher trains faster, lower tends to
    # predict more accurately.
    learning_rate = 0.001  # Original is 0.0001
    training_iters = 100  # (Original is 30000)

    # Spoken words are sequences of sound waves, hence a recurrent network.
    width = 20  # mfcc features
    height = 80  # (max) length of utterance
    classes = 10  # amount of targets
    split_p = 0.9  # share of the dataset used for training; rest is testing
    dataset = 2400  # size of dataset
    batchsize = int(split_p * dataset)  # batch used for training
    lstm1_neurons = lstm1_n
    lstm2_neurons = lstm2_n
    dropout = 0.8  # amount of dropout (disabling units during training)
    print("Hyperparameters were set!")

    print("Loading batch...")
    if usage == 'train':
        # Per the original note, mfcc_batch_generator downloads (if needed)
        # WAV recordings of spoken digits with their labels and loads them
        # in random batches.  Original batch_size: 64.
        batch = speech_data.mfcc_batch_generator(dataset)
        print("Loading training and testing sets...")
        wav_files, wav_labels = next(batch)
        # Training set: first 90% of the dataset.
        trainX, trainY = wav_files[:batchsize], wav_labels[:batchsize]
        # Validation set: last 10% of the dataset.
        testX, testY = wav_files[batchsize:], wav_labels[batchsize:]
        print("Training and testing sets were loaded!")
    elif usage == 'test':
        batch = speech_data.mfcc_batch_generator(200)
        testX, testY = next(batch)

    # Overfitting: the model learns detail and noise of the training data to
    # the point that it hurts performance on new data.

    # Building the model:
    print("Building/Loading neural network structures...")
    # Input layer; `None` leaves the number of samples per batch open.
    net = tflearn.input_data([None, width, height])
    # First LSTM layer.  Too few units lead to bad predictions, too many can
    # overfit.  Dropout randomly turns off units during training so the data
    # has to find new paths between layers, giving a more general model.
    net = tflearn.lstm(net, lstm1_neurons, dropout=dropout, return_seq=True)
    net = tflearn.lstm(net, lstm2_neurons)
    # Softmax converts the numerical outputs into probabilities.
    net = tflearn.fully_connected(net, classes, activation='softmax')
    # Adam minimizes the categorical cross-entropy loss.
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=learning_rate,
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='Graphs')

    # Load saved weights when continuing an existing model, and always in
    # test mode (predicting with freshly initialized weights is meaningless).
    if existing_model or usage == "test":
        model.load('tflearn.lstm.model_' + model_name)
    print("Built net!")

    # Training and saving the model:
    if usage == "train":
        print("\nStarting the training!")
        for _ in range(int(training_iters)):
            # Each iteration runs 10 training epochs.
            model.fit(trainX,
                      trainY,
                      n_epoch=10,
                      validation_set=(testX, testY),
                      show_metric=True,
                      batch_size=batchsize,
                      run_id=name)
        print("Network has been successfully trained!")
        model.save('tflearn.lstm.model_' + name)
        print("Model saved with name: tflearn.lstm.model_" + name)

    # Printing predictions:
    if usage == "test":
        _Y = model.predict(testX)
        print("\nPredictions:")
        printline()
        accuracy = 0
        for i in range(len(testX)):
            # Per-class probabilities, made readable.
            prediction = [str(round(prob * 100, 1)) + "%" for prob in _Y[i]]
            # One-hot target, made readable.
            target = [
                'Nope' if testY[i][j] == 0 else ' Yes '
                for j in range(len(_Y[i]))
            ]
            # A hit is when the most probable class matches the label.
            if np.argmax(_Y[i]) == np.argmax(testY[i]):
                accuracy += 1
            print("Prediction " + str(i + 1) + ":", prediction)
            print("Target " + str(i + 1) + " :", target)
            printline()
        accuracy /= len(testX)
        print("TEST ACCURACY: %.1f%%" % (accuracy * 100))
        printline()
#if tf.__version__ >= '0.12' and os.name == 'nt':
#    print("sorry, tflearn is not ported to tensorflow 0.12 on windows yet!(?)")
#    quit()  # why? works on Mac?

speakers = data.get_speakers()
number_classes = len(speakers)
#print("speakers",speakers)

# Store the speaker list, one name per line.  The context manager closes the
# file even if a write fails.
with open('spkrs_list.txt', 'w') as file:
    for line in speakers:
        file.write(line + '\n')

batch = data.mfcc_batch_generator(batch_size=2000,
                                  source=data.Source.DIGIT_WAVES,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

# Original note: two wave chunks; the time dimension of the wav file has
# been modified.
input_layer = tflearn.input_data(shape=[20, 640])
fc1 = tflearn.fully_connected(input_layer, 128, name='fc1')
bn1 = tflearn.batch_normalization(fc1, name='bn1')
dp1 = tflearn.dropout(bn1, 0.5, name='dp1')
# NOTE(review): the activation is fed `bn1`, so `dp1` is not used by any
# layer visible here - confirm whether `dp1` was intended.
ac1 = tflearn.activation(bn1, activation='softmax', name='ac1')
#net = tflearn.fully_connected(net, 400, activation='softmax')
from __future__ import division, print_function, absolute_import import tflearn import speech_data import tensorflow as tf learning_rate = 0.0001 training_iters = 30 # steps batch_size = 64 width = 20 # mfcc features height = 80 # (max) length of utterance classes = 10 # digits batch = word_batch = speech_data.mfcc_batch_generator(batch_size) X, Y = next(batch) trainX, trainY = X, Y testX, testY = X, Y #overfit for now # Network building net = tflearn.input_data([None, width, height]) net = tflearn.lstm(net, 128, dropout=0.8) net = tflearn.fully_connected(net, classes, activation='softmax') net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy') # Training ### add this "fix" for tensorflow version errors col = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) for x in col:
# """util tflearn""" import tflearn import speech_data import tensorflow as tf learning_rate = 0.0001 training_iters = 300000 #iterations number batch_size = 64 width = 20 #MFCC characteristic height = 80 #max lenght of voice classes = 10 #classes of spoken numbers batch = word_batch = speech_data.mfcc_batch_generator(batch_size) #generate each batch of MFCC X, Y = next(batch) trainX, trainY = X, Y testX, testY = X, Y #define LSTM model net = tflearn.input_data([None, width, height]) net = tflearn.lstm(net, 128, dropout=0.8) net = tflearn.fully_connected(net, classes, activation='softmax') net = tf.learn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy') #train model and save it model = tflearn.DNN(net, tensorboard_verbose=0) while 1: #training_iters model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),