def speech_to_text():
    """Run the model over every entry of ``TEXT_DIR`` and print the decoded text.

    A ``WaveNet`` is built with batch size 1. Its parameters are restored from
    the checkpoint state found in ``TRAIN_DIR``; when there is none, a fresh
    session is initialised instead. Each file is then loaded through
    ``speech_loader.load_one_file``, passed to ``model.predict`` and the first
    prediction is converted with ``speech_loader.index2str``.
    """
    n_mfcc = 60
    batch_size = 1

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model paramters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # printed step numbers reproducible between runs.
    file_list = [
        os.path.join(TEXT_DIR, file_name)
        for file_name in sorted(os.listdir(TEXT_DIR))
    ]

    for step, file_path in enumerate(file_list, start=1):
        mfcc_features = speech_loader.load_one_file(file_path)
        output = model.predict(mfcc_features)

        # transfer to word
        words = speech_loader.index2str(output[0])
        print("Input(%d): %s" % (step, file_path))
        print("Output(%d): %s" % (step, words))
def train():
    """Train a ``WaveNet`` model, resuming from ``TRAIN_DIR`` when a checkpoint exists.

    The loader and model are created with a batch size of 32 and 60 MFCC
    coefficients. After the parameters are restored (or a new session is
    initialised), batches are created and ``model.train_val`` is run for 100
    epochs with ``val_rate=0.15``, checkpointing into ``TRAIN_DIR``.
    """
    n_mfcc = 60
    n_epoch = 100
    batch_size = 32

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    vocab_size = speech_loader.vocab_size

    model = WaveNet(vocab_size, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    checkpoint_state = tf.train.get_checkpoint_state(TRAIN_DIR)
    if not checkpoint_state:
        # Nothing to resume from: start with freshly initialised variables.
        print("init a new model.")
        model.init_sess()
    else:
        print("restore model paramters from %s" % checkpoint_state.model_checkpoint_path)
        model.restore(checkpoint_state.model_checkpoint_path)

    speech_loader.create_batches()
    model.train_val(
        speech_loader.mfcc_tensor,
        speech_loader.label_tensor,
        ckpt_dir=TRAIN_DIR,
        n_epoch=n_epoch,
        val_rate=0.15,
    )
def train():
    """Train ``Model`` on the data under ``<cwd>/data`` and checkpoint every 5 epochs.

    Audio is read from ``data/wav/train`` and transcriptions from
    ``data/doc/trans/train.word.txt`` (both relative to the current working
    directory). Training runs for 100 epochs with a batch size of 32; the
    per-batch loss and wall-clock time are printed. Checkpoints are written to
    ``<cwd>/model/speech.module-<epoch>`` whenever ``epoch % 5 == 0``, i.e.
    for epochs 0, 5, ..., 95.
    """
    # setting parameters
    batch_size = 32
    n_epoch = 100
    n_mfcc = 60

    # load speech data
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans', 'train.word.txt')
    speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # load model
    model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    # Saver.save() does not create the parent directory of the checkpoint
    # prefix, so make sure it exists before the first save.
    model_dir = os.path.join(os.getcwd(), 'model')
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(model_dir, 'speech.module')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()

            for batch in range(speech_loader.n_batches):
                start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {model.input_data: batches_wav, model.targets: batches_label}
                train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
                end = time.time()
                print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."
                      % (epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end - start))

            # save models
            if epoch % 5 == 0:
                saver.save(sess, checkpoint_prefix, global_step=epoch)
def speech_to_text():
    """Transcribe test files ``D4_750.wav`` .. ``D4_754.wav`` and print the results.

    The loader is built from the training data so that its vocabulary
    (``wordmap``) and ``wav_max_len`` are available. The model weights are
    restored from the latest checkpoint in ``model``. Each file is converted
    to MFCC features, zero-padded to ``speech_loader.wav_max_len`` frames,
    decoded with CTC beam search and mapped back to words.
    """
    n_mfcc = 60

    # load data
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans', 'train.word.txt')
    speech_loader = SpeechLoader(wav_path, label_file=label_file, batch_size=1, n_mfcc=n_mfcc)

    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)
    saver = tf.train.Saver(tf.trainable_variables())

    # word dict (index -> word); loop-invariant, so build it once
    wmap = {value: key for key, value in speech_loader.wordmap.items()}

    # recognition ops: built once, outside the loop, so the graph does not
    # gain a new set of decoder nodes for every file
    time_major_logits = tf.transpose(model.logit, perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        time_major_logits, model.seq_len, top_paths=1, merge_repeated=True)
    # Decoded ids are shifted by one before the word lookup
    # (presumably wordmap indices start at 1 - TODO confirm).
    predict = tf.sparse_to_dense(
        decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('model'))

        for j in range(750, 755):
            # extract feature
            wav_file = os.path.join(
                os.getcwd(), 'data', 'wav', 'test', 'D4', 'D4_' + str(j) + '.wav')
            wav, sr = librosa.load(wav_file, mono=True)
            # Keyword arguments: newer librosa releases reject positional y/sr.
            mfcc = np.transpose(
                np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc), axis=0),
                [0, 2, 1])
            mfcc = mfcc.tolist()

            # fill 0 (no-op when the clip already has wav_max_len frames or more)
            missing_frames = speech_loader.wav_max_len - len(mfcc[0])
            mfcc[0].extend([0] * n_mfcc for _ in range(missing_frames))

            output = sess.run(predict, feed_dict={model.input_data: mfcc})

            # print result; indices without a vocabulary entry are skipped
            # (the former default of -1 raised TypeError on str + int)
            words = ''.join(wmap.get(index, '') for index in output[0])
            print("---------------------------")
            print("Input: " + wav_file)
            print("Output: " + words)
def speech_to_text(wav_files, labels_dict):
    """Transcribe the first ten entries of ``wav_files`` and print them with their labels.

    Args:
        wav_files: Sequence of audio file paths; only ``wav_files[:10]`` is
            processed.
        labels_dict: Mapping from a file's base name (the text before the
            first ``.``) to its reference transcription.

    Raises:
        KeyError: If a processed file has no entry in ``labels_dict``.
    """
    n_mfcc = 60

    # load data
    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_training=False)
    wav_max_len = 673  # number of frames every input is zero-padded to

    # load model
    model = WaveNet(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)
    saver = tf.train.Saver(tf.trainable_variables())

    test_wav = wav_files[:10]

    # word dict (index -> word)
    word_map = {value: key for key, value in speech_loader.wordmap.items()}
    print(word_map)

    # recognition ops: built once, outside the loop, so the graph does not
    # gain a new set of decoder nodes for every file
    time_major_logits = tf.transpose(model.logit, perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        time_major_logits, model.seq_len, top_paths=1, merge_repeated=True)
    # Decoded ids are shifted by one before the word lookup
    # (presumably wordmap indices start at 1 - TODO confirm).
    predict = tf.sparse_to_dense(
        decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('../model'))

        for wav_path in test_wav:
            wav, sr = librosa.load(wav_path, mono=True)
            # Keyword arguments: newer librosa releases reject positional y/sr.
            mfcc = np.transpose(
                np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc), axis=0),
                [0, 2, 1])
            mfcc = mfcc.tolist()

            # zero-pad (no-op when the clip already has wav_max_len frames or more)
            missing_frames = wav_max_len - len(mfcc[0])
            mfcc[0].extend([0] * n_mfcc for _ in range(missing_frames))

            output = sess.run(predict, feed_dict={model.input_data: mfcc})

            # result; indices without a vocabulary entry are skipped
            # (the former default of -1 raised TypeError on str + int)
            words = ''.join(word_map.get(index, '') for index in output[0])

            wav_name = os.path.basename(wav_path).split('.')[0]
            print('-------------------------------------------------------')
            print(f'Input: {wav_path}')
            print(f'Output: {words}')
            print(f'True result: {labels_dict[wav_name]}')
def train():
    """Train ``Model`` with TensorBoard loss summaries, resuming from ``./model``.

    Audio is read from ``data/wav/train`` and transcriptions from
    ``data/doc/trans/train.word.txt`` (relative to the current working
    directory). If ``./model`` contains a checkpoint, the variables are
    restored from it before training. The graph definition is written to
    ``./model/model.pbtxt`` and loss summaries to event files in the same
    directory. Checkpoints are saved as ``model/speech.module-<epoch>``
    whenever ``epoch % 5 == 0``.
    """
    # setting parameters
    batch_size = 2
    n_epoch = 100
    n_mfcc = 60

    # load speech data
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans', 'train.word.txt')
    speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # load model
    model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    tf.summary.scalar('loss', model.cost)
    merged = tf.summary.merge_all()

    # Saver.save() needs the checkpoint directory to exist already.
    os.makedirs('./model', exist_ok=True)
    checkpoint_prefix = os.path.join(os.getcwd(), 'model', 'speech.module')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        # Ask TensorFlow whether a checkpoint exists instead of counting
        # files: the directory also holds model.pbtxt and event files, and
        # latest_checkpoint() returns None when there is nothing to restore.
        checkpoint = tf.train.latest_checkpoint('./model')
        if checkpoint:
            print("loading model from checkpoint")
            saver.restore(sess, checkpoint)

        tf.train.write_graph(sess.graph_def, './model', 'model.pbtxt')
        summary_writer = tf.summary.FileWriter('./model', graph=sess.graph)
        try:
            for epoch in range(n_epoch):
                speech_loader.create_batches()  # random shuffle data
                speech_loader.reset_batch_pointer()

                for batch in range(speech_loader.n_batches):
                    start = time.time()
                    batches_wav, batches_label = speech_loader.next_batch()
                    feed = {
                        model.input_data: batches_wav,
                        model.targets: batches_label
                    }
                    result, train_loss, _ = sess.run(
                        [merged, model.cost, model.optimizer_op], feed_dict=feed)
                    end = time.time()
                    print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f." %
                          (epoch, n_epoch, batch, speech_loader.n_batches,
                           train_loss, end - start))
                    # NOTE(review): every batch of an epoch is logged at the
                    # same step value (the epoch index).
                    summary_writer.add_summary(result, epoch)

                # save models
                if epoch % 5 == 0:
                    saver.save(sess, checkpoint_prefix, global_step=epoch)
        finally:
            # Flush pending events and release the file handle even when
            # training is interrupted.
            summary_writer.close()
def train():
    """Train ``WaveNet`` on the THCHS-30 training split and checkpoint every 5 epochs.

    Data is loaded from the ``train`` sub-directory of a hard-coded dataset
    root. Training runs for 100 epochs with a batch size of 8; per-batch loss
    and duration are printed. Checkpoints are written to
    ``<parent of cwd>/model/speech.module-<epoch>`` whenever
    ``epoch % 5 == 0``.

    Returns:
        None.
    """
    batch_size = 8
    n_mfcc = 60
    n_epoch = 100

    source_file = '/home/ydf_micro/datasets/data_thchs30'
    speech_loader = SpeechLoader(os.path.join(source_file, 'train'), batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # load model
    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    saver = tf.train.Saver(tf.global_variables())

    # Saver.save() does not create the parent directory of the checkpoint
    # prefix, so make sure it exists before the first save.
    model_dir = os.path.join(os.path.dirname(os.getcwd()), 'model')
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(model_dir, 'speech.module')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # sess.graph.finalize()  # Graph is read-only after this statement

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()

            for batch in range(speech_loader.n_batches):
                batch_start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {
                    model.input_data: batches_wav,
                    model.targets: batches_label
                }
                train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
                batch_end = time.time()
                print(
                    f'epoch: {epoch+1}/{n_epoch}, batch: {batch+1}/{speech_loader.n_batches}, '
                    f'loss: {train_loss:.2f}, time: {(batch_end-batch_start):.2f}s'
                )

            # save models
            if epoch % 5 == 0:
                saver.save(sess, checkpoint_prefix, global_step=epoch)
def speech_to_text():
    """Transcribe ``data/test-cn.wav`` with a fixed checkpoint and print the text.

    The audio is converted to MFCC features and zero-padded to
    ``speech_loader.wav_max_len`` frames. Weights are restored from
    ``model/SpeechToTextCN/speech.module.v2-99``; the logits are decoded with
    CTC beam search and mapped back to words through the loader's
    ``wordmap``.
    """
    n_mfcc = 60

    # load data
    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_test=True)

    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # extract feature
        wav, sr = librosa.load("data/test-cn.wav", mono=True)
        # Keyword arguments: newer librosa releases reject positional y/sr.
        mfcc = np.transpose(
            np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc), axis=0),
            [0, 2, 1])
        mfcc = mfcc.tolist()

        # fill 0 (no-op when the clip already has wav_max_len frames or more)
        missing_frames = speech_loader.wav_max_len - len(mfcc[0])
        mfcc[0].extend([0] * n_mfcc for _ in range(missing_frames))

        # word dict (index -> word)
        wmap = {value: key for key, value in speech_loader.wordmap.items()}

        # recognition
        # Alternative: saver.restore(sess, tf.train.latest_checkpoint('model'))
        saver.restore(sess, "model/SpeechToTextCN/speech.module.v2-99")
        time_major_logits = tf.transpose(model.logit, perm=[1, 0, 2])
        decoded, _ = tf.nn.ctc_beam_search_decoder(
            time_major_logits, model.seq_len, top_paths=1, merge_repeated=True)
        # Decoded ids are shifted by one before the word lookup
        # (presumably wordmap indices start at 1 - TODO confirm).
        predict = tf.sparse_to_dense(
            decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1
        output = sess.run(predict, feed_dict={model.input_data: mfcc})

        # print result; indices without a vocabulary entry are skipped
        # (the former default of -1 raised TypeError on str + int)
        words = ''.join(wmap.get(index, '') for index in output[0])
        print("---------------------------")
        print("Predict: " + words)