# encode audio feature
logit = get_logit(x, voca_size=voca_size)

# CTC decoding (decoder expects time-major input, hence the batch<->time transpose)
decoded, _ = tf.nn.ctc_beam_search_decoder(
    logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

# convert the sparse decode result to a dense tensor
# (+1 presumably restores the label offset reserved for the CTC blank — TODO confirm
#  against the corpus vocabulary encoding)
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

#
# recognize audio file
#

# command line argument for the audio file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load audio file (resampled to 16 kHz mono)
# FIX: the declared --file flag was previously ignored and a missing positional
# argument crashed with IndexError. The positional argument still takes
# precedence (backward compatible); the flag is used as a fallback.
file = sys.argv[1] if len(sys.argv) > 1 else tf.sg_arg().file
wav, sr = librosa.load(file, mono=True, sr=16000)

# extract the MFCC feature and reshape to (batch=1, time, n_mfcc)
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0), [0, 2, 1])

# run network
with tf.Session() as sess:

    # init variables
    tf.sg_init(sess)
from data import SpeechCorpus, voca_size
from model import *
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
from shutil import copyfile

# asset locations
train_path = './asset/train/'
best_model = './best_model/'

# set log level to debug
tf.sg_verbosity(10)

# command line argument for set_name
# FIX: the help text claimed the default was 'valid' while the actual default
# is 'test'; the message now matches the code.
tf.sg_arg_def(set=('test', "'train', 'valid', or 'test'. The default is 'test'"))
tf.sg_arg_def(frac=(1.0, "test fraction ratio to whole data set. The default is 1.0(=whole set)"))

#
# hyper parameters
#

# batch size
batch_size = 16

#
# inputs
#

# corpus input tensor ( with QueueRunner )
import sugartensor as tf
from data import SpeechCorpus, voca_size
from model import *
import numpy as np
from tqdm import tqdm

__author__ = '*****@*****.**'


# verbose logging at debug level
tf.sg_verbosity(10)

# command-line arguments: which data split to evaluate, and what fraction of it
tf.sg_arg_def(set=('valid', "'train', 'valid', or 'test'. The default is 'valid'"))
tf.sg_arg_def(frac=(1.0, "test fraction ratio to whole data set. The default is 1.0(=whole set)"))

#
# hyper parameters
#

batch_size = 16  # mini-batch size

#
# inputs
#

# speech corpus input pipeline (queue-runner backed), fed by the chosen split
data = SpeechCorpus(batch_size=batch_size, set_name=tf.sg_arg().set)