def init_model():
    """Build the inference graph: mfcc input -> wavenet logits -> CTC decode.

    Publishes the input placeholder and the decoded-label tensor through the
    module-level globals ``x`` and ``y``.
    """
    global x, y

    # set log level to debug
    tf.sg_verbosity(10)

    batch_size = 1              # batch size
    vocab_size = data.voca_size  # vocabulary size

    # mfcc feature of audio
    x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

    # sequence length except zero-padding
    non_padding = tf.not_equal(x.sg_sum(axis=2), 0.)
    seq_len = non_padding.sg_int().sg_sum(axis=1)

    # encode audio feature
    logit = get_logit(x, voca_size=vocab_size)

    # ctc decoding (the decoder takes time-major input, hence the transpose)
    time_major = logit.sg_transpose(perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        time_major, seq_len, merge_repeated=False)

    # to dense tensor (+1 shifts every decoded label up by one;
    # presumably so 0 can serve as padding -- confirm against data module)
    best = decoded[0]
    y = tf.sparse_to_dense(best.indices, best.dense_shape, best.values) + 1
def run(self, datafile):
    """Recognize one wave file and return the decoded transcript string.

    Args:
        datafile: path to a speech wave file (resampled to 16 kHz mono).

    Returns:
        The decoded text, via ``sttwdata.index_as_string``.
    """
    # Build the CTC decode subgraph only once and cache it on the instance.
    # The original rebuilt it on every call, which keeps adding nodes to the
    # TensorFlow graph and grows memory without bound across invocations.
    if not hasattr(self, '_decode_output'):
        # ctc decoding (decoder takes time-major input, hence the transpose)
        decoded, _ = tf.nn.ctc_beam_search_decoder(
            self.logit.sg_transpose(perm=[1, 0, 2]), self.seq_len,
            merge_repeated=False)
        # to dense tensor (+1 shifts labels; presumably so 0 is padding --
        # confirm against the data module's index mapping)
        self._decode_output = tf.sparse_to_dense(
            decoded[0].indices, decoded[0].dense_shape,
            decoded[0].values) + 1

    # load wave file
    wav, _ = librosa.load(datafile, mono=True, sr=16000)

    # get mfcc feature, shaped (1, time, n_mfcc) to match the placeholder
    mfcc = np.transpose(
        np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0), [0, 2, 1])

    # run session
    label = self.session.run(self._decode_output, feed_dict={self.x: mfcc})

    # return string
    return sttwdata.index_as_string(label)
from model import *
import data

batch_size = 1  # batch size
voca_size = data.voca_size  # vocabulary size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))
# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)
# encode audio feature
logit = get_logit(x, voca_size)
# ctc decoding (decoder takes time-major input, hence the transpose)
decoded, _ = tf.nn.ctc_beam_search_decoder(
    logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)
# to dense tensor; explicitly named "output" so the node can be referenced
# when freezing/exporting the graph
y = tf.add(tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                              decoded[0].values), 1, name="output")

# Restore the trained weights and dump the graph definition.
# NOTE: the original opened two sessions and restored the same checkpoint
# twice; a single session is sufficient and halves the restore cost.
with tf.Session() as sess:
    tf.sg_init(sess)
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))

    # keep the default-graph handles for any later export steps
    graph = tf.get_default_graph()
    input_graph_def = graph.as_graph_def()

    # Output model's graph details for reference.
    tf.train.write_graph(sess.graph_def,
                         '/root/speech-to-text-wavenet/asset/train',
                         'graph.txt', as_text=True)
# menginput mfcc feature pada file audio x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20)) # panjang sequence kecuali zero-padding seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1) # encode audio feature logit = get_logit(x, voca_size=voca_size) # ctc decoding decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False) # to dense tensor y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1 # regcognize audio file # perintah untuk menginput path file audio tf.sg_arg_def(file=('', 'speech wave file to recognize.')) # load audio file file = sys.argv[1] wav, sr = librosa.load(file, mono=True, sr=16000) # mendapatkan mfcc feature mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0), [0, 2, 1]) # run network