Code Example #1
# assumed imports for this snippet: the project imports sugartensor as tf,
# which patches tensors with the sg_* helpers used below
import sugartensor as tf
import data                  # provides data.voca_size
from model import get_logit  # the WaveNet encoder


def init_model():
    global x, y
    # set log level to debug
    tf.sg_verbosity(10)
    #
    # hyper parameters
    #
    batch_size = 1  # batch size
    #
    # inputs
    #
    # vocabulary size
    voca_size = data.voca_size
    # mfcc feature of audio
    x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))
    # sequence length except zero-padding
    seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)
    # encode audio feature
    logit = get_logit(x, voca_size=voca_size)
    # ctc decoding
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)
    # to dense tensor (+1 shifts the decoded indices back to the
    # vocabulary indexing, where 0 is reserved for padding)
    y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                           decoded[0].values) + 1
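A note on the seq_len line above: it recovers each utterance's true length by summing the MFCC coefficients per frame and counting the frames whose sum is non-zero, i.e. everything that is not zero-padding. A minimal NumPy sketch of the same idea (the shapes and values are illustrative only):

import numpy as np

# one utterance, 5 frames, 3 coefficients; the last 2 frames are zero-padding
x = np.zeros((1, 5, 3), dtype=np.float32)
x[0, :3, :] = 1.0

frame_sums = x.sum(axis=2)                                 # shape (1, 5)
seq_len = (frame_sums != 0.0).astype(np.int32).sum(axis=1)
print(seq_len)                                             # -> [3]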
Code Example #2
File: recog.py  Project: goldsteink/STTWall
    def run(self, datafile):
        # assumes the constructor built self.x, self.logit and self.seq_len
        # (see Code Example #1) and opened self.session
        # ctc decoding
        decoded, _ = tf.nn.ctc_beam_search_decoder(
            self.logit.sg_transpose(perm=[1, 0, 2]),
            self.seq_len,
            merge_repeated=False)

        # to dense tensor
        y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                               decoded[0].values) + 1

        # load wave file
        wav, _ = librosa.load(datafile, mono=True, sr=16000)

        # get mfcc feature
        mfcc = np.transpose(
            np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0),
            [0, 2, 1])

        # run session
        label = self.session.run(y, feed_dict={self.x: mfcc})

        # return string
        return sttwdata.index_as_string(label)
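The transpose/expand_dims step above reshapes librosa's (n_mfcc, n_frames) output into the (batch, time, features) layout the x placeholder expects; librosa's default of 20 MFCC coefficients is what matches the placeholder's last dimension. A standalone shape check (the example audio file is illustrative, not from the project):

import numpy as np
import librosa

wav, _ = librosa.load(librosa.example('trumpet'), mono=True, sr=16000)
feat = librosa.feature.mfcc(y=wav, sr=16000)       # shape (20, n_frames)
mfcc = np.transpose(np.expand_dims(feat, axis=0), [0, 2, 1])
print(feat.shape, '->', mfcc.shape)                # (20, T) -> (1, T, 20)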
Code Example #3
File: export_wave_pb.py  Project: akshayjh/speech
from model import *
import data


batch_size = 1     # batch size
voca_size = data.voca_size
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))
# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)
# encode audio feature
logit = get_logit(x, voca_size)
# ctc decoding
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)
# to dense tensor
y = tf.add(tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values), 1, name="output")

with tf.Session() as sess:
    tf.sg_init(sess)
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))

graph = tf.get_default_graph()
input_graph_def = graph.as_graph_def()

with tf.Session() as sess:
    tf.sg_init(sess)
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))
    # Output model's graph details for reference.
    tf.train.write_graph(sess.graph_def, '/root/speech-to-text-wavenet/asset/train', 'graph.txt', as_text=True)
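Since the file name suggests exporting a binary .pb, a natural follow-up is to freeze the restored weights into the graph. A minimal sketch using TF1's graph_util; the output node name 'output' comes from the tf.add call above, while the frozen file name is an assumption:

from tensorflow.python.framework import graph_util

with tf.Session() as sess:
    tf.sg_init(sess)
    tf.train.Saver().restore(sess, tf.train.latest_checkpoint('asset/train'))
    # bake the checkpoint variables into the graph as constants
    frozen = graph_util.convert_variables_to_constants(
        sess, sess.graph_def, ['output'])
    # write the self-contained binary GraphDef (file name assumed)
    tf.train.write_graph(frozen, 'asset/train', 'wavenet_frozen.pb', as_text=False)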
Code Example #4
File: recognize.py  Project: Teguh04/Speech-to-Text
# input the mfcc features of the audio file
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length excluding zero-padding
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

# encode audio feature
logit = get_logit(x, voca_size=voca_size)

# ctc decoding
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]),
                                           seq_len,
                                           merge_repeated=False)

# to dense tensor
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                       decoded[0].values) + 1

# recognize audio file

# define a command-line flag for the audio file path
tf.sg_arg_def(file=('', 'speech wave file to recognize.'))

# load audio file (the path is read directly from argv rather than via tf.sg_arg())
file = sys.argv[1]
wav, sr = librosa.load(file, mono=True, sr=16000)

# get the mfcc features
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0),
                    [0, 2, 1])

# run network
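The snippet ends before the session-run step. Based on Code Examples #2 and #3, a plausible completion looks like the following; the checkpoint path and the print_index helper are assumptions carried over from those examples and the project's data module:

with tf.Session() as sess:
    tf.sg_init(sess)
    # restore the trained parameters (checkpoint path assumed, as in Code Example #3)
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))
    # decode the MFCC features into label indices and print them
    label = sess.run(y, feed_dict={x: mfcc})
    data.print_index(label)  # assumed helper for turning indices into text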