def speech_to_text():
    n_mfcc = 60
    batch_size = 1

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model paramters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    file_names = os.listdir(TEXT_DIR)
    file_list = [os.path.join(TEXT_DIR, file_name) for file_name in file_names]

    for step, file_path in enumerate(file_list, start=1):
        mfcc_features = speech_loader.load_one_file(file_path)

        output = model.predict(mfcc_features)
        # convert predicted indices back to words
        words = speech_loader.index2str(output[0])
        print("Input(%d): %s" % (step, file_path))
        print("Output(%d): %s" % (step, words))
def train():
    batch_size = 32
    n_epoch = 100
    n_mfcc = 60

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    model.build_graph()
    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model paramters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    speech_loader.create_batches()
    model.train_val(speech_loader.mfcc_tensor,
                    speech_loader.label_tensor,
                    ckpt_dir=TRAIN_DIR,
                    n_epoch=n_epoch,
                    val_rate=0.15)
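train_val's implementation is not shown; with val_rate=0.15 it presumably holds out 15% of the utterances for validation. A minimal sketch of such a split, assuming the tensors are NumPy arrays (the helper name is hypothetical):

# Hypothetical split matching val_rate=0.15: the last 15% is held out.
def split_train_val(mfcc_tensor, label_tensor, val_rate=0.15):
    n_train = int(len(mfcc_tensor) * (1 - val_rate))
    return (mfcc_tensor[:n_train], label_tensor[:n_train],
            mfcc_tensor[n_train:], label_tensor[n_train:])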
def train():
	# setting parameters
	batch_size = 32
	n_epoch = 100
	n_mfcc = 60

	# load speech data
	wav_path = os.path.join(os.getcwd(),'data','wav','train')
	label_file = os.path.join(os.getcwd(),'data','doc','trans','train.word.txt')
	speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
	n_out = speech_loader.vocab_size

	# load model
	model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

	with tf.Session() as sess:
		sess.run(tf.global_variables_initializer())
		
		saver = tf.train.Saver(tf.global_variables())

		for epoch in range(n_epoch):
			speech_loader.create_batches() # random shuffle data
			speech_loader.reset_batch_pointer()
			for batch in range(speech_loader.n_batches):
				start = time.time()
				batches_wav, batches_label = speech_loader.next_batch()
				feed = {model.input_data: batches_wav, model.targets: batches_label}
				train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
				end = time.time()
				print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end-start))

			# save models
			if epoch % 5 == 0:
				saver.save(sess, os.path.join(os.getcwd(), 'model','speech.module'), global_step=epoch)
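The create_batches / reset_batch_pointer / next_batch calls above follow a common batch-pointer pattern. A minimal illustrative sketch under assumed attribute names (the real SpeechLoader is not shown):

# Illustrative sketch of the batch-pointer pattern; the real SpeechLoader
# is not shown, so these attribute names are assumptions.
class BatchedLoader:
    def reset_batch_pointer(self):
        self.pointer = 0

    def next_batch(self):
        # slice one batch of padded MFCCs and labels, then advance
        start = self.pointer * self.batch_size
        self.pointer += 1
        return (self.mfcc_data[start:start + self.batch_size],
                self.label_data[start:start + self.batch_size])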
Example #4
def speech_to_text():
    n_mfcc = 60

    # load data
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans',
                              'train.word.txt')
    speech_loader = SpeechLoader(wav_path,
                                 label_file=label_file,
                                 batch_size=1,
                                 n_mfcc=n_mfcc)
    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    saver = tf.train.Saver(tf.trainable_variables())

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('model'))

        # word dict (label index -> word), built once
        wmap = {value: key for key, value in speech_loader.wordmap.items()}

        # recognition ops, built once outside the loop so the graph
        # does not grow with every iteration
        decoded = tf.transpose(model.logit, perm=[1, 0, 2])
        decoded, probs = tf.nn.ctc_beam_search_decoder(decoded,
                                                       model.seq_len,
                                                       top_paths=1,
                                                       merge_repeated=True)
        predict = tf.sparse_to_dense(decoded[0].indices,
                                     decoded[0].dense_shape,
                                     decoded[0].values) + 1

        for j in range(750, 755):
            # extract feature
            wav_file = os.path.join(os.getcwd(), 'data', 'wav', 'test', 'D4',
                                    'D4_' + str(j) + '.wav')
            wav, sr = librosa.load(wav_file, mono=True)
            mfcc = np.transpose(
                np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr,
                                                    n_mfcc=n_mfcc),
                               axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()

            # pad with zero frames up to the fixed input length
            while len(mfcc[0]) < speech_loader.wav_max_len:
                mfcc[0].append([0] * n_mfcc)

            output, probs_val = sess.run([predict, probs],
                                         feed_dict={model.input_data: mfcc})

            # print result
            words = ''
            for i in range(len(output[0])):
                words += wmap.get(output[0][i], '')

            print("---------------------------")
            print("Input: " + wav_file)
            print("Output: " + words)
Example #5
def speech_to_text(wav_files, labels_dict):
    n_mfcc = 60

    # load data
    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_training=False)

    wav_max_len = 673

    # load model
    model = WaveNet(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    saver = tf.train.Saver(tf.trainable_variables())

    test_wav = wav_files[:10]

    # word dict
    word_map = {value: key for key, value in speech_loader.wordmap.items()}
    print(word_map)

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('../model'))

        # recognition ops, built once outside the loop so the graph
        # does not grow with every iteration
        decoded = tf.transpose(model.logit, perm=[1, 0, 2])
        decoded, probs = tf.nn.ctc_beam_search_decoder(decoded, model.seq_len,
                                                       top_paths=1, merge_repeated=True)
        predict = tf.sparse_to_dense(decoded[0].indices,
                                     decoded[0].dense_shape,
                                     decoded[0].values) + 1

        for wav_path in test_wav:
            wav, sr = librosa.load(wav_path, mono=True)
            mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc),
                                               axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()

            # pad with zero frames up to the fixed input length
            while len(mfcc[0]) < wav_max_len:
                mfcc[0].append([0] * n_mfcc)

            output, probs_val = sess.run([predict, probs], feed_dict={model.input_data: mfcc})

            # result
            words = ''
            for i in range(len(output[0])):
                words += word_map.get(output[0][i], '')

            wav_name = os.path.basename(wav_path).split('.')[0]

            print('-------------------------------------------------------')
            print(f'Input: {wav_path}')
            print(f'Output: {words}')
            print(f'True result: {labels_dict[wav_name]}')
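labels_dict maps an utterance id (the wav file's base name) to its reference transcript. It could be built from a transcript file in the train.word.txt style used elsewhere in these examples, assuming one "<id> <transcript>" pair per line; this helper is a sketch, not repository code:

def load_labels_dict(label_file):
    # assumes each line is "<utterance_id> <transcript...>"
    labels = {}
    with open(label_file, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ', 1)
            if len(parts) == 2:
                labels[parts[0]] = parts[1]
    return labels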
Example #6
def train():
    # setting parameters
    batch_size = 2
    n_epoch = 100
    n_mfcc = 60

    # load speech data
    wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
    label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans',
                              'train.word.txt')
    speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # load model
    model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    tf.summary.scalar('loss', model.cost)
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables())
        checkpoint = tf.train.latest_checkpoint('./model')
        if checkpoint:
            print("loading model from checkpoint")
            saver.restore(sess, checkpoint)

        tf.train.write_graph(sess.graph_def, './model', 'model.pbtxt')
        summary_writer = tf.summary.FileWriter('./model', graph=sess.graph)

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()
            for batch in range(speech_loader.n_batches):
                start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {
                    model.input_data: batches_wav,
                    model.targets: batches_label
                }
                result, train_loss, _ = sess.run(
                    [merged, model.cost, model.optimizer_op], feed_dict=feed)
                end = time.time()
                print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f." %
                      (epoch, n_epoch, batch, speech_loader.n_batches,
                       train_loss, end - start))
                # advance the summary step per batch so points do not overwrite
                summary_writer.add_summary(
                    result, epoch * speech_loader.n_batches + batch)

            # save models
            if epoch % 5 == 0:
                saver.save(sess,
                           os.path.join(os.getcwd(), 'model', 'speech.module'),
                           global_step=epoch)
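The summaries and graph written to ./model above can then be inspected by pointing TensorBoard at that directory (run `tensorboard --logdir=./model` and open the reported URL); the scalar registered with tf.summary.scalar appears under the 'loss' tag.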
Example #7
def train():
    '''Train the WaveNet model on the THCHS-30 dataset.'''

    batch_size = 8
    n_mfcc = 60
    n_epoch = 100

    source_file = '/home/ydf_micro/datasets/data_thchs30'
    speech_loader = SpeechLoader(os.path.join(source_file, 'train'),
                                 batch_size, n_mfcc)

    n_out = speech_loader.vocab_size

    # load model

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # sess.graph.finalize() # Graph is read-only after this statement

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()
            for batch in range(speech_loader.n_batches):
                batch_start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {
                    model.input_data: batches_wav,
                    model.targets: batches_label
                }
                train_loss, _ = sess.run([model.cost, model.optimizer_op],
                                         feed_dict=feed)
                batch_end = time.time()
                print(
                    f'epoch: {epoch+1}/{n_epoch}, batch: {batch+1}/{speech_loader.n_batches}, '
                    f'loss: {train_loss:.2f}, time: {(batch_end-batch_start):.2f}s'
                )

            # save models
            if epoch % 5 == 0:
                saver.save(sess,
                           os.path.join(os.path.dirname(os.getcwd()), 'model',
                                        'speech.module'),
                           global_step=epoch)
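Note that tf.train.Saver keeps only its five most recent checkpoints by default, so with a save every 5 epochs the earliest checkpoints of a 100-epoch run are deleted. Raising max_to_keep (a real Saver argument) preserves more of them, a minor variation on the code above:

# keep more than the default 5 checkpoints
saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)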
Example #8
def speech_to_text():
    n_mfcc = 60

    # load data
    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_test=True)

    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # extract feature
        wav, sr = librosa.load("data/test-cn.wav", mono=True)
        mfcc = np.transpose(
            np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc),
                           axis=0), [0, 2, 1])
        mfcc = mfcc.tolist()

        # fill 0
        while len(mfcc[0]) < speech_loader.wav_max_len:
            mfcc[0].append([0] * n_mfcc)

        # word dict
        wmap = {value: key for key, value in speech_loader.wordmap.items()}

        # recognition
        #saver.restore(sess, tf.train.latest_checkpoint('model'))
        saver.restore(sess, "model/SpeechToTextCN/speech.module.v2-99")
        decoded = tf.transpose(model.logit, perm=[1, 0, 2])
        decoded, probs = tf.nn.ctc_beam_search_decoder(decoded,
                                                       model.seq_len,
                                                       top_paths=1,
                                                       merge_repeated=True)
        predict = tf.sparse_to_dense(
            decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1
        output, probs = sess.run([predict, probs],
                                 feed_dict={model.input_data: mfcc})

        # print result
        words = ''
        for i in range(len(output[0])):
            words += wmap.get(output[0][i], '')

        print("---------------------------")
        print("Predict: " + words)