def main(argv):
    # Read train data files.
    train_texts = utils.read_text_files(TRAIN_DIR)
    train_labels = utils.texts_encoder(train_texts,
                                       first_index=FIRST_INDEX,
                                       space_index=SPACE_INDEX,
                                       space_token=SPACE_TOKEN)
    train_inputs = utils.read_audio_files(TRAIN_DIR)
    train_inputs = utils.standardize_audios(train_inputs)
    train_sequence_lengths = utils.get_sequence_lengths(train_inputs)
    train_inputs = utils.make_sequences_same_length(train_inputs, train_sequence_lengths)

    # Read validation data files.
    validation_texts = utils.read_text_files(DEV_DIR)
    validation_labels = utils.texts_encoder(validation_texts,
                                            first_index=FIRST_INDEX,
                                            space_index=SPACE_INDEX,
                                            space_token=SPACE_TOKEN)
    validation_labels = utils.sparse_tuples_from_sequences(validation_labels)
    validation_inputs = utils.read_audio_files(DEV_DIR)
    validation_inputs = utils.standardize_audios(validation_inputs)
    validation_sequence_lengths = utils.get_sequence_lengths(validation_inputs)
    validation_inputs = utils.make_sequences_same_length(validation_inputs,
                                                         validation_sequence_lengths)

    # Read test data files. The audio is read from TEST_DIR so it matches test_texts.
    test_texts = utils.read_text_files(TEST_DIR)
    test_labels = utils.texts_encoder(test_texts,
                                      first_index=FIRST_INDEX,
                                      space_index=SPACE_INDEX,
                                      space_token=SPACE_TOKEN)
    test_labels = utils.sparse_tuples_from_sequences(test_labels)
    test_inputs = utils.read_audio_files(TEST_DIR)
    test_inputs = utils.standardize_audios(test_inputs)
    test_sequence_lengths = utils.get_sequence_lengths(test_inputs)
    test_inputs = utils.make_sequences_same_length(test_inputs, test_sequence_lengths)

    with tf.device('/cpu:0'):
        config = tf.ConfigProto()
        graph = tf.Graph()
        with graph.as_default():
            logging.debug("Starting new TensorFlow graph.")
            # Batch of audio features: [batch_size, max_time_steps, NUM_FEATURES].
            inputs_placeholder = tf.placeholder(tf.float32, [None, None, NUM_FEATURES])

            # SparseTensor placeholder required by ctc_loss op.
            labels_placeholder = tf.sparse_placeholder(tf.int32)

            # 1d array of size [batch_size].
            sequence_length_placeholder = tf.placeholder(tf.int32, [None])

            # Defining the cell.
            def lstm_cell():
                return tf.contrib.rnn.LSTMCell(NUM_HIDDEN, state_is_tuple=True)

            # Stacking rnn cells.
            stack = tf.contrib.rnn.MultiRNNCell(
                [lstm_cell() for _ in range(NUM_LAYERS)], state_is_tuple=True)

            # Creates a recurrent neural network.
            outputs, _ = tf.nn.dynamic_rnn(stack, inputs_placeholder,
                                           sequence_length_placeholder, dtype=tf.float32)

            shape = tf.shape(inputs_placeholder)
            batch_size, max_time_steps = shape[0], shape[1]

            # Reshaping to apply the same weights over the time steps.
            outputs = tf.reshape(outputs, [-1, NUM_HIDDEN])
            weights = tf.Variable(tf.truncated_normal([NUM_HIDDEN, NUM_CLASSES], stddev=0.1),
                                  name='weights')
            bias = tf.Variable(tf.constant(0., shape=[NUM_CLASSES]), name='bias')

            # Doing the affine projection.
            logits = tf.matmul(outputs, weights) + bias

            # Reshaping back to the original shape.
            logits = tf.reshape(logits, [batch_size, -1, NUM_CLASSES])

            # Time is major.
            logits = tf.transpose(logits, (1, 0, 2))

            with tf.name_scope('loss'):
                loss = tf.nn.ctc_loss(labels_placeholder, logits, sequence_length_placeholder)
                cost = tf.reduce_mean(loss)
                tf.summary.scalar("loss", cost)

            optimizer = tf.train.MomentumOptimizer(INITIAL_LEARNING_RATE, 0.9).minimize(cost)

            # CTC decoder.
            decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, sequence_length_placeholder)
            label_error_rate = tf.reduce_mean(
                tf.edit_distance(tf.cast(decoded[0], tf.int32), labels_placeholder))

            with tf.Session(config=config, graph=graph) as session:
                logging.debug("Starting TensorFlow session.")

                # Saver op to save and restore all the variables.
                saver = tf.train.Saver()

                # Merge all the summaries and write them out.
                merged_summary = tf.summary.merge_all()

                # Initializing summary writer for TensorBoard.
                summary_writer = tf.summary.FileWriter(SUMMARY_PATH, tf.get_default_graph())

                # Initialize the weights and biases.
                tf.global_variables_initializer().run()

                train_num = train_inputs.shape[0]

                # Check if there is any example.
                if train_num <= 0:
                    logging.error("There are no training examples.")
                    return

                num_batches_per_epoch = math.ceil(train_num / BATCH_SIZE)

                for current_epoch in range(NUM_EPOCHS):
                    train_cost = 0
                    train_label_error_rate = 0
                    start_time = time.time()

                    for step in range(num_batches_per_epoch):
                        # Format batches.
                        if int(train_num / ((step + 1) * BATCH_SIZE)) >= 1:
                            indexes = [i % train_num
                                       for i in range(step * BATCH_SIZE, (step + 1) * BATCH_SIZE)]
                        else:
                            indexes = [i % train_num
                                       for i in range(step * BATCH_SIZE, train_num)]

                        batch_train_inputs = train_inputs[indexes]
                        batch_train_sequence_lengths = train_sequence_lengths[indexes]
                        batch_train_targets = utils.sparse_tuples_from_sequences(train_labels[indexes])

                        feed = {inputs_placeholder: batch_train_inputs,
                                labels_placeholder: batch_train_targets,
                                sequence_length_placeholder: batch_train_sequence_lengths}

                        batch_cost, _, summary = session.run([cost, optimizer, merged_summary], feed)
                        train_cost += batch_cost * BATCH_SIZE
                        train_label_error_rate += session.run(label_error_rate, feed_dict=feed) * BATCH_SIZE

                        # Write logs at every iteration.
                        summary_writer.add_summary(summary, current_epoch * num_batches_per_epoch + step)

                    train_cost /= train_num
                    train_label_error_rate /= train_num

                    validation_feed = {inputs_placeholder: validation_inputs,
                                       labels_placeholder: validation_labels,
                                       sequence_length_placeholder: validation_sequence_lengths}

                    # cost and label_error_rate are already means over the fed examples.
                    validation_cost, validation_label_error_rate = session.run(
                        [cost, label_error_rate], feed_dict=validation_feed)

                    # Output intermediate step information.
                    logging.info("Epoch %d/%d (time: %.3f s)",
                                 current_epoch + 1, NUM_EPOCHS, time.time() - start_time)
                    logging.info("Train cost: %.3f, train label error rate: %.3f",
                                 train_cost, train_label_error_rate)
                    logging.info("Validation cost: %.3f, validation label error rate: %.3f",
                                 validation_cost, validation_label_error_rate)

                test_feed = {inputs_placeholder: test_inputs,
                             sequence_length_placeholder: test_sequence_lengths}

                # Decoding.
                decoded_outputs = session.run(decoded[0], feed_dict=test_feed)
                dense_decoded = tf.sparse_tensor_to_dense(decoded_outputs,
                                                          default_value=-1).eval(session=session)

                test_num = test_texts.shape[0]
                for i, sequence in enumerate(dense_decoded):
                    sequence = [s for s in sequence if s != -1]
                    decoded_text = utils.sequence_decoder(sequence)

                    logging.info("Sequence %d/%d", i + 1, test_num)
                    logging.info("Original:\n%s", test_texts[i])
                    logging.info("Decoded:\n%s", decoded_text)

                # Save model weights to disk.
                save_path = saver.save(session, MODEL_PATH)
                logging.info("Model saved in file: %s", save_path)
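# NOTE: the utils module is not included in this excerpt. For reference, a
# helper like utils.sparse_tuples_from_sequences, as used above to feed
# tf.sparse_placeholder, is commonly written along these lines. This is a
# sketch under that assumption, not the project's actual implementation.
import numpy as np

def sparse_tuples_from_sequences(sequences, dtype=np.int32):
    """Convert a list of integer label sequences into the (indices, values,
    dense_shape) tuple accepted by tf.sparse_placeholder."""
    indices = []
    values = []
    for n, sequence in enumerate(sequences):
        indices.extend(zip([n] * len(sequence), range(len(sequence))))
        values.extend(sequence)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    dense_shape = np.asarray([len(sequences), indices.max(0)[1] + 1], dtype=np.int64)
    return indices, values, dense_shape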
def test_standardize_audios(self):
    files = utils.read_audio_files(TEST_AUDIO_FILE_DIR)
    self.assertEqual(utils.standardize_audios(files).size, files.size)
def test_read_audio_files(self):
    self.assertTrue(utils.read_audio_files(TEST_AUDIO_FILE_DIR).size > 0)
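# NOTE: a possible companion test, sketched under the assumption (taken from
# main()) that make_sequences_same_length pads every example to the longest
# sequence length; TEST_AUDIO_FILE_DIR and the utils API are reused from the
# existing tests above.
def test_make_sequences_same_length(self):
    files = utils.read_audio_files(TEST_AUDIO_FILE_DIR)
    lengths = utils.get_sequence_lengths(files)
    padded = utils.make_sequences_same_length(files, lengths)
    self.assertEqual(padded.shape[1], max(lengths))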
import utils

DATA_DIR = "C:\\Users\\Sai Teja\\Desktop\\ELL888-RNN\\CTC"
TRAIN_DIR = DATA_DIR + "\\TRAIN\\DR"
TEST_DIR = DATA_DIR + "\\TEST\\DR"
DEV_DIR = DATA_DIR + "\\TRAIN\\DR"

SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved for the space token.

test_inputs = utils.read_audio_files(TEST_DIR)
# test_inputs = utils.standardize_audios(test_inputs)
test_sequence_lengths = utils.get_sequence_lengths(test_inputs)
test_inputs = utils.make_sequences_same_length(test_inputs, test_sequence_lengths)
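# NOTE: a portability suggestion, not part of the original module: the
# hard-coded Windows paths above can equivalently be built with os.path.join,
# so only DATA_DIR needs to change on another machine. The directory names
# are kept exactly as above.
import os

DATA_DIR = os.path.join("C:\\", "Users", "Sai Teja", "Desktop", "ELL888-RNN", "CTC")
TRAIN_DIR = os.path.join(DATA_DIR, "TRAIN", "DR")
TEST_DIR = os.path.join(DATA_DIR, "TEST", "DR")
DEV_DIR = os.path.join(DATA_DIR, "TRAIN", "DR")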
# found cached file
if os.path.exists(CACHE_PATH):
    try:
        with open(file=CACHE_PATH, mode='rb') as f:
            sample_rate, raw_data, feat_data = pickle.load(f)
            print("Loaded cached features at:", CACHE_PATH)
    except Exception as e:
        # somebody messed up the cached file, do everything again
        warnings.warn("Couldn't load cached file at path: %s, Error: %s"
                      % (CACHE_PATH, str(e)))
        os.remove(CACHE_PATH)

# check if the cached file exists
if not os.path.exists(CACHE_PATH):
    # ====== load audio ====== #
    with performance_evaluate(name="Reading Audio"):
        sample_rate, raw_data = read_audio_files()
    # ====== acoustic features ====== #
    with performance_evaluate(name="Extract Features"):
        feat_data = {}
        for name, dat in raw_data.items():
            pow_spec, mel_spec, mfcc = extract_acoustic_features(dat)
            feat_data[name] = (pow_spec, mel_spec, mfcc)
    # ====== save cached features ====== #
    with open(file=CACHE_PATH, mode='wb') as f:
        pickle.dump((sample_rate, raw_data, feat_data), f)
        print("Saved cached features at:", CACHE_PATH)

# ====== infer digit and speaker information from file name ====== #
all_name = sorted(raw_data.keys())
digits = sorted(set([i.split('_')[0] for i in all_name]))
speakers = sorted(set([i.split('_')[1] for i in all_name]))
indices = sorted(set([i.split('_')[2] for i in all_name]))
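# NOTE: minimal usage sketch (not from the original script), assuming the
# cache layout built above, where feat_data maps a file name like
# "<digit>_<speaker>_<index>" to a (pow_spec, mel_spec, mfcc) tuple.
example_name = all_name[0]
pow_spec, mel_spec, mfcc = feat_data[example_name]
print("file:", example_name,
      "digit:", example_name.split('_')[0],
      "speaker:", example_name.split('_')[1],
      "mfcc shape:", getattr(mfcc, 'shape', None))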
def main(argv):
    # Read test data files. The audio is read from TEST_DIR so it matches test_texts.
    test_texts = utils.read_text_files(TEST_DIR)
    test_labels = utils.texts_encoder(test_texts,
                                      first_index=FIRST_INDEX,
                                      space_index=SPACE_INDEX,
                                      space_token=SPACE_TOKEN)
    test_labels = utils.sparse_tuples_from_sequences(test_labels)
    test_inputs = utils.read_audio_files(TEST_DIR)
    test_inputs = utils.standardize_audios(test_inputs)
    test_sequence_lengths = utils.get_sequence_lengths(test_inputs)
    test_inputs = utils.make_sequences_same_length(test_inputs, test_sequence_lengths)

    with tf.device('/cpu:0'):
        config = tf.ConfigProto()
        graph = tf.Graph()
        with graph.as_default():
            logging.debug("Starting new TensorFlow graph.")
            inputs_placeholder = tf.placeholder(tf.float32, [None, None, NUM_FEATURES])

            # SparseTensor placeholder required by ctc_loss op.
            labels_placeholder = tf.sparse_placeholder(tf.int32)

            # 1d array of size [batch_size].
            sequence_length_placeholder = tf.placeholder(tf.int32, [None])

            # Defining the cell. Each layer gets its own cell instance so the
            # variables line up with the training graph when restoring.
            def lstm_cell():
                return tf.contrib.rnn.LSTMCell(NUM_HIDDEN, state_is_tuple=True)

            # Stacking rnn cells.
            stack = tf.contrib.rnn.MultiRNNCell(
                [lstm_cell() for _ in range(NUM_LAYERS)], state_is_tuple=True)

            # Creates a recurrent neural network.
            outputs, _ = tf.nn.dynamic_rnn(stack, inputs_placeholder,
                                           sequence_length_placeholder, dtype=tf.float32)

            shape = tf.shape(inputs_placeholder)
            batch_size, max_time_steps = shape[0], shape[1]

            # Reshaping to apply the same weights over the time steps.
            outputs = tf.reshape(outputs, [-1, NUM_HIDDEN])
            weights = tf.Variable(tf.truncated_normal([NUM_HIDDEN, NUM_CLASSES], stddev=0.1),
                                  name='weights')
            bias = tf.Variable(tf.constant(0., shape=[NUM_CLASSES]), name='bias')

            # Doing the affine projection.
            logits = tf.matmul(outputs, weights) + bias

            # Reshaping back to the original shape.
            logits = tf.reshape(logits, [batch_size, -1, NUM_CLASSES])

            # Time is major.
            logits = tf.transpose(logits, (1, 0, 2))

            # CTC decoder.
            decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, sequence_length_placeholder)

            with tf.Session(config=config, graph=graph) as session:
                logging.debug("Starting TensorFlow session.")

                # Initialize the weights and biases.
                tf.global_variables_initializer().run()

                # Saver op to save and restore all the variables.
                saver = tf.train.Saver()

                # Restore model weights from previously saved model.
                saver.restore(session, MODEL_PATH)

                test_feed = {inputs_placeholder: test_inputs,
                             sequence_length_placeholder: test_sequence_lengths}

                # Decoding.
                decoded_outputs = session.run(decoded[0], feed_dict=test_feed)
                dense_decoded = tf.sparse_tensor_to_dense(decoded_outputs,
                                                          default_value=-1).eval(session=session)

                test_num = test_texts.shape[0]
                for i, sequence in enumerate(dense_decoded):
                    sequence = [s for s in sequence if s != -1]
                    decoded_text = utils.sequence_decoder(sequence)

                    logging.info("Sequence %d/%d", i + 1, test_num)
                    logging.info("Original:\n%s", test_texts[i])
                    logging.info("Decoded:\n%s", decoded_text)
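# NOTE: utils.sequence_decoder is not shown in this excerpt. Given the
# encoding constants used above (SPACE_INDEX = 0, FIRST_INDEX = ord('a') - 1),
# a decoder consistent with texts_encoder would look roughly like this; a
# sketch, not the project's actual implementation.
def sequence_decoder(sequence, first_index=FIRST_INDEX, space_index=SPACE_INDEX):
    """Map integer class indices back to characters; the space class maps to ' '."""
    return ''.join(' ' if index == space_index else chr(index + first_index)
                   for index in sequence)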