# Feature / model configuration.  These are plain module-level names, so no
# `global` declaration is needed at this scope.

# Coefficients per frame (MFCC); delta features may still need to be added.
n_input = 40

# Number of context frames taken on each side of the current frame.
n_context = 0

# Frames per utterance: every input is a 1 s wav file, 1000 ms / 10 ms = 100.
feature_len = 100

# Width of one input vector: the current frame plus `n_context` frames on
# either side, each `n_input` wide.
feature_dim = n_input * (2 * n_context + 1)

# Character set, loaded from a path relative to the working directory.
alphabet = Alphabet('./alphabet.txt')
print('alphabet.size() ', alphabet.size())
print(alphabet._label_to_str)

# Output classes: every character of the target language plus one extra
# class for the CTC blank label.
n_character = alphabet.size() + 1

# NOTE(review): presumably the maximum transcript length in labels - confirm.
max_labellen = 6

# NOTE(review): presumably the hidden layer width - confirm against the model.
n_hidden = 128

# CSV files for the train / dev / test splits (as named by the file names).
trfile = 'data/speechcmd_train.csv'
cvfile = 'data/speechcmd_dev.csv'
testfile = 'data/speechcmd_test.csv'
# Example #2
# 0
def main(_):
    """Evaluate a trained checkpoint on the test set and print a WER report.

    The work is done in two passes over the batches of the test set:

    1. Acoustic pass: every batch is fed to the inference graph in chunks of
       ``N_STEPS`` time steps and the resulting logits are collected.
    2. Decoding pass: the collected logits are decoded with
       ``decode_with_lm`` and scored against the transcripts (CTC loss and
       edit distance).

    Afterwards the overall WER, mean loss and mean edit distance are printed,
    followed by up to ``FLAGS.report_count`` samples whose WER is above zero.
    When ``FLAGS.test_output_file`` is set, all samples are also written to
    that file as JSON.

    Side effect: rebinds the module-level ``alphabet``.

    Args:
        _: Ignored.

    Raises:
        SystemExit: With status 1 when ``FLAGS.test_files`` is empty or when
            ``FLAGS.checkpoint_dir`` holds no valid checkpoint state.
    """
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        raise SystemExit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(FLAGS.test_files.split(','),
                           FLAGS.test_batch_size,
                           alphabet=alphabet,
                           numcep=N_FEATURES,
                           numcontext=N_CONTEXT,
                           hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
                               by="features_len", ascending=False)

    def create_windows(features):
        """Return a read-only view of overlapping context windows."""
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        return np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        # Placeholders for the decoding / scoring part of the graph.
        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size,
                         alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph,
                                    seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph,
                                      FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                    sparse_labels)

        # Create a saver using variables from the above newly created graph;
        # variables named 'previous_state_*' are excluded from the restore.
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            raise SystemExit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        # ---- Pass 1: run the acoustic model, one logits array per batch ----
        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            # Frames still to be processed per utterance.  This name is only
            # ever rebound below, never modified in place, so the array
            # returned by `.values` (which may share memory with `test_data`)
            # stays intact for the decoding pass.
            batch_features_len = batch['features_len'].values

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len, N_STEPS)

                # pad with zeros if the chunk does not have enough steps; the
                # graph was built with n_steps=N_STEPS, so that is the target
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < N_STEPS:
                    chunk_features = np.pad(
                        chunk_features,
                        ((0, 0), (0, N_STEPS - steps_in_chunk), (0, 0),
                         (0, 0)),
                        mode='constant',
                        constant_values=0)

                output = session.run(outputs['outputs'],
                                     feed_dict={
                                         inputs['input']:
                                         chunk_features,
                                         inputs['input_lengths']:
                                         chunk_features_len,
                                     })
                logits = np.concatenate((logits, output))

                # we have processed N_STEPS, so subtract them from the
                # remaining steps and clip at zero (new array, no aliasing)
                batch_features_len = np.maximum(batch_features_len - N_STEPS,
                                                0)

            logitses.append(logits)

        # ---- Pass 2: decode the logits and score them ----
        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for logits, batch in bar(
                zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(
                sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(
                sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

    wer, samples = calculate_report(ground_truths, predictions, distances,
                                    losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Filter out all items with WER=0 and take only the first report_count items
    report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                      FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, mean edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        # Context manager so the report file is flushed and closed.
        with open(FLAGS.test_output_file, 'w') as output_file:
            json.dump(samples, output_file, default=float)
# Example #3
# 0
def run_inference():
    """Load a frozen graph, run inference and display the most likely characters.

    Command-line arguments (parsed with argparse):
        --input-file: wav file to analyse (required).
        --alphabet-file: alphabet.txt used to map labels to characters
            (required).
        --model-file: frozen TensorFlow graph (required).
        --predicted-character-count: how many of the most likely characters
            to show per output step (default 5, capped at
            ``alphabet.size() - 1``).

    The audio features are fed to the graph in chunks of ``n_steps`` steps,
    carrying ``previous_state_c`` / ``previous_state_h`` from one chunk to the
    next.  For every output step the top characters and their softmax
    probabilities are printed; the characters are additionally written, one
    line per rank, to ``<input file>_acoustic.txt``.

    NOTE(review): ``n_cell_dim`` and ``n_steps`` are not defined in this
    function; they are expected to exist in the enclosing module.
    """
    parser = argparse.ArgumentParser(
        description='Run Deepspeech inference to obtain char probabilities')
    # The three paths are used unconditionally below, so let argparse report
    # a missing one instead of failing later on a ``None`` path.
    parser.add_argument('--input-file',
                        type=str,
                        required=True,
                        help='Path to the wav file',
                        action="store",
                        dest="input_file_path")
    parser.add_argument('--alphabet-file',
                        type=str,
                        required=True,
                        help='Path to the alphabet.txt file',
                        action="store",
                        dest="alphabet_file_path")
    parser.add_argument('--model-file',
                        type=str,
                        required=True,
                        help='Path to the tf model file',
                        action="store",
                        dest="model_file_path")
    parser.add_argument(
        '--predicted-character-count',
        type=int,
        help='Number of most likely characters to be displayed',
        action="store",
        dest="predicted_character_count",
        default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1
    top_count = args.predicted_character_count

    # Load frozen graph from file and parse it
    with tf.io.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:

        # Every imported node is reachable under the "prefix/" name scope.
        tf.import_graph_def(graph_def, name="prefix")

        with tf.compat.v1.Session(graph=graph) as session:

            features, features_len = audiofile_to_features(
                args.input_file_path)
            # Initial recurrent state: all zeros for a batch of one.
            previous_state_c = np.zeros([1, n_cell_dim])
            previous_state_h = np.zeros([1, n_cell_dim])

            # Add batch dimension
            features = tf.expand_dims(features, 0)
            features_len = tf.expand_dims(features_len, 0)

            # Evaluate
            features = create_overlapping_windows(features).eval(
                session=session)
            features_len = features_len.eval(session=session)

            # we are interested only into logits, not CTC decoding
            inputs = {
                'input':
                graph.get_tensor_by_name('prefix/input_node:0'),
                'previous_state_c':
                graph.get_tensor_by_name('prefix/previous_state_c:0'),
                'previous_state_h':
                graph.get_tensor_by_name('prefix/previous_state_h:0'),
                'input_lengths':
                graph.get_tensor_by_name('prefix/input_lengths:0')
            }
            outputs = {
                'outputs': graph.get_tensor_by_name('prefix/raw_logits:0'),
                'new_state_c':
                graph.get_tensor_by_name('prefix/new_state_c:0'),
                'new_state_h':
                graph.get_tensor_by_name('prefix/new_state_h:0'),
            }

            logits = np.empty([0, 1, alphabet.size() + 1])

            # the frozen model only accepts input split to 16 step chunks,
            # if the inference was run from checkpoint instead (as in single
            # inference in deepspeech script), this loop wouldn't be needed
            for i in range(0, features_len[0], n_steps):
                chunk = features[:, i:i + n_steps, :, :]
                # Real number of steps, measured before any zero padding.
                chunk_length = chunk.shape[1]
                # pad with zeros if not enough steps
                if chunk_length < n_steps:
                    chunk = np.pad(chunk, ((0, 0), (0, n_steps - chunk_length),
                                           (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                # need to update the states with each loop iteration
                logits_step, previous_state_c, previous_state_h = session.run(
                    [
                        outputs['outputs'], outputs['new_state_c'],
                        outputs['new_state_h']
                    ],
                    feed_dict={
                        inputs['input']: chunk,
                        inputs['input_lengths']: [chunk_length],
                        inputs['previous_state_c']: previous_state_c,
                        inputs['previous_state_h']: previous_state_h,
                    })

                logits = np.concatenate((logits, logits_step))

            # Drop only the batch axis (size 1); an unqualified squeeze would
            # also collapse the step axis if there were a single step.
            logits = np.squeeze(logits, axis=1)

            # row_output[k] collects, step by step, the k-th most likely
            # character.
            row_output = [[] for _ in range(top_count)]

            # now sort logits and turn them into characters + probabilities
            for step_logits in logits:
                softmax_output = softmax(step_logits)
                indexes_sorted = softmax_output.argsort(
                )[top_count * -1:][::-1]
                shown_chars = []
                shown_probabilities = []
                for row, char_index in zip(row_output, indexes_sorted):
                    if char_index < alphabet.size():
                        text = alphabet._string_from_label(char_index)
                    else:
                        # Index past the alphabet: presumably the CTC blank.
                        text = '-'
                    row.append(text)
                    shown_chars.append(text + ' ')
                    shown_probabilities.append(
                        ' (' + str(softmax_output[char_index]) + ')')
                print(''.join(shown_chars), " ",
                      ''.join(shown_probabilities))

            with open(args.input_file_path + "_acoustic.txt", "w") as out:
                for row in row_output:
                    out.write(', '.join(row) + "\n")
                    print(row)
def run_inference():
    """Load a frozen graph, run inference and display the most likely characters.

    Command-line arguments (parsed with argparse):
        --input-file: wav file to analyse (required).
        --alphabet-file: alphabet.txt used to map labels to characters
            (required).
        --model-file: frozen TensorFlow graph (required).
        --predicted-character-count: how many of the most likely characters
            to show per output step (default 5, capped at
            ``alphabet.size() - 1``).

    The input vector of the wav file is cut into overlapping context windows
    and fed to the graph in chunks of ``n_steps`` windows.  For every output
    step one line is printed with the top characters followed by their
    softmax probabilities.
    """
    parser = argparse.ArgumentParser(description='Run Deepspeech inference to obtain char probabilities')
    # The three paths are used unconditionally below, so let argparse report
    # a missing one instead of failing later on a ``None`` path.
    parser.add_argument('--input-file', type=str, required=True,
                        help='Path to the wav file', action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str, required=True,
                        help='Path to the alphabet.txt file', action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str, required=True,
                        help='Path to the tf model file', action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed', action="store",
                        dest="predicted_character_count", default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1
    top_count = args.predicted_character_count

    # Load frozen graph from file and parse it
    with tf.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:

        # Every imported node is reachable under the "prefix/" name scope.
        tf.import_graph_def(graph_def, name="prefix")

        # currently hardcoded values used during inference
        n_input = 26
        n_context = 9
        n_steps = 16

        with tf.Session(graph=graph) as session:
            session.run('prefix/initialize_state')

            features = util.audio.audiofile_to_input_vector(args.input_file_path, n_input, n_context)
            num_strides = len(features) - (n_context * 2)
            window_size = 2 * n_context + 1

            # Read-only view of overlapping windows: consecutive windows
            # start one row apart, each spanning `window_size` rows.
            features = np.lib.stride_tricks.as_strided(
                features,
                (num_strides, window_size, n_input),
                (features.strides[0], features.strides[0], features.strides[1]),
                writeable=False)

            # we are interested only into logits, not CTC decoding
            inputs = {'input': graph.get_tensor_by_name('prefix/input_node:0'),
                      'input_lengths': graph.get_tensor_by_name('prefix/input_lengths:0')}
            outputs = {'outputs': graph.get_tensor_by_name('prefix/logits:0')}

            logits = np.empty([0, 1, alphabet.size() + 1])

            for start in range(0, len(features), n_steps):
                chunk = features[start:start + n_steps]
                # Real number of windows in this chunk; must be taken before
                # padding, otherwise the model is always told `n_steps`.
                chunk_length = len(chunk)

                # pad with zeros if not enough steps (len(features) % n_steps != 0)
                if chunk_length < n_steps:
                    chunk = np.pad(chunk,
                                   (
                                       (0, n_steps - chunk_length),
                                       (0, 0),
                                       (0, 0)
                                   ),
                                   mode='constant',
                                   constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: [chunk],
                    inputs['input_lengths']: [chunk_length],
                })
                logits = np.concatenate((logits, output))

            for step_logits in logits:
                # Index 0 selects the single batch entry.
                softmax_output = softmax(step_logits[0])
                indexes_sorted = softmax_output.argsort()[top_count * -1:][::-1]
                shown_chars = []
                shown_probabilities = []
                for rank in range(top_count):
                    char_index = indexes_sorted[rank]
                    if char_index < alphabet.size():
                        shown_chars.append(alphabet.string_from_label(char_index) + ' ')
                    else:
                        # Index past the alphabet: presumably the CTC blank.
                        shown_chars.append('- ')
                    shown_probabilities.append(' (' + str(softmax_output[char_index]) + ')')
                print(''.join(shown_chars), " ", ''.join(shown_probabilities))