global n_input
n_input = 40  # MFCC features; may need to add delta features
# The number of frames in the context
global n_context
n_context = 0
global feature_len
feature_len = 100  # all inputs are 1 s wav files: 1000 ms / 10 ms = 100 frames
global feature_dim
feature_dim = n_input * (n_context * 2 + 1)
global alphabet
alphabet = Alphabet('./alphabet.txt')
print('alphabet.size() ', alphabet.size())
print(alphabet._label_to_str)
# The number of characters in the target language plus one
global n_character
n_character = alphabet.size() + 1  # +1 for the CTC blank label
global max_labellen
max_labellen = 6
global n_hidden
n_hidden = 128

trfile = 'data/speechcmd_train.csv'
cvfile = 'data/speechcmd_dev.csv'
testfile = 'data/speechcmd_test.csv'
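# A minimal sketch (not part of the original source) of the assumed front end
# behind the numbers above: 40 MFCCs per 10 ms frame over a 1 s clip yields
# roughly feature_len = 100 frames, and with n_context = 0 each window is a
# single frame, so feature_dim = n_input = 40. Assumes python_speech_features
# as the feature extractor; the helper name is illustrative.
import scipy.io.wavfile as wav
from python_speech_features import mfcc

def wav_to_mfcc(wav_path):
    fs, audio = wav.read(wav_path)  # expects ~1 s of 16 kHz mono audio
    # 10 ms hop, 40 cepstral coefficients (nfilt must be >= numcep)
    return mfcc(audio, samplerate=fs, winstep=0.01,
                numcep=n_input, nfilt=n_input)  # shape ~(100, 40)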
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Sort examples by length; improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
            by="features_len", ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size, alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph, seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph,
                                      FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                    sparse_labels)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid '
                      'checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            batch_features_len = batch['features_len'].values
            full_step_len = np.full_like(batch_features_len, N_STEPS)

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len,
                                                full_step_len)

                # Pad with zeros if the chunk does not have enough steps
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < FLAGS.n_steps:
                    chunk_features = np.pad(
                        chunk_features,
                        ((0, 0), (0, FLAGS.n_steps - steps_in_chunk),
                         (0, 0), (0, 0)),
                        mode='constant',
                        constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: chunk_features,
                    inputs['input_lengths']: chunk_features_len,
                })
                logits = np.concatenate((logits, output))

                # We have processed N_STEPS, so subtract from remaining steps
                batch_features_len -= N_STEPS
                # Clip to zero
                batch_features_len = np.maximum(
                    batch_features_len, np.zeros_like(batch_features_len))

            logitses.append(logits)

        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)
        for logits, batch in bar(
                zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(
                sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(
                sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

    wer, samples = calculate_report(ground_truths, predictions, distances,
                                    losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Filter out all items with WER=0 and take only the first report_count items
    report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                      FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, mean edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples, open(FLAGS.test_output_file, 'w'),
                  default=lambda x: float(x))
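# main() relies on two helpers that are not shown in this section. A plausible
# sketch of both, assuming split_data drops any final partial batch and
# pad_to_dense zero-pads a jagged array of per-example arrays into a dense
# batch (the exact semantics are assumptions, not the original definitions):
def split_data(dataset, batch_size):
    # Drop the remainder so every yielded batch has exactly batch_size rows
    remainder = len(dataset) % batch_size
    if remainder != 0:
        dataset = dataset[:-remainder]
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]


def pad_to_dense(jagged):
    # Zero-pad variable-length rows to the length of the longest one
    maxlen = max(len(row) for row in jagged)
    subshape = jagged[0].shape[1:]
    padded = np.zeros((len(jagged), maxlen) + subshape,
                      dtype=jagged[0].dtype)
    for i, row in enumerate(jagged):
        padded[i, :len(row)] = row
    return padded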
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""
    parser = argparse.ArgumentParser(
        description='Run DeepSpeech inference to obtain char probabilities')
    parser.add_argument('--input-file', type=str,
                        help='Path to the wav file',
                        action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str,
                        help='Path to the alphabet.txt file',
                        action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str,
                        help='Path to the tf model file',
                        action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed',
                        action="store", dest="predicted_character_count",
                        default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))
    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.io.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
        # print(graph_def.node)

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")

        # Currently hardcoded values (n_cell_dim, n_steps) are used during
        # inference
        with tf.compat.v1.Session(graph=graph) as session:
            features, features_len = audiofile_to_features(
                args.input_file_path)
            previous_state_c = np.zeros([1, n_cell_dim])
            previous_state_h = np.zeros([1, n_cell_dim])

            # Add batch dimension
            features = tf.expand_dims(features, 0)
            features_len = tf.expand_dims(features_len, 0)

            # Evaluate
            features = create_overlapping_windows(features).eval(
                session=session)
            features_len = features_len.eval(session=session)

            # We are interested only in the logits, not in the CTC decoding
            inputs = {
                'input': graph.get_tensor_by_name('prefix/input_node:0'),
                'previous_state_c':
                    graph.get_tensor_by_name('prefix/previous_state_c:0'),
                'previous_state_h':
                    graph.get_tensor_by_name('prefix/previous_state_h:0'),
                'input_lengths':
                    graph.get_tensor_by_name('prefix/input_lengths:0')
            }
            outputs = {
                'outputs': graph.get_tensor_by_name('prefix/raw_logits:0'),
                'new_state_c': graph.get_tensor_by_name('prefix/new_state_c:0'),
                'new_state_h': graph.get_tensor_by_name('prefix/new_state_h:0'),
            }

            logits = np.empty([0, 1, alphabet.size() + 1])

            # The frozen model only accepts input split into 16-step chunks;
            # if the inference were run from a checkpoint instead (as in the
            # single-inference path of the deepspeech script), this loop
            # wouldn't be needed.
            for i in range(0, features_len[0], n_steps):
                chunk = features[:, i:i + n_steps, :, :]
                chunk_length = chunk.shape[1]
                # Pad with zeros if not enough steps
                # (len(features) % FLAGS.n_steps != 0)
                if chunk_length < n_steps:
                    chunk = np.pad(chunk,
                                   ((0, 0), (0, n_steps - chunk_length),
                                    (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                # Need to update the states with each loop iteration
                logits_step, previous_state_c, previous_state_h = session.run(
                    [outputs['outputs'],
                     outputs['new_state_c'],
                     outputs['new_state_h']],
                    feed_dict={
                        inputs['input']: chunk,
                        inputs['input_lengths']: [chunk_length],
                        inputs['previous_state_c']: previous_state_c,
                        inputs['previous_state_h']: previous_state_h,
                    })
                logits = np.concatenate((logits, logits_step))

            logits = np.squeeze(logits)

            row_output = []
            for j in range(args.predicted_character_count):
                row_output.append([])

            # Now sort the logits and turn them into characters + probabilities
            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i])
                indexes_sorted = softmax_output.argsort()[
                    args.predicted_character_count * -1:][::-1]
                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet._string_from_label(char_index)
                        most_likely_chars += text + ' '
                        row_output[j].append(text)
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        row_output[j].append('-')
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'

                print(most_likely_chars, " ", chars_probability)

            with open(args.input_file_path + "_acoustic.txt", "w") as out:
                for j in range(len(row_output)):
                    out.write(', '.join(row_output[j]) + "\n")
                    print(row_output[j])
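# Both run_inference variants (above and below) call a softmax() helper that
# is not defined in this section. A minimal, numerically stable sketch:
import numpy as np

def softmax(logits):
    # Shift by the max before exponentiating to avoid overflow
    shifted = logits - np.max(logits)
    exp = np.exp(shifted)
    return exp / np.sum(exp)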
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""
    parser = argparse.ArgumentParser(
        description='Run DeepSpeech inference to obtain char probabilities')
    parser.add_argument('--input-file', type=str,
                        help='Path to the wav file',
                        action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str,
                        help='Path to the alphabet.txt file',
                        action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str,
                        help='Path to the tf model file',
                        action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed',
                        action="store", dest="predicted_character_count",
                        default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))
    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")

        # Currently hardcoded values used during inference
        n_input = 26
        n_context = 9
        n_steps = 16

        with tf.Session(graph=graph) as session:
            session.run('prefix/initialize_state')

            features = util.audio.audiofile_to_input_vector(
                args.input_file_path, n_input, n_context)

            # Create a view into the array with overlapping strides of size
            # n_context (past) + 1 (present) + n_context (future)
            num_strides = len(features) - (n_context * 2)
            window_size = 2 * n_context + 1
            features = np.lib.stride_tricks.as_strided(
                features,
                (num_strides, window_size, n_input),
                (features.strides[0], features.strides[0],
                 features.strides[1]),
                writeable=False)

            # We are interested only in the logits, not in the CTC decoding
            inputs = {
                'input': graph.get_tensor_by_name('prefix/input_node:0'),
                'input_lengths':
                    graph.get_tensor_by_name('prefix/input_lengths:0')
            }
            outputs = {
                'outputs': graph.get_tensor_by_name('prefix/logits:0')
            }

            logits = np.empty([0, 1, alphabet.size() + 1])
            for i in range(0, len(features), n_steps):
                chunk = features[i:i + n_steps]
                # Pad with zeros if not enough steps
                # (len(features) % FLAGS.n_steps != 0)
                if len(chunk) < n_steps:
                    chunk = np.pad(chunk,
                                   ((0, n_steps - len(chunk)),
                                    (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: [chunk],
                    inputs['input_lengths']: [len(chunk)],
                })
                logits = np.concatenate((logits, output))

            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i][0])
                indexes_sorted = softmax_output.argsort()[
                    args.predicted_character_count * -1:][::-1]
                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet.string_from_label(char_index)
                        most_likely_chars += text + ' '
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'

                print(most_likely_chars, " ", chars_probability)
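# Example invocation, assuming this script is saved as run_inference.py and a
# frozen model plus alphabet file exist at the paths below (all paths are
# illustrative, not from the original source):
#
#   python run_inference.py \
#       --input-file sample.wav \
#       --alphabet-file alphabet.txt \
#       --model-file output_graph.pb \
#       --predicted-character-count 5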