Code example #1
def initialize_globals():

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    global n_input
    n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    global n_context
    n_context = 9  # TODO: Determine the optimal value using a validation data set

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error(
                'Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    if not os.path.exists(os.path.abspath(FLAGS.decoder_library_path)):
        print('ERROR: The decoder library file does not exist. Make sure you have ' \
              'downloaded or built the native client binaries and pass the ' \
              'appropriate path to the binaries in the --decoder_library_path parameter.')
        exit(1)

    global custom_op_module
    custom_op_module = tf.load_op_library(FLAGS.decoder_library_path)
Code example #2
File: evaluate.py  Project: PiotrowskiD/DeepSpeech
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    # sort examples by length; this improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=Config.n_input,
        numcontext=Config.n_context,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    from DeepSpeech import create_inference_graph
    graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

    samples = evaluate(test_data, graph, alphabet)

    if FLAGS.test_output_file:
        # Save decoded tuples as JSON, converting NumPy floats to Python floats
        json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
Code example #3
def get_alphabet(language):
    if language in ALPHABETS:
        return ALPHABETS[language]
    alphabet_path = getattr(CLI_ARGS, language + '_alphabet')
    alphabet = Alphabet(alphabet_path) if alphabet_path else None
    ALPHABETS[language] = alphabet
    return alphabet
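
A minimal usage sketch for the helper above (not part of the original listing): it assumes CLI_ARGS exposes per-language options such as en_alphabet, and shows that repeated calls return the cached Alphabet instance.

# Hypothetical illustration of get_alphabet()'s caching (ALPHABETS is the
# module-level dict used above; CLI_ARGS.en_alphabet is an assumed CLI option).
en_alphabet = get_alphabet('en')           # built from CLI_ARGS.en_alphabet, or None
assert get_alphabet('en') is en_alphabet   # the second call returns the cached instance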
Code example #4
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    print("{} like a character based model.".format(
        "Looks" if vocab_looks_char_based else "Doesn't look"))

    if force_utf8 != None:  # pylint: disable=singleton-comparison
        use_utf8 = force_utf8.value
        print("Forcing UTF-8 mode = {}".format(use_utf8))
    else:
        use_utf8 = vocab_looks_char_based

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        print("Error loading alphabet: {}".format(err))
        sys.exit(1)

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
Code example #5
def initialize_globals():
    c = AttrDict()

    # ps and worker hosts required for p2p cluster setup
    FLAGS.ps_hosts = list(filter(len, FLAGS.ps_hosts.split(',')))
    FLAGS.worker_hosts = list(filter(len, FLAGS.worker_hosts.split(',')))

    # Create a cluster from the parameter server and worker hosts.
    c.cluster = tf.train.ClusterSpec({
        'ps': FLAGS.ps_hosts,
        'worker': FLAGS.worker_hosts
    })

    # The absolute number of computing nodes - regardless of cluster or single mode
    num_workers = max(1, len(FLAGS.worker_hosts))

    # If replica numbers are negative, we multiply their absolute values with the number of workers
    if FLAGS.replicas < 0:
        FLAGS.replicas = num_workers * -FLAGS.replicas
    if FLAGS.replicas_to_agg < 0:
        FLAGS.replicas_to_agg = num_workers * -FLAGS.replicas_to_agg

    # The device path base for this node
    c.worker_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task_index)

    # This node's CPU device
    c.cpu_device = c.worker_device + '/cpu:0'

    # This node's available GPU devices
    c.available_devices = [
        c.worker_device + gpu for gpu in get_available_gpus()
    ]

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.benchmark_steps > 0:
        FLAGS.checkpoint_dir = None

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden

    c.n_hidden_1 = c.n_hidden

    c.n_hidden_2 = c.n_hidden

    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Queues that are used to gracefully stop parameter servers.
    # Each queue stands for one ps. A finishing worker sends a token to each queue before joining/quitting.
    # Each ps will dequeue as many tokens as there are workers before joining/quitting.
    # This ensures parameter servers won't quit while still required by at least one worker,
    # and also won't wait forever (as with a standard `server.join()`).
    done_queues = []
    for i, ps in enumerate(FLAGS.ps_hosts):
        # Queues are hosted by their respective owners
        with tf.device('/job:ps/task:%d' % i):
            done_queues.append(
                tf.FIFOQueue(1, tf.int32, shared_name=('queue%i' % i)))

    # Placeholder to pass in the worker's index as token
    c.token_placeholder = tf.placeholder(tf.int32)

    # Enqueue operations for each parameter server
    c.done_enqueues = [
        queue.enqueue(c.token_placeholder) for queue in done_queues
    ]

    # Dequeue operations for each parameter server
    c.done_dequeues = [queue.dequeue() for queue in done_queues]

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error(
                'Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    # Determine if we are the chief worker
    c.is_chief = len(
        FLAGS.worker_hosts) == 0 or (FLAGS.task_index == 0
                                     and FLAGS.job_name == 'worker')

    ConfigSingleton._config = c
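
The comment block above describes the shutdown protocol between workers and parameter servers; the helper below is a hypothetical sketch (not part of the original snippet) of how a finishing worker could use these ops, assuming an open session and the populated config object.

def notify_parameter_servers(session, config, task_index):
    # Hypothetical helper: a finishing worker pushes its index into every
    # parameter-server queue so each ps can count down the workers and exit gracefully.
    for enqueue_op in config.done_enqueues:
        session.run(enqueue_op, feed_dict={config.token_placeholder: task_index})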
Code example #6
# Number of MFCC features
global n_input
n_input = 40  # MFCC; we may also need to add delta features

# The number of frames in the context
global n_context
n_context = 0

global feature_len
feature_len = 100  # all input is a 1 s wav file: 1000 ms / 10 ms = 100 frames

global feature_dim
feature_dim = n_input * (n_context * 2 + 1)

global alphabet
alphabet = Alphabet('./alphabet.txt')
print('alphabet.size() ', alphabet.size())
print(alphabet._label_to_str)
# The number of characters in the target language plus one
global n_character
n_character = alphabet.size() + 1  # +1 for CTC blank label

global max_labellen
max_labellen = 6

global n_hidden
n_hidden = 128

trfile = 'data/speechcmd_train.csv'
cvfile = 'data/speechcmd_dev.csv'
testfile = 'data/speechcmd_test.csv'
Code example #7
File: import_slr57.py  Project: zh4r0nax/DeepSpeech
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))

def handle_args():
    parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
    parser.add_argument(dest='target_dir')
    parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
    parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')
    return parser.parse_args()

if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None

    def label_filter(label):
        if CLI_ARGS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                label = None
        return label

    _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)
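
A short, hedged walk-through of the label_filter above (the sample word is invented and assumes an ASCII-only filter alphabet): with --normalize the NFKD/ASCII round trip strips diacritics, validate_label() cleans the result, and a KeyError from ALPHABET.encode() causes the sample to be dropped.

# Hypothetical example, not part of the original importer:
# label_filter("café")  -> "cafe" after normalization and validation,
#                          or None if any remaining character is missing from ALPHABET.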
Code example #8
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""

    parser = argparse.ArgumentParser(
        description='Run Deepspeech inference to obtain char probabilities')
    parser.add_argument('--input-file',
                        type=str,
                        help='Path to the wav file',
                        action="store",
                        dest="input_file_path")
    parser.add_argument('--alphabet-file',
                        type=str,
                        help='Path to the alphabet.txt file',
                        action="store",
                        dest="alphabet_file_path")
    parser.add_argument('--model-file',
                        type=str,
                        help='Path to the tf model file',
                        action="store",
                        dest="model_file_path")
    parser.add_argument(
        '--predicted-character-count',
        type=int,
        help='Number of most likely characters to be displayed',
        action="store",
        dest="predicted_character_count",
        default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.io.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
        # print(graph_def.node)

    with tf.Graph().as_default() as graph:

        tf.import_graph_def(graph_def, name="prefix")

        # currently hardcoded values used during inference
        # (assumed values, not shown in the original snippet: n_steps is the chunk
        # size the frozen graph expects, n_cell_dim must match the model's n_hidden)
        n_steps = 16
        n_cell_dim = 2048

        with tf.compat.v1.Session(graph=graph) as session:

            features, features_len = audiofile_to_features(
                args.input_file_path)
            previous_state_c = np.zeros([1, n_cell_dim])
            previous_state_h = np.zeros([1, n_cell_dim])

            # Add batch dimension
            features = tf.expand_dims(features, 0)
            features_len = tf.expand_dims(features_len, 0)

            # Evaluate
            features = create_overlapping_windows(features).eval(
                session=session)
            features_len = features_len.eval(session=session)

            # we are interested only in the logits, not in CTC decoding
            inputs = {
                'input':
                graph.get_tensor_by_name('prefix/input_node:0'),
                'previous_state_c':
                graph.get_tensor_by_name('prefix/previous_state_c:0'),
                'previous_state_h':
                graph.get_tensor_by_name('prefix/previous_state_h:0'),
                'input_lengths':
                graph.get_tensor_by_name('prefix/input_lengths:0')
            }
            outputs = {
                'outputs': graph.get_tensor_by_name('prefix/raw_logits:0'),
                'new_state_c':
                graph.get_tensor_by_name('prefix/new_state_c:0'),
                'new_state_h':
                graph.get_tensor_by_name('prefix/new_state_h:0'),
            }

            logits = np.empty([0, 1, alphabet.size() + 1])

            # the frozen model only accepts input split into 16-step chunks;
            # if inference were run from a checkpoint instead (as in the single-inference path of the DeepSpeech script), this loop wouldn't be needed
            for i in range(0, features_len[0], n_steps):
                chunk = features[:, i:i + n_steps, :, :]
                chunk_length = chunk.shape[1]
                # pad with zeros if not enough steps (len(features) % FLAGS.n_steps != 0)
                if chunk_length < n_steps:
                    chunk = np.pad(chunk, ((0, 0), (0, n_steps - chunk_length),
                                           (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                # need to update the states with each loop iteration
                logits_step, previous_state_c, previous_state_h = session.run(
                    [
                        outputs['outputs'], outputs['new_state_c'],
                        outputs['new_state_h']
                    ],
                    feed_dict={
                        inputs['input']: chunk,
                        inputs['input_lengths']: [chunk_length],
                        inputs['previous_state_c']: previous_state_c,
                        inputs['previous_state_h']: previous_state_h,
                    })

                logits = np.concatenate((logits, logits_step))

            logits = np.squeeze(logits)

            row_output = []
            for j in range(args.predicted_character_count):
                row_output.append([])

            # now sort logits and turn them into characters + probabilities
            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i])
                indexes_sorted = softmax_output.argsort()[-args.predicted_character_count:][::-1]
                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet._string_from_label(char_index)
                        most_likely_chars += text + ' '
                        row_output[j].append(text)
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        row_output[j].append('-')
                        chars_probability += ' (' + str(
                            softmax_output[char_index]) + ')'
                print(most_likely_chars, " ", chars_probability)

            with open(args.input_file_path + "_acoustic.txt", "w") as out:
                for j in range(len(row_output)):
                    out.write(', '.join(row_output[j]) + "\n")
                    print(row_output[j])
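
The snippet above calls a softmax() helper that is not shown in the listing; a minimal NumPy implementation with the signature the code seems to expect might look like this (an assumption, not necessarily the project's own helper):

import numpy as np

def softmax(x):
    # Numerically stable softmax over the last axis (assumed helper).
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)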
Code example #9
def initialize_globals():
    c = AttrDict()

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus()

    # If there is no GPU available, we fall back to CPU based operation
    if not c.available_devices:
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if not FLAGS.checkpoint_dir:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if not FLAGS.summary_dir:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden

    c.n_hidden_1 = c.n_hidden

    c.n_hidden_2 = c.n_hidden

    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Size of audio window in samples
    c.audio_window_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_len /
                                                        1000)

    # Stride for feature computations in samples
    c.audio_step_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_step /
                                                      1000)

    if FLAGS.one_shot_infer:
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error(
                'Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    ConfigSingleton._config = c  # pylint: disable=protected-access
Code example #10
File: evaluate.py  Project: revirevy/DeepSpeech
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    alphabet)

    # sort examples by length; this improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
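        # For example (assumed values): with N_CONTEXT = 9 and N_FEATURES = 26,
        # a (T, 26) feature array becomes a (T - 18, 19, 26) strided view.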
        window_size = 2*N_CONTEXT+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []
        distances = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # Get number of accessible CPU cores for this process
        num_processes = len(os.sched_getaffinity(0))

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)
            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
            predictions.extend(d[0][1] for d in decoded)
            distances.extend(levenshtein(a, b) for a, b in zip(ground_truths, predictions))

    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
Code example #11
import os
import random
from util.audio import audiofile_to_input_vector
from util.text import text_to_char_array, Alphabet
import numpy as np


SAMPLE_RATE = 16000

training_percent = 0.9
validation_percent = 0.1
# test_percent = 0.05

numcontext = 9
numcep = 26

alphabet = Alphabet(os.path.abspath('/home/guest/Desktop/DeepSpeech/data/alphabet.txt'))


excluded_train_wavs = ['/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/060799_a/adb_0467/speech/scr0467/05/04670504/r4670396/u0396196.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/220799/adb_0467/speech/scr0467/05/04670505/r4670441/u0441079.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/280799/adb_0467/speech/scr0467/05/04670505/r4670451/u0451201.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598036.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598037.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598102.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672173.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672174.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672175.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672176.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672177.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672178.wav',
'/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672179.wav',
Code example #12
File: evaluate.py  Project: gmyzc520/DeepSpeech-1
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # sort examples by length; this improves packing of batches and timesteps
    test_data = preprocess(FLAGS.test_files.split(','),
                           FLAGS.test_batch_size,
                           alphabet=alphabet,
                           numcep=N_FEATURES,
                           numcontext=N_CONTEXT,
                           hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
                               by="features_len", ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size,
                         alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph,
                                    seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph,
                                      FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                    sparse_labels)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            batch_features_len = batch['features_len'].values
            full_step_len = np.full_like(batch_features_len, N_STEPS)

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len,
                                                full_step_len)

                # pad with zeros if the chunk does not have enough steps
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < N_STEPS:
                    chunk_features = np.pad(
                        chunk_features,
                        ((0, 0), (0, N_STEPS - steps_in_chunk), (0, 0),
                         (0, 0)),
                        mode='constant',
                        constant_values=0)

                output = session.run(outputs['outputs'],
                                     feed_dict={
                                         inputs['input']:
                                         chunk_features,
                                         inputs['input_lengths']:
                                         chunk_features_len,
                                     })
                logits = np.concatenate((logits, output))

                # we have processed N_STEPS so subtract from remaining steps
                batch_features_len -= N_STEPS
                # clip to zero
                batch_features_len = np.maximum(
                    batch_features_len, np.zeros_like(batch_features_len))

            logitses.append(logits)

        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for logits, batch in bar(
                zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(
                sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(
                sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

    wer, samples = calculate_report(ground_truths, predictions, distances,
                                    losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Filter out all items with WER=0 and take only the first report_count items
    report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                      FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples,
                  open(FLAGS.test_output_file, 'w'),
                  default=lambda x: float(x))
Code example #13
File: config.py  Project: temsa/DeepSpeech
def initialize_globals():
    c = AttrDict()

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus()

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden

    c.n_hidden_1 = c.n_hidden

    c.n_hidden_2 = c.n_hidden

    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error(
                'Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    ConfigSingleton._config = c
Code example #14
def initialize_globals():
    c = AttrDict()

    # The absolute number of computing nodes - regardless of cluster or single mode
    num_workers = 1

    # The device path base for this node
    c.worker_device = '/job:%s/task:%d' % ('localhost', 0)

    # This node's CPU device
    c.cpu_device = c.worker_device + '/cpu:0'

    # This node's available GPU devices
    c.available_devices = [c.worker_device + gpu for gpu in get_available_gpus()]

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(os.path.join('deepspeech','checkpoints'))

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(os.path.join('deepspeech','summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.log_placement,
                                      inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
                                      intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26 # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9 # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden

    c.n_hidden_1 = c.n_hidden

    c.n_hidden_2 = c.n_hidden

    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1 # +1 for CTC blank label

    # Determine if we are the chief worker
    c.is_chief = True

    ConfigSingleton._config = c
Code example #15
    write_csvs(extracted)
    cleanup(archive)


def handle_args():
    parser = argparse.ArgumentParser(
        description='Import German Distant Speech (TUDA)')
    parser.add_argument('base_dir', help='Directory containing all data')
    parser.add_argument('--max_duration',
                        type=int,
                        default=10000,
                        help='Maximum sample duration in milliseconds')
    parser.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    parser.add_argument(
        '--alphabet',
        help='Exclude samples with characters not in provided alphabet file')
    parser.add_argument('--keep_archive',
                        type=bool,
                        default=True,
                        help='If downloaded archives should be kept')
    return parser.parse_args()


if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.alphabet) if CLI_ARGS.alphabet else None
    download_and_prepare()
Code example #16
File: config.py  Project: Perpleex/DeepSpeech
def initialize_globals():
    c = AttrDict()

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if not FLAGS.checkpoint_dir:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto', 'transfer']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if not FLAGS.summary_dir:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tfv1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        gpu_options=tfv1.GPUOptions(allow_growth=FLAGS.use_allow_growth))

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus(c.session_config)

    # If there is no GPU available, we fall back to CPU based operation
    if not c.available_devices:
        c.available_devices = [c.cpu_device]

    if FLAGS.utf8:
        c.alphabet = UTF8Alphabet()
    else:
        c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden

    c.n_hidden_1 = c.n_hidden

    c.n_hidden_2 = c.n_hidden

    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Size of audio window in samples
    if (FLAGS.feature_win_len * FLAGS.audio_sample_rate) % 1000 != 0:
        log_error(
            '--feature_win_len value ({}) in milliseconds ({}) multiplied '
            'by --audio_sample_rate value ({}) must be an integer value. Adjust '
            'your --feature_win_len value or resample your audio accordingly.'
            ''.format(FLAGS.feature_win_len, FLAGS.feature_win_len / 1000,
                      FLAGS.audio_sample_rate))
        sys.exit(1)

    c.audio_window_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_len /
                                                        1000)

    # Stride for feature computations in samples
    if (FLAGS.feature_win_step * FLAGS.audio_sample_rate) % 1000 != 0:
        log_error(
            '--feature_win_step value ({}) in milliseconds ({}) multiplied '
            'by --audio_sample_rate value ({}) must be an integer value. Adjust '
            'your --feature_win_step value or resample your audio accordingly.'
            ''.format(FLAGS.feature_win_step, FLAGS.feature_win_step / 1000,
                      FLAGS.audio_sample_rate))
        sys.exit(1)

    c.audio_step_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_step /
                                                      1000)

    if FLAGS.one_shot_infer:
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error(
                'Path specified in --one_shot_infer is not a valid file.')
            sys.exit(1)

    ConfigSingleton._config = c  # pylint: disable=protected-access
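
As a quick sanity check of the window arithmetic above (assumed default flag values, not taken from the listing): with feature_win_len = 32 ms, feature_win_step = 20 ms and audio_sample_rate = 16000 Hz both divisibility checks pass, and the derived sizes are whole sample counts.

# Hypothetical worked example with assumed defaults.
audio_sample_rate = 16000
feature_win_len = 32    # ms
feature_win_step = 20   # ms
assert (feature_win_len * audio_sample_rate) % 1000 == 0
assert (feature_win_step * audio_sample_rate) % 1000 == 0
audio_window_samples = audio_sample_rate * (feature_win_len / 1000)   # 512.0 samples
audio_step_samples = audio_sample_rate * (feature_win_step / 1000)    # 320.0 samples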
Code example #17
    parser.add_argument(
        '--audio_dir',
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
    parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    parser.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    params = parser.parse_args()

    audio_dir = params.audio_dir if params.audio_dir else os.path.join(
        params.tsv_dir, 'clips')
    alphabet = Alphabet(
        params.filter_alphabet) if params.filter_alphabet else None

    def label_filter(label):
        if params.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if alphabet and label:
            try:
                [alphabet.label_from_string(c) for c in label]
            except KeyError:
                label = None
        return label

    _preprocess_data(params.tsv_dir, audio_dir, label_filter)
Code example #18
    PARSER.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    PARSER.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    PARSER.add_argument('--space_after_every_character',
                        action='store_true',
                        help='To help transcript join by white space')

    PARAMS = PARSER.parse_args()

    AUDIO_DIR = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(
        PARAMS.tsv_dir, 'clips')
    ALPHABET = Alphabet(
        PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
        if PARAMS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                label = None
        return label

    _preprocess_data(PARAMS.tsv_dir, AUDIO_DIR, label_filter_fun,
Code example #19
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""

    parser = argparse.ArgumentParser(description='Run Deepspeech inference to obtain char probabilities')
    parser.add_argument('--input-file', type=str,
                        help='Path to the wav file', action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str,
                        help='Path to the alphabet.txt file', action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str,
                        help='Path to the tf model file', action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed', action="store",
                        dest="predicted_character_count", default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:

        tf.import_graph_def(graph_def, name="prefix")

        # currently hardcoded values used during inference
        n_input = 26
        n_context = 9
        n_steps = 16

        with tf.Session(graph=graph) as session:
            session.run('prefix/initialize_state')

            features = util.audio.audiofile_to_input_vector(args.input_file_path, n_input, n_context)
            num_strides = len(features) - (n_context * 2)
            window_size = 2 * n_context + 1

            features = np.lib.stride_tricks.as_strided(
                features,
                (num_strides, window_size, n_input),
                (features.strides[0], features.strides[0], features.strides[1]),
                writeable=False)


            # we are interested only in the logits, not in CTC decoding
            inputs = {'input': graph.get_tensor_by_name('prefix/input_node:0'),
                      'input_lengths': graph.get_tensor_by_name('prefix/input_lengths:0')}
            outputs = {'outputs': graph.get_tensor_by_name('prefix/logits:0')}

            logits = np.empty([0, 1, alphabet.size() + 1])


            for i in range(0, len(features), n_steps):
                chunk = features[i:i + n_steps]

                # pad with zeros if not enough steps (len(features) % FLAGS.n_steps != 0)
                if len(chunk) < n_steps:
                    chunk = np.pad(chunk,
                                   (
                                       (0, n_steps - len(chunk)),
                                       (0, 0),
                                       (0, 0)
                                   ),
                                   mode='constant',
                                   constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: [chunk],
                    inputs['input_lengths']: [len(chunk)],
                })
                logits = np.concatenate((logits, output))

            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i][0])
                indexes_sorted = softmax_output.argsort()[args.predicted_character_count * -1:][::-1]
                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet.string_from_label(char_index)
                        most_likely_chars += text+' '
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'
                print(most_likely_chars, " ", chars_probability)