Example #1
def character_based():
    is_character_based = False
    if FLAGS.scorer_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                        Config.alphabet)
        is_character_based = scorer.is_utf8_mode()
    return is_character_based
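A minimal standalone sketch of the same check, assuming a DeepSpeech setup where ds_ctcdecoder is installed and util.text provides Alphabet (file names and weights here are placeholders):

from ds_ctcdecoder import Scorer
from util.text import Alphabet  # module path varies across DeepSpeech versions

alphabet = Alphabet('alphabet.txt')                    # placeholder alphabet file
scorer = Scorer(0.75, 1.85, 'kenlm.scorer', alphabet)  # alpha, beta, package, alphabet
print(scorer.is_utf8_mode())                           # True for byte-level (UTF-8) scorers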
Example #2
    def classify(self, audio_path):
        """
        Classify with 3rd-party ctc beam search
        """
        sample = self.load_audio(audio_path)

        # Apply softmax for CTC decoder
        logits = tf.nn.softmax(self.logits, name='logits')

        logits = logits.eval(feed_dict={self.audio: sample}, session=self.sess)
        print('logits:', logits)

        logits = np.squeeze(logits)

        if FLAGS.lm_binary_path:
            self.scorer = Scorer(
                FLAGS.lm_alpha, FLAGS.lm_beta,
                os.path.join('DeepSpeech', FLAGS.lm_binary_path),
                os.path.join('DeepSpeech', FLAGS.lm_trie_path),
                Config.alphabet)
        else:
            self.scorer = None

        r = ctc_beam_search_decoder(logits,
                                    Config.alphabet,
                                    FLAGS.beam_width,
                                    scorer=self.scorer,
                                    cutoff_prob=FLAGS.cutoff_prob,
                                    cutoff_top_n=FLAGS.cutoff_top_n)

        # Print highest probability result
        print(r[0][1])
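The decoder returns beams ordered best-first, each a (confidence, transcript) pair, which is why r[0][1] above is the top transcript. Inspecting every beam is a short loop:

for confidence, transcript in r:  # best beam first
    print('{:.2f}  {}'.format(confidence, transcript))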
Example #3
def transcribe_file(audio_path, tlog_path):
    from DeepSpeech import create_model  # pylint: disable=cyclic-import,import-outside-toplevel
    from util.checkpoints import load_or_init_graph
    initialize_globals()
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                    Config.alphabet)
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1
    with AudioFile(audio_path, as_path=True) as wav_path:
        data_set = split_audio_file(
            wav_path,
            batch_size=FLAGS.batch_size,
            aggressiveness=FLAGS.vad_aggressiveness,
            outlier_duration_ms=FLAGS.outlier_duration_ms,
            outlier_batch_size=FLAGS.outlier_batch_size)
        iterator = tf.data.Iterator.from_structure(
            data_set.output_types,
            data_set.output_shapes,
            output_classes=data_set.output_classes)
        batch_time_start, batch_time_end, batch_x, batch_x_len = iterator.get_next()
        no_dropout = [None] * 6
        logits, _ = create_model(batch_x=batch_x,
                                 seq_length=batch_x_len,
                                 dropout=no_dropout)
        transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
        tf.train.get_or_create_global_step()
        with tf.Session(config=Config.session_config) as session:
            if FLAGS.load == 'auto':
                method_order = ['best', 'last']
            else:
                method_order = [FLAGS.load]
            load_or_init_graph(session, method_order)
            session.run(iterator.make_initializer(data_set))
            transcripts = []
            while True:
                try:
                    starts, ends, batch_logits, batch_lengths = \
                        session.run([batch_time_start, batch_time_end, transposed, batch_x_len])
                except tf.errors.OutOfRangeError:
                    break
                decoded = ctc_beam_search_decoder_batch(
                    batch_logits,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=num_processes,
                    scorer=scorer)
                decoded = list(d[0][1] for d in decoded)
                transcripts.extend(zip(starts, ends, decoded))
            transcripts.sort(key=lambda t: t[0])
            transcripts = [{
                'start': int(start),
                'end': int(end),
                'transcript': transcript
            } for start, end, transcript in transcripts]
            with open(tlog_path, 'w') as tlog_file:
                json.dump(transcripts, tlog_file, default=float)
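The resulting transcription log is plain JSON, so it can be consumed without TensorFlow; a sketch (the file name is whatever was passed as tlog_path):

import json

with open('audio.tlog') as tlog_file:
    for segment in json.load(tlog_file):
        print(segment['start'], segment['end'], segment['transcript'])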
Example #4
def do_single_file_inference(input_file_path):
    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Create a saver using variables from the above newly created graph
        saver = tfv1.train.Saver()

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        #       over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        features, features_len = audiofile_to_features(input_file_path)
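        # Fresh utterance: start decoding from zeroed LSTM cell/hidden states.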
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        logits = outputs['outputs'].eval(feed_dict={
            inputs['input']: features,
            inputs['input_lengths']: features_len,
            inputs['previous_state_c']: previous_state_c,
            inputs['previous_state_h']: previous_state_h,
        }, session=session)

        logits = np.squeeze(logits)

        if FLAGS.lm_binary_path:
            scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                            FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                            Config.alphabet)
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(logits,
                                          Config.alphabet,
                                          FLAGS.beam_width,
                                          scorer=scorer,
                                          cutoff_prob=FLAGS.cutoff_prob,
                                          cutoff_top_n=FLAGS.cutoff_top_n)
        # Print highest probability result
        print(decoded[0][1])
Example #5
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = create_inference_graph(batch_size=1,
                                                         n_steps=-1)

        # REVIEW josephz: Hack: print all layers here.
        for i, l in enumerate(layers):
            print("layer '{}': '{}'".format(i, l))

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        #       over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        session.run(outputs['initialize_state'])

        features = audiofile_to_input_vector(input_file_path, Config.n_input,
                                             Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        logits = session.run(outputs['outputs'],
                             feed_dict={
                                 inputs['input']: [features],
                                 inputs['input_lengths']: [num_strides],
                             })

        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
        decoded = ctc_beam_search_decoder(logits,
                                          Config.alphabet,
                                          FLAGS.beam_width,
                                          scorer=scorer)
        # Print highest probability result
        print(decoded[0][1])
Example #6
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        #       over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        session.run(outputs['initialize_state'])

        features, features_len = audiofile_to_features(input_file_path)

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        logits = outputs['outputs'].eval(feed_dict={
            inputs['input']: features,
            inputs['input_lengths']: features_len,
        }, session=session)

        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
        decoded = ctc_beam_search_decoder(logits,
                                          Config.alphabet,
                                          FLAGS.beam_width,
                                          scorer=scorer)
        # Print highest probability result
        print(decoded[0][1])
Example #7
def do_single_file_inference(input_file_path):
    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Restore variables from training checkpoint
        if FLAGS.load == 'auto':
            method_order = ['best', 'last']
        else:
            method_order = [FLAGS.load]
        load_or_init_graph(session, method_order)

        features, features_len = audiofile_to_features(input_file_path)
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        logits = outputs['outputs'].eval(feed_dict={
            inputs['input']: features,
            inputs['input_lengths']: features_len,
            inputs['previous_state_c']: previous_state_c,
            inputs['previous_state_h']: previous_state_h,
        }, session=session)

        logits = np.squeeze(logits)

        if FLAGS.scorer_path:
            scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                            Config.alphabet)
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(logits,
                                          Config.alphabet,
                                          FLAGS.beam_width,
                                          scorer=scorer,
                                          cutoff_prob=FLAGS.cutoff_prob,
                                          cutoff_top_n=FLAGS.cutoff_top_n)
        # Print highest probability result
        print(decoded[0][1])
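Note the two Scorer signatures seen across these examples: builds that expose --lm_binary_path/--lm_trie_path take the KenLM binary and trie as separate files, while builds that expose --scorer_path take a single bundled package. Only one form matches any given ds_ctcdecoder release:

# Older flags: separate KenLM binary and trie
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                FLAGS.lm_binary_path, FLAGS.lm_trie_path, Config.alphabet)
# Newer flags: a single bundled .scorer package
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                FLAGS.scorer_path, Config.alphabet)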
Example #8
 def __init__(self, time_step, feature_size, hidden_size, output_size,
              num_rnn_layers):
     super(IAMModel, self).__init__()
     self.cnn = CNN(time_step=time_step)
     self.rnn = RNN(feature_size=feature_size,
                    hidden_size=hidden_size,
                    output_size=output_size,
                    num_layers=num_rnn_layers)
     self.time_step = time_step
     self.alphabet = Alphabet(os.path.abspath("chars.txt"))
     self.scorer = Scorer(alphabet=self.alphabet,
                          scorer_path='iam_uncased.scorer',
                          alpha=0.75,
                          beta=1.85)
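A hypothetical companion method (not part of the original snippet) showing how this scorer would be consumed, assuming the network output has been converted to per-frame softmax probabilities of shape [time, num_labels + 1]:

 def decode(self, probs):
     # probs: numpy array [time, num_labels + 1] of softmax outputs (assumption)
     from ds_ctcdecoder import ctc_beam_search_decoder
     beams = ctc_beam_search_decoder(probs, self.alphabet, 256,
                                     scorer=self.scorer)
     return beams[0][1]  # best-scoring transcript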
Example #9
def do_single_file_inference(input_file_path):
    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Create a saver using variables from the above newly created graph
        saver = tfv1.train.Saver()

        # Restore variables from training checkpoint
        loaded = False
        if not loaded and FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, saver, 'checkpoint', 'most recent', load_step=False)
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, saver, 'best_dev_checkpoint', 'best validation', load_step=False)
        if not loaded:
            print('Could not load checkpoint from {}'.format(FLAGS.checkpoint_dir))
            sys.exit(1)

        features, features_len = audiofile_to_features(input_file_path)
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        logits = outputs['outputs'].eval(feed_dict={
            inputs['input']: features,
            inputs['input_lengths']: features_len,
            inputs['previous_state_c']: previous_state_c,
            inputs['previous_state_h']: previous_state_h,
        }, session=session)

        logits = np.squeeze(logits)

        if FLAGS.lm_binary_path:
            scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                            FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                            Config.alphabet)
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
                                          scorer=scorer, cutoff_prob=FLAGS.cutoff_prob,
                                          cutoff_top_n=FLAGS.cutoff_top_n)
        # Print highest probability result
        print(decoded[0][1])
Example #10
def early_training_checks():
    # Check for proper scorer early
    if FLAGS.scorer_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                        FLAGS.scorer_path, Config.alphabet)
        del scorer

    if FLAGS.train_files and FLAGS.test_files and FLAGS.load_checkpoint_dir != FLAGS.save_checkpoint_dir:
        log_warn('WARNING: You specified different values for --load_checkpoint_dir '
                 'and --save_checkpoint_dir, but you are running training and testing '
                 'in a single invocation. The testing step will respect --load_checkpoint_dir, '
                 'and thus WILL NOT TEST THE CHECKPOINT CREATED BY THE TRAINING STEP. '
                 'Train and test in two separate invocations, specifying the correct '
                 '--load_checkpoint_dir in both cases, or use the same location '
                 'for loading and saving.')
Example #11
 def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats):
     super(SpeechRecognitionModel, self).__init__()
     n_feats = n_feats // 2
     self.cnn = nn.Conv2d(1, 32, 3, stride=2, padding=1)
     self.res_cnn = nn.Sequential(*[
         ResidualCNN(32, 32, kernel=3, n_feats=n_feats)
         for _ in range(n_cnn_layers)
     ])
     self.fc = nn.Linear(n_feats * 32, rnn_dim)
     self.rnn = nn.Sequential(*[
         RNN(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
             hidden_size=rnn_dim,
             batch_first=i == 0) for i in range(n_rnn_layers)
     ])
     self.dense = nn.Sequential(nn.Linear(rnn_dim * 2, rnn_dim), nn.GELU(),
                                nn.Dropout(0.2),
                                nn.Linear(rnn_dim, n_class))
     self.alphabet = Alphabet(os.path.abspath("chars.txt"))
     self.scorer = Scorer(alphabet=self.alphabet,
                          scorer_path='librispeech.scorer',
                          alpha=0.75,
                          beta=1.85)
Example #12
def evaluate(test_csvs, create_model, try_loading):
    if FLAGS.lm_binary_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
    else:
        scorer = None

    test_csvs = FLAGS.test_files.split(',')
    test_sets = [
        create_dataset([csv],
                       batch_size=FLAGS.test_batch_size,
                       train_phase=False) for csv in test_csvs
    ]
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(test_sets[0]),
        tfv1.data.get_output_shapes(test_sets[0]),
        output_classes=tfv1.data.get_output_classes(test_sets[0]))
    test_init_ops = [
        iterator.make_initializer(test_set) for test_set in test_sets
    ]

    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             batch_size=FLAGS.test_batch_size,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2]))

    loss = tfv1.nn.ctc_loss(labels=batch_y,
                            inputs=logits,
                            sequence_length=batch_x_len)

    tfv1.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Create a saver using variables from the above newly created graph
    saver = tfv1.train.Saver()

    with tfv1.Session(config=Config.session_config) as session:
        # Restore variables from training checkpoint
        loaded = try_loading(session, saver, 'best_dev_checkpoint',
                             'best validation')
        if not loaded:
            loaded = try_loading(session, saver, 'checkpoint', 'most recent')
        if not loaded:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        def run_test(init_op, dataset):
            wav_filenames = []
            losses = []
            predictions = []
            ground_truths = []

            bar = create_progressbar(prefix='Test epoch | ',
                                     widgets=[
                                         'Steps: ',
                                         progressbar.Counter(), ' | ',
                                         progressbar.Timer()
                                     ]).start()
            log_progress('Test epoch...')

            step_count = 0

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # First pass, compute losses and transposed logits for decoding
            while True:
                try:
                    batch_wav_filenames, batch_logits, batch_loss, batch_lengths, batch_transcripts = \
                        session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
                except tf.errors.OutOfRangeError:
                    break

                decoded = ctc_beam_search_decoder_batch(
                    batch_logits,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=num_processes,
                    scorer=scorer,
                    cutoff_prob=FLAGS.cutoff_prob,
                    cutoff_top_n=FLAGS.cutoff_top_n)
                predictions.extend(d[0][1] for d in decoded)
                ground_truths.extend(
                    sparse_tensor_value_to_texts(batch_transcripts,
                                                 Config.alphabet))
                wav_filenames.extend(
                    wav_filename.decode('UTF-8')
                    for wav_filename in batch_wav_filenames)
                losses.extend(batch_loss)

                step_count += 1
                bar.update(step_count)

            bar.finish()

            wer, cer, samples = calculate_report(wav_filenames, ground_truths,
                                                 predictions, losses)
            mean_loss = np.mean(losses)

            # Take only the first report_count items
            report_samples = itertools.islice(samples, FLAGS.report_count)

            print('Test on %s - WER: %f, CER: %f, loss: %f' %
                  (dataset, wer, cer, mean_loss))
            print('-' * 80)
            for sample in report_samples:
                print('WER: %f, CER: %f, loss: %f' %
                      (sample.wer, sample.cer, sample.loss))
                print(' - wav: file://%s' % sample.wav_filename)
                print(' - src: "%s"' % sample.src)
                print(' - res: "%s"' % sample.res)
                print('-' * 80)

            return samples

        samples = []
        for csv, init_op in zip(test_csvs, test_init_ops):
            print('Testing model on {}'.format(csv))
            samples.extend(run_test(init_op, dataset=csv))
        return samples
Example #13
def early_checks():
    # Check for proper scorer early
    if FLAGS.scorer_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                        Config.alphabet)
        del scorer
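Constructing the Scorer is itself the validation: a bad package fails in the constructor. A sketch that surfaces the failure explicitly, assuming the Python wrapper raises ValueError on a bad package and reusing log_error/sys.exit as in the other examples:

def early_checks_strict():
    if FLAGS.scorer_path:
        try:
            scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                            Config.alphabet)
        except ValueError as err:
            log_error('Invalid scorer package: {}'.format(err))
            sys.exit(1)
        del scorer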
Example #14
def evaluate_with_pruning(test_csvs,
                          prune_percentage,
                          random,
                          scores_file,
                          result_file,
                          verbose=True,
                          skip_lstm=False):
    '''Code originally comes from the DeepSpeech repository (./DeepSpeech/evaluate.py).
    The code is adapted for evaluation on pruned versions of the DeepSpeech model.
    '''
    tfv1.reset_default_graph()
    if FLAGS.lm_binary_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
    else:
        scorer = None

    test_csvs = test_csvs.split(',')
    test_sets = [
        create_dataset([csv],
                       batch_size=FLAGS.test_batch_size,
                       train_phase=False) for csv in test_csvs
    ]
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(test_sets[0]),
        tfv1.data.get_output_shapes(test_sets[0]),
        output_classes=tfv1.data.get_output_classes(test_sets[0]))
    test_init_ops = [
        iterator.make_initializer(test_set) for test_set in test_sets
    ]

    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             batch_size=FLAGS.test_batch_size,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2]))

    loss = tfv1.nn.ctc_loss(labels=batch_y,
                            inputs=logits,
                            sequence_length=batch_x_len)

    tfv1.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Create a saver using variables from the above newly created graph
    saver = tfv1.train.Saver()

    with tfv1.Session(config=Config.session_config) as session:

        # Create a saver using variables from the above newly created graph
        saver = tfv1.train.Saver()

        # Restore variables from training checkpoint
        loaded = False
        if not loaded and FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session,
                                 saver,
                                 'checkpoint',
                                 'most recent',
                                 load_step=False)
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session,
                                 saver,
                                 'best_dev_checkpoint',
                                 'best validation',
                                 load_step=False)
        if not loaded:
            print('Could not load checkpoint from {}'.format(
                FLAGS.checkpoint_dir))
            sys.exit(1)

        ###### PRUNING PART ######

        if not prune_percentage:
            if verbose: print('No pruning done.')
        else:
            if verbose: print('-' * 80)
            if verbose: print('pruning with {}%...'.format(prune_percentage))
            scores_per_layer = np.load(scores_file)
            layer_masks = prune_matrices(scores_per_layer,
                                         prune_percentage=prune_percentage,
                                         random=random,
                                         verbose=verbose,
                                         skip_lstm=skip_lstm)

            n_layers_to_prune = len(layer_masks)
            i = 0
            for index, v in enumerate(tf.trainable_variables()):
                lstm_layer_name = 'cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel:0'
                if 'weights' not in v.name and v.name != lstm_layer_name:
                    continue
                if i >= n_layers_to_prune:
                    break  # every layer that has a mask has been handled
                # make mask into the shape of the weights
                if v.name == lstm_layer_name:
                    if skip_lstm: continue
                    # Shape of LSTM weights: [(2*neurons), (4*neurons)]
                    cell_template = np.ones((2, 4))
                    mask = np.repeat(layer_masks[i], v.shape[0] // 2, axis=0)
                    mask = mask.reshape(
                        [layer_masks[i].shape[0], v.shape[0] // 2])
                    mask = np.swapaxes(mask, 0, 1)
                    mask = np.kron(mask, cell_template)
                else:
                    mask = np.repeat(layer_masks[i], v.shape[0], axis=0)
                    mask = mask.reshape([layer_masks[i].shape[0], v.shape[0]])
                    mask = np.swapaxes(mask, 0, 1)

                # apply mask to weights
                session.run(v.assign(tf.multiply(v, mask)))
                i += 1

        ###### END PRUNING PART ######

        def run_test(init_op, dataset):
            wav_filenames = []
            losses = []
            predictions = []
            ground_truths = []

            bar = create_progressbar(prefix='Test epoch | ',
                                     widgets=[
                                         'Steps: ',
                                         progressbar.Counter(), ' | ',
                                         progressbar.Timer()
                                     ]).start()
            log_progress('Test epoch...')

            step_count = 0

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # First pass, compute losses and transposed logits for decoding
            while True:
                try:
                    batch_wav_filenames, batch_logits, batch_loss, batch_lengths, batch_transcripts = \
                        session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
                except tf.errors.OutOfRangeError:
                    break

                decoded = ctc_beam_search_decoder_batch(
                    batch_logits,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=num_processes,
                    scorer=scorer,
                    cutoff_prob=FLAGS.cutoff_prob,
                    cutoff_top_n=FLAGS.cutoff_top_n)
                predictions.extend(d[0][1] for d in decoded)
                ground_truths.extend(
                    sparse_tensor_value_to_texts(batch_transcripts,
                                                 Config.alphabet))
                wav_filenames.extend(
                    wav_filename.decode('UTF-8')
                    for wav_filename in batch_wav_filenames)
                losses.extend(batch_loss)

                step_count += 1
                bar.update(step_count)

            bar.finish()

            wer, cer, samples = calculate_report(wav_filenames, ground_truths,
                                                 predictions, losses)
            mean_loss = np.mean(losses)

            # Take only the first report_count items
            report_samples = itertools.islice(samples, FLAGS.report_count)

            if verbose:
                print('Test on %s - WER: %f, CER: %f, loss: %f' %
                      (dataset, wer, cer, mean_loss))
            if verbose: print('-' * 80)

            if result_file:
                pruning_type = 'score-based' if not random else 'random'
                result_string = '''Results for evaluating model with pruning percentage of {}% and {} pruning:
                Test on {} - WER: {}, CER: {}, loss: {}
                '''.format(prune_percentage * 100, pruning_type, dataset, wer,
                           cer, mean_loss)
                write_to_file(result_file, result_string, 'a+')

            return wer, cer, mean_loss

        results = []
        for csv, init_op in zip(test_csvs, test_init_ops):
            if verbose: print('Testing model on {}'.format(csv))
            results.extend(run_test(init_op, dataset=csv))
        return results
Example #15
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))

    if not alphabet_path:
        raise RuntimeError("No --alphabet path specified, can't continue.")
    serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        raise RuntimeError("Error loading alphabet: {}".format(err))

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    # TODO: Why is this not working?
    #err = scorer.load_lm(lm_path)
    #if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
    #    print('Error loading language model file: 0x{:X}.'.format(err))
    #    print('See the error codes section in https://deepspeech.readthedocs.io for a description.')
    #    sys.exit(1)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
Example #16
def evaluate(test_csvs, create_model):
    if FLAGS.scorer_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                        Config.alphabet)
    else:
        scorer = None

    test_csvs = FLAGS.test_files.split(',')
    test_sets = [
        create_dataset([csv],
                       batch_size=FLAGS.test_batch_size,
                       train_phase=False) for csv in test_csvs
    ]
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(test_sets[0]),
        tfv1.data.get_output_shapes(test_sets[0]),
        output_classes=tfv1.data.get_output_classes(test_sets[0]))
    test_init_ops = [
        iterator.make_initializer(test_set) for test_set in test_sets
    ]

    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             batch_size=FLAGS.test_batch_size,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2]))

    loss = tfv1.nn.ctc_loss(labels=batch_y,
                            inputs=logits,
                            sequence_length=batch_x_len)

    tfv1.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    with tfv1.Session(config=Config.session_config) as session:
        if FLAGS.load == 'auto':
            method_order = ['best', 'last']
        else:
            method_order = [FLAGS.load]
        load_or_init_graph(session, method_order)

        def run_test(init_op, dataset):
            wav_filenames = []
            losses = []
            predictions = []
            ground_truths = []

            bar = create_progressbar(prefix='Test epoch | ',
                                     widgets=[
                                         'Steps: ',
                                         progressbar.Counter(), ' | ',
                                         progressbar.Timer()
                                     ]).start()
            log_progress('Test epoch...')

            step_count = 0

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # First pass, compute losses and transposed logits for decoding
            while True:
                try:
                    batch_wav_filenames, batch_logits, batch_loss, batch_lengths, batch_transcripts = \
                        session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
                except tf.errors.OutOfRangeError:
                    break

                decoded = ctc_beam_search_decoder_batch(
                    batch_logits,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=num_processes,
                    scorer=scorer,
                    cutoff_prob=FLAGS.cutoff_prob,
                    cutoff_top_n=FLAGS.cutoff_top_n)
                predictions.extend(d[0][1] for d in decoded)
                ground_truths.extend(
                    sparse_tensor_value_to_texts(batch_transcripts,
                                                 Config.alphabet))
                wav_filenames.extend(
                    wav_filename.decode('UTF-8')
                    for wav_filename in batch_wav_filenames)
                losses.extend(batch_loss)

                step_count += 1
                bar.update(step_count)

            bar.finish()

            # Print test summary
            test_samples = calculate_and_print_report(wav_filenames,
                                                      ground_truths,
                                                      predictions, losses,
                                                      dataset)
            return test_samples

        samples = []
        for csv, init_op in zip(test_csvs, test_init_ops):
            print('Testing model on {}'.format(csv))
            samples.extend(run_test(init_op, dataset=csv))
        return samples
Example #17
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))

    cbm = "Looks" if vocab_looks_char_based else "Doesn't look"
    print("{} like a character based model.".format(cbm))

    if force_utf8 is not None:
        use_utf8 = force_utf8.value
    else:
        use_utf8 = vocab_looks_char_based
        print("Using detected UTF-8 mode: {}".format(use_utf8))

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            raise RuntimeError("No --alphabet path specified, can't continue.")
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        raise RuntimeError("Error loading alphabet: {}".format(err))

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    err = scorer.load_lm(lm_path)
    if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
        print('Error loading language model file: 0x{:X}.'.format(err))
        print(
            'See the error codes section in https://deepspeech.readthedocs.io for a description.'
        )
        sys.exit(1)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
Example #18
def evaluate(test_data, inference_graph):
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                    FLAGS.lm_trie_path, Config.alphabet)

    def create_windows(features):
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = inference_graph
        layer_4 = layers['rnn_output']
        layer_5 = layers['layer_5']
        layer_6 = layers['layer_6']
        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None],
                                   name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size],
                                          name="label_lengths")

        # We add 1 to all elements of the transcript to avoid any zero values
        # since we use that as an end-of-sequence token for converting the batch
        # into a SparseTensor. So here we convert the placeholder back into a
        # SparseTensor and subtract ones to get the real labels.
        sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
        neg_ones = tf.SparseTensor(sparse_labels.indices,
                                   -1 * tf.ones_like(sparse_labels.values),
                                   sparse_labels.dense_shape)
        sparse_labels = tf.sparse_add(sparse_labels, neg_ones)

        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []
        # Buffers for collecting the intermediate-layer embeddings
        layer_4s = []
        layer_5s = []
        layer_6s = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        print('Batch Count: ', batch_count)
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widgets=[progressbar.AdaptiveETA()])

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])
            # TODO: Remove this assertion to generalize beyond batch size 1.
            assert FLAGS.test_batch_size == 1, 'Embedding Extraction will only work for Batch Size = 1 for now!'

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values + 1)
            label_lengths = batch['transcript_len'].values

            logits, loss_, lay4, lay5, lay6 = session.run(
                [transposed, loss, layer_4, layer_5, layer_6],
                feed_dict={
                    inputs['input']: features,
                    inputs['input_lengths']: features_len,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            logitses.append(logits)
            losses.extend(loss_)
            layer_4s.append(lay4)
            layer_5s.append(lay5)
            layer_6s.append(lay6)
            print('Saving to Files: ')
            #lay4.tofile('embeddings/lay4.txt')
            #lay5.tofile('embeddings/lay5.txt')
            #lay6.tofile('embeddings/lay6.txt')
            #            np.save('embeddings/lay41.npy', lay4)
            filename = batch.fname.iloc[0]
            save_np_array(lay4, Config.LAYER4 + filename + '.npy')
            save_np_array(lay5, Config.LAYER5 + filename + '.npy')
            save_np_array(lay6, Config.LAYER6 + filename + '.npy')
            #            print('\nLayer 4 Shape: ', load_np_array('embeddings/lay41.npy').shape)
            #            print('\nLayer 4 Shape: ', np.load('embeddings/lay41.npy').shape)
            print('Layer 5 Shape: ', lay5.shape)
            print('Layer 6 Shape: ', lay6.shape)
    print('LAYER4: ', Config.LAYER4)
    ground_truths = []
    predictions = []
    fnames = []

    print('Decoding predictions...')
    bar = progressbar.ProgressBar(max_value=batch_count,
                                  widgets=[progressbar.AdaptiveETA()])

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Second pass, decode logits and compute WER and edit distance metrics
    for logits, batch in bar(
            zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
        seq_lengths = batch['features_len'].values.astype(np.int32)
        decoded = ctc_beam_search_decoder_batch(logits,
                                                seq_lengths,
                                                Config.alphabet,
                                                FLAGS.beam_width,
                                                num_processes=num_processes,
                                                scorer=scorer)
        #print('Batch\n', batch)
        ground_truths.extend(
            Config.alphabet.decode(l) for l in batch['transcript'])
        fnames.extend([l for l in batch['fname']])
        #fnames.append(batch['fname'])
        #print(fnames)
        predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, cer, samples = calculate_report(ground_truths, predictions, distances,
                                         losses, fnames)
    print('Sample Lengths: ', len(samples))
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)
    print('Test - WER: %f, CER: %f, loss: %f' % (wer, cer, mean_loss))
    print('-' * 80)
    count = 0
    for sample in report_samples:
        count += 1
        with open(Config.TEXT + sample.fname + '.txt', 'w') as f:
            f.write(sample.res)
        print("File Name: ", sample.fname)
        print('WER: %f, CER: %f, loss: %f' %
              (sample.wer, sample.distance, sample.loss))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)
    print('Total Count: ', count)
    return samples
Example #19

# Run on a specific GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.2
config.gpu_options.allow_growth = True  # Allocate GPU memory on demand instead of grabbing it all

# Load the language model and set its weights
lm_binary_path = 'LM_model/lm.klm'
lm_trie_path = 'LM_model/lm_trie'
alphabet_path = 'LM_model/alphabet_zh.txt'
language_model_weight = 0.75
word_count_weight = 1.85
alphabet = Alphabet(os.path.abspath(alphabet_path))
LM_model = Scorer(language_model_weight, word_count_weight, lm_binary_path,
                  lm_trie_path, alphabet)

# Load the acoustic model
load_dir = "speech_model/"
speech_model = "speech_model/20190804model.pb"
with gfile.FastGFile(speech_model, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
    input_x, output_y, relu = tf.import_graph_def(
        graph_def,
        name='',
        return_elements=[
            "acoustic_input:0", "time_distributed_1/Reshape_1:0",
            "conv1d/Relu:0"
        ])
speech_sess = tf.Session(graph=tf.get_default_graph(), config=config)
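A hypothetical continuation (not in the original): run the imported acoustic graph on a feature batch and decode its output with the loaded language model. The feature shape below is a placeholder; the real preprocessing is not shown in the snippet.

import numpy as np
from ds_ctcdecoder import ctc_beam_search_decoder

features = np.zeros((1, 1600, 200), dtype=np.float32)  # placeholder; use real features
probs = speech_sess.run(output_y, feed_dict={input_x: features})
beams = ctc_beam_search_decoder(probs[0], alphabet, 500, scorer=LM_model)
print(beams[0][1])  # best transcript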
Example #20
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    print("{} like a character based model.".format(
        "Looks" if vocab_looks_char_based else "Doesn't look"))

    if force_utf8 is not None:
        use_utf8 = force_utf8.value
        print("Forcing UTF-8 mode = {}".format(use_utf8))
    else:
        use_utf8 = vocab_looks_char_based

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            print("No --alphabet path specified, can't continue.")
            sys.exit(1)
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        print("Error loading alphabet: {}".format(err))
        sys.exit(1)

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
Example #21
def evaluate(test_csvs, create_model, try_loading):
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                    FLAGS.lm_trie_path, Config.alphabet)

    test_set = create_dataset(test_csvs,
                              batch_size=FLAGS.test_batch_size,
                              cache_path=FLAGS.test_cached_features_path)
    it = test_set.make_one_shot_iterator()

    (batch_x, batch_x_len), batch_y = it.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))

    loss = tf.nn.ctc_loss(labels=batch_y,
                          inputs=logits,
                          sequence_length=batch_x_len)

    global_step = tf.train.get_or_create_global_step()

    with tf.Session(config=Config.session_config) as session:
        # Create a saver using variables from the above newly created graph
        saver = tf.train.Saver()

        # Restore variables from training checkpoint
        loaded = try_loading(session, saver, 'best_dev_checkpoint',
                             'best validation')
        if not loaded:
            loaded = try_loading(session, saver, 'checkpoint', 'most recent')
        if not loaded:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        logitses = []
        losses = []
        seq_lengths = []
        ground_truths = []

        print('Computing acoustic model predictions...')
        bar = progressbar.ProgressBar(widgets=[
            'Steps: ',
            progressbar.Counter(), ' | ',
            progressbar.Timer()
        ])

        step_count = 0

        # First pass, compute losses and transposed logits for decoding
        while True:
            try:
                logits, loss_, lengths, transcripts = session.run(
                    [transposed, loss, batch_x_len, batch_y])
            except tf.errors.OutOfRangeError:
                break

            step_count += 1
            bar.update(step_count)

            logitses.append(logits)
            losses.extend(loss_)
            seq_lengths.append(lengths)
            ground_truths.extend(
                sparse_tensor_value_to_texts(transcripts, Config.alphabet))

    bar.finish()

    predictions = []

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    print('Decoding predictions...')
    bar = progressbar.ProgressBar(max_value=step_count,
                                  widgets=[progressbar.AdaptiveETA()])

    # Second pass, decode logits and compute WER and edit distance metrics
    for logits, seq_length in bar(zip(logitses, seq_lengths)):
        decoded = ctc_beam_search_decoder_batch(logits,
                                                seq_length,
                                                Config.alphabet,
                                                FLAGS.beam_width,
                                                num_processes=num_processes,
                                                scorer=scorer)
        predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, cer, samples = calculate_report(ground_truths, predictions, distances,
                                         losses)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, CER: %f, loss: %f' % (wer, cer, mean_loss))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, CER: %f, loss: %f' %
              (sample.wer, sample.distance, sample.loss))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    return samples
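levenshtein above is plain edit distance between reference and prediction; for reference, a self-contained version (the repository's own helper may differ in details):

def levenshtein(a, b):
    # Classic dynamic-programming edit distance over two sequences.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                # deletion
                           cur[j - 1] + 1,             # insertion
                           prev[j - 1] + (ca != cb)))  # substitution
        prev = cur
    return prev[-1]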
Example #22
import os
from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
from util.text import Alphabet, UTF8Alphabet
alphabet = UTF8Alphabet()
scorer = Scorer(0.75, 1.85,
                "/content/DeepSpeech-Indo/data/lm/kenlm_tamil.scorer",
                alphabet)
print(scorer)
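ctc_beam_search_decoder_batch is imported above but never called; a sketch of a call with dummy inputs (the shapes are assumptions — the last dimension must equal the alphabet size plus one for the CTC blank):

import numpy as np

NUM_LABELS = 256  # assumption for UTF-8 mode: byte alphabet plus CTC blank
probs = np.random.rand(1, 100, NUM_LABELS).astype(np.float32)
probs /= probs.sum(axis=2, keepdims=True)  # each frame must be a distribution
seq_lengths = np.array([100], dtype=np.int32)

beams = ctc_beam_search_decoder_batch(probs, seq_lengths, alphabet, 500,
                                      num_processes=1, scorer=scorer)
print(beams[0][0][1])  # best transcript of the first batch item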
Example #23
def main():
    alphabet_txt = os.path.join(LANG.model_dir, 'alphabet.txt')
    raw_txt_gz = os.path.join(LANG.model_dir, 'raw.txt.gz')
    unprepared_txt = os.path.join(LANG.model_dir, 'unprepared.txt')
    prepared_txt = os.path.join(LANG.model_dir, 'prepared.txt')
    vocabulary_txt = os.path.join(LANG.model_dir, 'vocabulary.txt')
    unfiltered_arpa = os.path.join(LANG.model_dir, 'unfiltered.arpa')
    filtered_arpa = os.path.join(LANG.model_dir, 'filtered.arpa')
    lm_binary = os.path.join(LANG.model_dir, 'lm.binary')
    kenlm_scorer = os.path.join(LANG.model_dir, 'kenlm.scorer')
    temp_prefix = os.path.join(LANG.model_dir, 'tmp')

    section('Writing alphabet file', empty_lines_before=1)
    with open(alphabet_txt, 'w', encoding='utf-8') as alphabet_file:
        alphabet_file.write('\n'.join(LANG.alphabet) + '\n')

    redo = ARGS.force_download

    section('Downloading text data')
    redo = maybe_download(LANG.text_url, raw_txt_gz, force=redo)

    section('Unzipping text data')
    redo = maybe_ungzip(raw_txt_gz, unprepared_txt, force=redo)

    redo = redo or ARGS.force_prepare

    section('Preparing text and building vocabulary')
    if redo or not os.path.isfile(prepared_txt) or not os.path.isfile(vocabulary_txt):
        redo = True
        announce('Preparing {} shards of "{}"...'.format(ARGS.workers, unprepared_txt))
        counters = Queue(ARGS.workers)
        source_bytes = os.path.getsize(unprepared_txt)
        aggregator_process = Process(target=aggregate_counters, args=(vocabulary_txt, source_bytes, counters))
        aggregator_process.start()
        counter_processes = list(map(lambda index: Process(target=count_words, args=(index, counters)),
                                     range(ARGS.workers)))
        try:
            for p in counter_processes:
                p.start()
            for p in counter_processes:
                p.join()
            counters.put(STOP_TOKEN)
            aggregator_process.join()
            print('')
            partials = list(map(lambda i: get_partial_path(i), range(ARGS.workers)))
            join_files(partials, prepared_txt)
            for partial in partials:
                os.unlink(partial)
        except KeyboardInterrupt:
            aggregator_process.terminate()
            for p in counter_processes:
                p.terminate()
            raise
    else:
        announce('Files "{}" and\n\t"{}" exist - not preparing'.format(prepared_txt, vocabulary_txt))

    redo = redo or ARGS.force_generate

    section('Building unfiltered language model')
    if redo or not os.path.isfile(unfiltered_arpa):
        redo = True
        lmplz_args = [
            KENLM_BIN + '/lmplz',
            '--temp_prefix', temp_prefix,
            '--memory', '80%',
            '--discount_fallback',
            '--limit_vocab_file', vocabulary_txt,
            '--text', prepared_txt,
            '--arpa', unfiltered_arpa,
            '--skip', 'symbols',
            '--order', str(LANG.order)
        ]
        if len(LANG.prune) > 0:
            lmplz_args.append('--prune')
            lmplz_args.extend(list(map(str, LANG.prune)))
        subprocess.check_call(lmplz_args)
    else:
        announce('File "{}" exists - not generating'.format(unfiltered_arpa))

    section('Filtering language model')
    if redo or not os.path.isfile(filtered_arpa):
        redo = True
        with open(vocabulary_txt, 'rb') as vocabulary_file:
            vocabulary_content = vocabulary_file.read()
        subprocess.run([
            KENLM_BIN + '/filter',
            'single',
            'model:' + unfiltered_arpa,
            filtered_arpa
        ], input=vocabulary_content, check=True)
    else:
        announce('File "{}" exists - not filtering'.format(filtered_arpa))

    section('Generating binary representation')
    if redo or not os.path.isfile(lm_binary):
        redo = True
        subprocess.check_call([
            KENLM_BIN + '/build_binary',
            '-a', '255',
            '-q', '8',
            '-v',
            'trie',
            filtered_arpa,
            lm_binary
        ])
    else:
        announce('File "{}" exists - not generating'.format(lm_binary))

    section('Building scorer')
    if redo or not os.path.isfile(kenlm_scorer):
        redo = True
        words = set()
        vocab_looks_char_based = True
        with open(vocabulary_txt) as vocabulary_file:
            for line in vocabulary_file:
                for word in line.split():
                    words.add(word.encode())
                    if len(word) > 1:
                        vocab_looks_char_based = False
        announce("{} unique words read from vocabulary file.".format(len(words)))
        announce(
            "{} like a character based model.".format(
                "Looks" if vocab_looks_char_based else "Doesn't look"
            )
        )
        if ARGS.alphabet_mode == 'auto':
            use_utf8 = vocab_looks_char_based
        elif ARGS.alphabet_mode == 'utf8':
            use_utf8 = True
        else:
            use_utf8 = False
        serialized_alphabet = get_serialized_utf8_alphabet() if use_utf8 else LANG.get_serialized_alphabet()
        from ds_ctcdecoder import Scorer, Alphabet
        alphabet = Alphabet()
        err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
        if err != 0:
            announce('Error loading alphabet: {}'.format(err))
            sys.exit(1)
        scorer = Scorer()
        scorer.set_alphabet(alphabet)
        scorer.set_utf8_mode(use_utf8)
        scorer.reset_params(LANG.alpha, LANG.beta)
        scorer.load_lm(lm_binary)
        scorer.fill_dictionary(list(words))
        shutil.copy(lm_binary, kenlm_scorer)
        scorer.save_dictionary(kenlm_scorer, True)  # append, not overwrite
        announce('Package created in {}'.format(kenlm_scorer))
        announce('Testing package...')
        scorer = Scorer()
        scorer.load_lm(kenlm_scorer)
    else:
        announce('File "{}" exists - not building'.format(kenlm_scorer))
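
Once the pipeline has run, the packaged kenlm.scorer can be loaded back through the one-shot Scorer constructor used elsewhere on this page. A minimal sanity check, with the path and the alpha/beta values assumed rather than read from LANG:

from ds_ctcdecoder import Scorer
from util.text import UTF8Alphabet

alphabet = UTF8Alphabet()  # assumes the package was built in UTF-8 mode
scorer = Scorer(0.75, 1.85, '/path/to/model_dir/kenlm.scorer', alphabet)
print(scorer.is_utf8_mode())  # True for a character-based package
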
Example #24
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    alphabet)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*N_CONTEXT+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widgets=[progressbar.AdaptiveETA()])

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            # Bind the batch result to loss_ so the loss tensor is not clobbered
            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widgets=[progressbar.AdaptiveETA()])

        # Get number of accessible CPU cores for this process
        num_processes = len(os.sched_getaffinity(0))

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)
            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
            predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        # Persist the decoded samples (transcripts plus metrics) as JSON
        with open(FLAGS.test_output_file, 'w') as fout:
            json.dump(samples, fout, default=float)
Example #25
def evaluate(test_data, inference_graph, alphabet):
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    Config.alphabet)

    def create_windows(features):
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*Config.n_context+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = inference_graph

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        if FLAGS.checkpoint_dir is not None:
            checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not checkpoint:
                log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
                exit(1)

            checkpoint_path = checkpoint.model_checkpoint_path
            saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widgets=[progressbar.AdaptiveETA()])

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

    ground_truths = []
    predictions = []

    print('Decoding predictions...')
    bar = progressbar.ProgressBar(max_value=batch_count,
                                  widgets=[progressbar.AdaptiveETA()])

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Second pass, decode logits and compute WER and edit distance metrics
    for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
        seq_lengths = batch['features_len'].values.astype(np.int32)
        decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                num_processes=num_processes, scorer=scorer)

        ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
        predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, CER: %f, loss: %f' %
          (wer, mean_edit_distance, mean_loss))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, CER: %f, loss: %f' %
              (sample.wer, sample.distance, sample.loss))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    return samples
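
Both of the preceding examples build their context windows with numpy.lib.stride_tricks.as_strided, which yields overlapping views without copying data. A tiny standalone demonstration of the same trick, with toy sizes and our own variable names:

import numpy as np

# Toy feature matrix: 6 frames of 2 features each
features = np.arange(12, dtype=np.float64).reshape(6, 2)
n_context = 1                    # one frame of past and one of future context
window_size = 2 * n_context + 1  # past + present + future = 3 frames
num_strides = len(features) - 2 * n_context

# Each output row is a view onto 3 consecutive input frames; nothing is
# copied, so the result must stay read-only
windows = np.lib.stride_tricks.as_strided(
    features,
    (num_strides, window_size, features.shape[1]),
    (features.strides[0], features.strides[0], features.strides[1]),
    writeable=False)

print(windows.shape)  # (4, 3, 2)
print(windows[0])     # frames 0..2, the context window centered on frame 1
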
Example #26
# Write the character set, escaping '#' so it is not read as a comment
# marker in the alphabet file
with open("chars_small.txt", "w", encoding='utf-8') as text_file:
    text_file.write('\n'.join([x if x != '#' else '\\#' for x in list(classes)]))


def softmax(matrix):
    time_steps, _ = matrix.shape
    result = np.zeros(matrix.shape)
    for t in range(time_steps):
        e = np.exp(matrix[t, :])
        result[t, :] = e / np.sum(e)
    return result


def load_rnn_output(fn):
    return np.genfromtxt(fn, delimiter=';')[:, :-1]


alphabet = Alphabet(os.path.abspath("chars_small.txt"))
crnn_output = softmax(load_rnn_output('./rnn_output.csv'))
res = ctc_beam_search_decoder(probs_seq=crnn_output,
                              alphabet=alphabet,
                              beam_size=25,
                              scorer=Scorer(alphabet=alphabet,
                                            scorer_path='iam.scorer',
                                            alpha=0.75,
                                            beta=1.85))
# predicted: the fake friend of the family has to
# actual: the fake friend of the family, like the
print(res[0][1])
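
The row-wise softmax above exponentiates the raw matrix directly, which can overflow for large activations. A numerically stable, vectorized equivalent (same result on well-behaved inputs; the name is ours):

import numpy as np

def softmax_stable(matrix):
    # Subtracting the per-row max before exponentiating avoids overflow
    # and leaves the softmax output mathematically unchanged
    shifted = matrix - matrix.max(axis=1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)
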
Example #27
from ds_ctcdecoder import ctc_beam_search_decoder, Scorer

lm_alpha = 0.75
lm_beta = 1.85
lm_binary_path = 'lm/lm.binary'
lm_trie_path = 'lm/trie'
beam_width = 32
cutoff_prob = 1.0
cutoff_top_n = 300
scorer = None

from text import Alphabet

alphabet = Alphabet('alphabet.txt')

scorer = Scorer(lm_alpha, lm_beta, lm_binary_path, lm_trie_path, alphabet)


def decodex(txt, mapping):
    # Map each decoded symbol back through the character table
    return ''.join(mapping[ch] for ch in txt)


mapping = {}

with open('arabic_mapping.txt', 'r', encoding='utf-8') as inf:
    for line in inf:
        key, val = line.split('\t')
        mapping[key] = val.strip()
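
With the table loaded, decodex maps a decoded transcript back to Arabic script. A toy call with made-up entries (the real ones come from arabic_mapping.txt):

demo_mapping = {'a': '\u0627', 'b': '\u0628'}  # hypothetical entries
print(decodex('ab', demo_mapping))  # prints the two mapped Arabic letters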