Example #1
def process_single_file(row):
    # row = index, Series
    _, file = row
    features = audiofile_to_input_vector(file.wav_filename, N_FEATURES, N_CONTEXT)
    transcript = text_to_char_array(file.transcript, alphabet)

    return features, len(features), transcript, len(transcript)
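
A minimal usage sketch for the row-per-example pattern above (the CSV path and column names are assumptions; iterrows() yields the (index, Series) pairs the function unpacks):

    import pandas as pd

    df = pd.read_csv('train.csv')  # hypothetical CSV with wav_filename, transcript columns
    examples = [process_single_file(row) for row in df.iterrows()]
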
Example #2
    def find_transcripts(self, wav_file_path, visual_feature_json_path=None):
        '''
        Args:
            wav_file_path:             The path of the input wav file.
            visual_feature_json_path:  Visual features for video-based speech
                                       recognition; required when the exported
                                       model is of the AVSR type.
        '''
        if self.use_visual_features:
            assert visual_feature_json_path is not None

            source = np.array([(get_audio_visual_feature_vector(
                wav_file_path, visual_feature_json_path,
                NUM_MFCC_COEFF + NUM_VISUAL, N_CONTEXT))])
        else:
            source = np.array([
                (audiofile_to_input_vector(wav_file_path, NUM_MFCC_COEFF,
                                           N_CONTEXT))
            ])
        source_len = np.array([(len(source[-1]))])

        feed_dict = {self.input: source, self.input_len: source_len}

        decoded = self.session.run(self.output, feed_dict)[0][0]
        # session.run() returns an array of shape (1, 1, X), where X is the
        # number of characters in the transcript.

        transcript = ndarray_to_text(decoded)
        if self.use_spell_check:
            transcript = correction(transcript)

        return transcript
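
A hypothetical call, assuming a wrapper object wired up as above (the constructor is not shown in this snippet):

    # transcript = model.find_transcripts('sample.wav')
    # For AVSR-type exports, also pass the visual feature JSON:
    # transcript = model.find_transcripts('sample.wav', 'sample_visual.json')
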
Example #3
 def _populate_batch_queue(self, session, coord):
     '''
     Queue thread routine.
     '''
     file_count = len(self._data_set.files)
     index = -1
     while not coord.should_stop():
         index = self._data_set.next_index(index) % file_count
         wav_file, transcript = self._data_set.files[index]
         source = audiofile_to_input_vector(wav_file,
                                            self._model_feeder.numcep,
                                            self._model_feeder.numcontext)
         source_len = len(source)
         target = text_to_char_array(
             transcript, self._alphabet)  # TODO: change this part to return diphone-style targets
         target_len = len(target)
         if source_len < target_len:
             raise ValueError(
                 'Error: Audio file {} is too short for transcription.'.
                 format(wav_file))
         try:
             session.run(self._enqueue_op,
                         feed_dict={
                             self._model_feeder.ph_x: source,
                             self._model_feeder.ph_x_length: source_len,
                             self._model_feeder.ph_y: target,
                             self._model_feeder.ph_y_length: target_len
                         })
         except tf.errors.CancelledError:
             return
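
These _populate_batch_queue threads all follow the same pattern: compute (source, source_len, target, target_len) and feed them through an enqueue op into a padding queue. A minimal sketch of the queue wiring such a feeder assumes (placeholder names mirror the snippet; the shapes and capacity are assumptions):

    import tensorflow as tf

    ph_x = tf.placeholder(tf.float32, [None, None])  # time x features
    ph_x_length = tf.placeholder(tf.int32, [])
    ph_y = tf.placeholder(tf.int32, [None])
    ph_y_length = tf.placeholder(tf.int32, [])
    queue = tf.PaddingFIFOQueue(capacity=64,
                                dtypes=[tf.float32, tf.int32, tf.int32, tf.int32],
                                shapes=[[None, None], [], [None], []])
    enqueue_op = queue.enqueue([ph_x, ph_x_length, ph_y, ph_y_length])
    batch = queue.dequeue_many(32)  # dequeue_many pads each batch to its longest element
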
Example #4
 def _populate_batch_queue(self, session, coord):
     '''
     Queue thread routine.
     '''
     file_count = len(self._data_set.files)
     index = -1
     while not coord.should_stop():
         index = self._data_set.next_index(index) % file_count
         wav_file, transcript = self._data_set.files[index]
         source = audiofile_to_input_vector(wav_file,
                                            self._model_feeder.numcep,
                                            self._model_feeder.numcontext)
         source_len = len(source)
         target = text_to_char_array(transcript)
         target_len = len(target)
         try:
             session.run(self._enqueue_op,
                         feed_dict={
                             self._model_feeder.ph_x: source,
                             self._model_feeder.ph_x_length: source_len,
                             self._model_feeder.ph_y: target,
                             self._model_feeder.ph_y_length: target_len
                         })
         except tf.errors.CancelledError:
             return
Example #5
 def _populate_batch_queue(self):
     with self._graph.as_default():
         while True:
             n_steps = 0
             sources = []
             targets = []
             for index, (txt_file,
                         wav_file) in enumerate(self._files_circular_list):
                 if index >= self._batch_size:
                     break
                 next_source = audiofile_to_input_vector(
                     wav_file, self._numcep, self._numcontext)
                 if n_steps < next_source.shape[0]:
                     n_steps = next_source.shape[0]
                 sources.append(next_source)
                 with open(txt_file) as open_txt_file:
                     targets.append(open_txt_file.read())
             target = texts_to_sparse_tensor(targets)
             for index, next_source in enumerate(sources):
                 npad = ((0, (n_steps - next_source.shape[0])), (0, 0))
                 sources[index] = np.pad(next_source,
                                         pad_width=npad,
                                         mode='constant')
             source = np.array(sources)
             self._batch_queue.put((source, target))
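
The np.pad call above right-pads each utterance's feature matrix with zero rows so every source in the batch has n_steps time steps. A tiny standalone check of that pad_width pattern:

    import numpy as np

    x = np.ones((3, 2))                                 # 3 time steps, 2 features
    padded = np.pad(x, pad_width=((0, 5 - 3), (0, 0)),  # pad rows at the end only
                    mode='constant')
    assert padded.shape == (5, 2)
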
Example #6
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = create_inference_graph(batch_size=1,
                                                         n_steps=-1)

        # REVIEW josephz: Hack: print all layers here.
        for i, l in enumerate(layers):
            print("layer '{}': '{}'".format(i, l))

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        #       over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        session.run(outputs['initialize_state'])

        features = audiofile_to_input_vector(input_file_path, Config.n_input,
                                             Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        logits = session.run(outputs['outputs'],
                             feed_dict={
                                 inputs['input']: [features],
                                 inputs['input_lengths']: [num_strides],
                             })

        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
        decoded = ctc_beam_search_decoder(logits,
                                          Config.alphabet,
                                          FLAGS.beam_width,
                                          scorer=scorer)
        # Print highest probability result
        print(decoded[0][1])
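
The as_strided call builds overlapping context windows of size numcontext + 1 + numcontext without copying the feature array. A slower but equivalent copying formulation, for reference:

    import numpy as np

    def make_windows(features, n_context):
        # features: (num_frames, n_input); returns (num_strides, window, n_input)
        window_size = 2 * n_context + 1
        num_strides = len(features) - 2 * n_context
        return np.stack([features[i:i + window_size] for i in range(num_strides)])
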
Example #7
def process_single_file(row, numcep, numcontext, alphabet):
    # row = index, Series
    _, file = row
    features = audiofile_to_input_vector(file.wav_filename, numcep, numcontext)
    features_len = len(features) - 2*numcontext
    transcript = text_to_char_array(file.transcript, alphabet)

    if features_len < len(transcript):
        raise ValueError('Error: Audio file {} is too short for transcription.'.format(file.wav_filename))

    return features, features_len, transcript, len(transcript)
Example #9
def main(_):
    if not FLAGS.server:
        print('please specify server host:port')
        return
    if not FLAGS.file:
        print('please specify an audio file')
        return

    audio_waves = audiofile_to_input_vector(FLAGS.file, FLAGS.n_input,
                                            FLAGS.n_context)
    audio = np.array([audio_waves])
    do_inference(FLAGS.server, audio)
Example #10
    def _compute_source_target(self):
        txt_file = self._txt_files[0]
        wav_file = path.splitext(txt_file)[0] + ".wav"

        audio_waves = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)

        with open(txt_file) as open_txt_file:
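            # Drop the first two whitespace-separated tokens and remove periods.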
            original = ' '.join(open_txt_file.read().strip().lower().split(' ')[2:]).replace('.', '')

        target = text_to_char_array(original)

        return audio_waves, len(audio_waves), target, len(target)
Example #11
 def _populate_batch_queue(self, session):
     for txt_file, wav_file in self._files_circular_list:
         source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
         source_len = len(source)
         with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
             target = unicodedata.normalize("NFKD", open_txt_file.read()).encode("ascii", "ignore")
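             # Note: on Python 3, encode() returns bytes here; Example #18 below adds the decode fix.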
             target = text_to_char_array(target)
         target_len = len(target)
         session.run(self._enqueue_op, feed_dict={
             self._x: source,
             self._x_length: source_len,
             self._y: target,
             self._y_length: target_len})
Example #12
 def _populate_batch_queue(self, session):
     for wav_file, transcript in self._indices():
         source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
         source_len = len(source)
         target = text_to_char_array(transcript)
         target_len = len(target)
         try:
             session.run(self._enqueue_op, feed_dict={
                 self._x: source,
                 self._x_length: source_len,
                 self._y: target,
                 self._y_length: target_len})
         except tf.errors.CancelledError:
             return
Example #13
def pipeline(data):
    data = data.head(2)  # overfit on just 2 files
    inputs_encoder = []
    inputs_decoder = []
    outputs_decoder = []
    decoder_length = []
    sequence_length = []
    for ind, row in data.iterrows():
        inputs_encoder.append(
            audiofile_to_input_vector(row['wav_filename'], 26, 0))
        inputs_decoder.append(
            np.append([1], text_to_char_array(row['transcript'])))
        outputs_decoder.append(
            np.append(text_to_char_array(row['transcript']), [1]))
        sequence_length.append(
            audiofile_to_input_vector(row['wav_filename'], 26, 0).shape[0])
        decoder_length.append(len(row['transcript']) + 1)
    xt_decoder_input, xlen_decoder_input = helpers2.batch(inputs_decoder)
    xt_encoder, xlen_encoder = helpers.batch(inputs_encoder)
    xt_decoder_output, xlen_decoder_output = helpers2.batch(outputs_decoder)
    sequence_length = np.asarray(sequence_length, dtype=np.int32)
    decoder_length = np.asarray(decoder_length, dtype=np.int32)
    return ({
        "A": xt_encoder,
        "B": xt_decoder_input,
        "C": sequence_length,
        "D": decoder_length
    }, xt_decoder_output)
Example #14
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        #       over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        session.run(outputs['initialize_state'])

        features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*Config.n_context+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        logits = session.run(outputs['outputs'], feed_dict = {
            inputs['input']: [features],
            inputs['input_lengths']: [num_strides],
        })

        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                        Config.alphabet)
        decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width, scorer=scorer)
        # Print highest probability result
        print(decoded[0][1])
Example #15
def _get_files_mfcc(wav_filenames):
    # print('Processing MFCC...')
    mfccs = []
    lens = []
    for audio_fname in wav_filenames:
        this_mfcc = audiofile_to_input_vector(audio_fname, n_input, n_context)
        if len(this_mfcc) != feature_len:
            needlen = feature_len - len(this_mfcc)
            a = ([[0 for x in range(feature_dim)] for y in range(needlen)])
            this_mfcc = np.concatenate((this_mfcc, np.array(a)))
        # print(this_mfcc.shape)
        this_mfcc = np.reshape(this_mfcc, (feature_len, n_input, 1))
        mfccs.append(this_mfcc)
        lens.append(len(this_mfcc))
    a_mfccs = np.array(mfccs)  # shape: (batch, feature_len, n_input, 1)
    a_lens = np.array(lens)  # shape: (batch,), every value == feature_len
    # print('MFCCs shape', a_mfccs.shape, a_lens.shape)
    return a_mfccs, a_lens
Example #16
def do_inference(hostport, audio_file, server):
    audio_waves = audiofile_to_input_vector(audio_file, FLAGS.n_input,
                                            FLAGS.n_context)
    audio = np.array([audio_waves])

    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'deepspeech'
    request.inputs['input'].CopyFrom(tf.contrib.util.make_tensor_proto(audio))

    event = threading.Event()
    result_future = stub.Predict.future(request, 5.0)  # 5 seconds
    result_future.add_done_callback(_create_rpc_callback(event, server))
    if not event.is_set():
        event.wait()
Example #17
def make_checkpoint(model_path, audio_path, save_path):
    graph_def = GraphDef()
    with open(model_path, 'rb') as model_file:
        graph_def.ParseFromString(model_file.read())

    with tf.Graph().as_default() as graph:
        new_input = tf.placeholder(tf.float32, [None, None, None],
                                   name='new_input')
        # Load the saved .pb into the current graph to let us grab
        # access to the weights.
        logits, = tf.import_graph_def(
            graph_def,
            input_map={'input_node:0': new_input},
            return_elements=['logits:0'],
            name='newname',
            op_dict=None,
            producer_op_list=None
        )

        # Now let's dump these weights into a new copy of the network.
        with tf.Session(graph=graph) as sess:
            # Sample sentence, to make sure we've done it right
            mfcc = audiofile_to_input_vector(audio_path, 26, 9)

            # Okay, so this is ugly again.
            # We just want it to not crash.
            tf.app.flags.FLAGS.alphabet_config_path = \
                os.path.join(os.path.dirname(__file__), 'DeepSpeech/data/alphabet.txt')
            DeepSpeech.initialize_globals()
            logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0]*10)

            # Here's where all the work happens. Copy the variables
            # over from the .pb to the session object.
            for var in tf.global_variables():
                sess.run(var.assign(sess.run('newname/'+var.name)))

            # Test to make sure we did it right.
            res = (sess.run(logits, {new_input: [mfcc],
                                     'newname/input_lengths:0': [len(mfcc)]}).flatten())
            res2 = (sess.run(logits2, {new_input: [mfcc]})).flatten()
            print('This value should be small', np.sum(np.abs(res - res2)))

            # And finally save the constructed session.
            saver = tf.train.Saver()
            saver.save(sess, save_path)
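
A hypothetical invocation (all paths are placeholders, not from the source):

    # make_checkpoint('models/output_graph.pb', 'LDC93S1.wav', 'checkpoints/converted')
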
Example #18
 def _populate_batch_queue(self, session):
     for txt_file, wav_file in self._indices():
         source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
         source_len = len(source)
         with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
             # We need to do the encode-decode dance here because encode
             # returns a bytes() object on Python 3, and text_to_char_array
             # expects a string.
             target = unicodedata.normalize("NFKD", open_txt_file.read())   \
                                 .encode("ascii", "ignore")                 \
                                 .decode("ascii", "ignore")
             target = text_to_char_array(target)
         target_len = len(target)
         try:
             session.run(self._enqueue_op, feed_dict={
                 self._x: source,
                 self._x_length: source_len,
                 self._y: target,
                 self._y_length: target_len})
         except tf.errors.CancelledError:
             return
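
The encode/decode dance above ASCII-folds the NFKD-normalized text. A minimal standalone check:

    import unicodedata

    folded = unicodedata.normalize("NFKD", "café").encode("ascii", "ignore").decode("ascii")
    assert folded == "cafe"
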
Example #19
 def _populate_batch_queue(self, session):
     for txt_file, wav_file in self._files_circular_list:
         if self._coord.should_stop():
             return
         source = audiofile_to_input_vector(wav_file, self._numcep,
                                            self._numcontext)
         source_len = len(source)
         with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
             target = unicodedata.normalize("NFKD", open_txt_file.read())
             target = text_to_char_array(target)
         target_len = len(target)
         try:
             session.run(self._enqueue_op,
                         feed_dict={
                             self._x: source,
                             self._x_length: source_len,
                             self._y: target,
                             self._y_length: target_len
                         })
         except tf.errors.CancelledError:
             return
Example #20
 def _populate_batch_queue(self, session, coord):
     '''
     Queue thread routine.
     '''
     file_count = len(self._data_set.files)
     index = -1
     while not coord.should_stop():
         index = self._data_set.next_index(index) % file_count
         wav_file, transcript = self._data_set.files[index]
         source = audiofile_to_input_vector(wav_file, self._model_feeder.numcep, self._model_feeder.numcontext)
         source_len = len(source)
         target = text_to_char_array(transcript, self._alphabet)
         target_len = len(target)
         if source_len < target_len:
             raise ValueError('Error: Audio file {} is too short for transcription.'.format(wav_file))
         try:
             session.run(self._enqueue_op, feed_dict={ self._model_feeder.ph_x: source,
                                                       self._model_feeder.ph_x_length: source_len,
                                                       self._model_feeder.ph_y: target,
                                                       self._model_feeder.ph_y_length: target_len })
         except tf.errors.CancelledError:
             return
Example #21
with tf.Graph().as_default() as graph:
    new_input = tf.placeholder(tf.float32, [None, None, None],
                               name="new_input")
    # Load the saved .pb into the current graph to let us grab
    # access to the weights.
    logits, = tf.import_graph_def(graph_def,
                                  input_map={"input_node:0": new_input},
                                  return_elements=['logits:0'],
                                  name="newname",
                                  op_dict=None,
                                  producer_op_list=None)

    # Now let's dump these weights into a new copy of the network.
    with tf.Session(graph=graph) as sess:
        # Sample sentence, to make sure we've done it right
        mfcc = audiofile_to_input_vector(wav_file, 26, 9)

        # Okay, so this is ugly again.
        # We just want it to not crash.
        tf.app.flags.FLAGS.alphabet_config_path = alphabet_file
        DeepSpeech.initialize_globals()
        logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

        # Here's where all the work happens. Copy the variables
        # over from the .pb to the session object.
        for var in tf.global_variables():
            sess.run(var.assign(sess.run('newname/' + var.name)))

        # Test to make sure we did it right.
        res = (sess.run(logits, {
            new_input: [mfcc],
Example #22
        # Open tf.Session.
        with tf.Session(graph=graph) as sess:

            # Extract graph node names.
            tf.import_graph_def(graph_def, name='')
            graph_nodes = [n for n in graph_def.node]
            names = []
            for i, t in enumerate(graph_nodes):
                names.append(t.name)
                print("graph_node: '{:03d}' -- '{}'".format(i, t.name))

            # Prepare audio input data.
            input_file_path = '/home/josephz/GoogleDrive/University/UW/2018-19/CSE481I/singing-style-transfer' \
                              '/src/data/aligned/one_last_time/one_last_time_original_30s.wav'
            features = audiofile_to_input_vector(input_file_path,
                                                 Config.n_input,
                                                 Config.n_context)
            num_strides = len(features) - (Config.n_context * 2)
            # Create a view into the array with overlapping strides of size
            # numcontext (past) + 1 (present) + numcontext (future)
            window_size = 2 * Config.n_context + 1
            features = np.lib.stride_tricks.as_strided(
                features, (num_strides, window_size, Config.n_input),
                (features.strides[0], features.strides[0],
                 features.strides[1]),
                writeable=False)

            # Prepare graph nodes for inference.
            # Prepare input nodes.
            # initialize_state = graph.get_tensor_by_name('initialize_state:0')
            input_node = graph.get_tensor_by_name('input_node:0')
Example #23
                               name="new_input")
    # Load the saved .pb into the current graph to let us grab
    # access to the weights.
    logits, = tf.import_graph_def(
        graph_def,
        input_map={"input_node:0": new_input},
        return_elements=['logits:0'],
        name="newname",
        op_dict=None,
        producer_op_list=None
    )

    # Now let's dump these weights into a new copy of the network.
    with tf.Session(graph=graph) as sess:
        # Sample sentence, to make sure we've done it right
        mfcc = audiofile_to_input_vector("sample.wav", 26, 9)

        # Okay, so this is ugly again.
        # We just want it to not crash.
        tf.app.flags.FLAGS.alphabet_config_path = "DeepSpeech/data/alphabet.txt"
        # Make it stop complaining
        tf.app.flags.FLAGS.decoder_library_path = "."
        DeepSpeech.initialize_globals()
        logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0]*10)

        # Here's where all the work happens. Copy the variables
        # over from the .pb to the session object.
        for var in tf.global_variables():
            sess.run(var.assign(sess.run('newname/'+var.name)))

        # Test to make sure we did it right.
Example #24
def _maybe_convert_set(source_dir, target_dir, mode, datasets):
    rows = []

    remove_alphabets = set('۱١٢۳٣٤٥٦٧۷۸٨٩۹٠۰0123456789٪éàçèáâïóöúﺠپچﭽ')

    for dataset in datasets:
        for subdir, dirs, files in os.walk(source_dir + '/' + dataset + '/' + mode):
            for file in files:
                if file.endswith('.txt'):
                    filepath = path.abspath(subdir + '/' + file).split('.')[:-1][0]
                    if path.exists(filepath + '.txt') and path.exists(filepath + '.wav'):
                        with open(filepath + '.txt', 'r') as readfile:
                            for transcript in readfile.readlines():
                                features_len = audiofile_to_input_vector(filepath + '.wav', numcep=26, numcontext=9, compute_len=True, model='deepspeech_2')
                                if features_len > 100 and len(transcript) > 2:
                                    if 'english' in language:
                                        # Per-dataset minimum transcript length; the default is 5.
                                        min_lengths = {'tedlium': 7, 'tidigits': 0, 'voxforge': 2,
                                                       'vctk': 2, 'common_voice': 8}
                                        min_len = next((v for k, v in min_lengths.items() if k in dataset), 5)
                                        if (min_len <= len(transcript) <= transcript_len and
                                                features_len > len(transcript) * transcript_features_ratio):
                                            rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                    else:
                                        transcript = transcript.replace('آ', 'آ')
                                        transcript = transcript.replace('ﻻ', 'لا')
                                        transcript = transcript.replace('ﻵ', 'لآ')
                                        transcript = transcript.replace('ﻷ', 'لأ')
                                        transcript = transcript.replace('ﻹ', 'لإ')
                                        transcript = transcript.replace('ﺇ', 'إ')
                                        transcript = transcript.replace('ک', 'ك')
                                        transcript = transcript.replace('ی', 'ى')
                                        transcript = transcript.replace('‎‌', ' ')
                                        transcript = transcript.replace('‎', ' ')

                                        # remove diacritics
                                        transcript = transcript.replace('ً', '')
                                        transcript = transcript.replace('ٍ', '')
                                        transcript = transcript.replace('ٌ', '')
                                        transcript = transcript.replace('ْ', '')

                                        # remove short-vowel diacritics, then normalize hamza forms
                                        transcript = transcript.replace('َ', '')
                                        transcript = transcript.replace('ِ', '')
                                        transcript = transcript.replace('ُ', '')
                                        transcript = transcript.replace('ّ', '')
                                        transcript = transcript.replace('ؤ', 'ؤ')
                                        transcript = transcript.replace('ئ', 'ىٔ')
                                        transcript = transcript.replace('أ', 'أ')

                                        if not any(c in remove_alphabets for c in transcript):
                                            # Identical length filter for 'ksu' and all other datasets here.
                                            if (2 <= len(transcript) <= transcript_len and
                                                    features_len > len(transcript) * transcript_features_ratio):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))

    rows.sort(key=lambda item: int(item[1]))
    with open(target_dir + '/' + mode + '.csv', 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()

        for filename, file_size, transcript in rows:
            include = True

            transcript = transcript.lower()

            transcript = apos_re.sub('', transcript)  # remove quotes

            transcript = transcript.replace('-', ' ')

            # transcript = ''.join(['-'.join(c for c in s if c not in punctuationList) for s in transcript])
            transcript = transcript.replace('  ', ' ').replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace(
                '_', ' ').lower().strip()

            for c in transcript:
                if (c in punctuationList):
                    include = False
                    break

            if (include):
                writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})
Example #25
def do_single_file_inference(checkpoint_dir, input_file_path, layer_wanted,
                             softmax_wanted, save_filename, save_folder,
                             stride_size_s, win_size_s, fea_format,
                             csv_format):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(
            batch_size=1,
            n_steps=-1,
            layer_wanted=layer_wanted,
            softmax_applied=softmax_wanted)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(checkpoint_dir))
            exit(1)
        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        session.run(outputs['initialize_state'])

        # transformation of the audio file
        features = audiofile_to_input_vector(input_file_path, Config.n_input,
                                             Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        # This is not the logits but the output of the requested layer
        logits = session.run(outputs['outputs'],
                             feed_dict={
                                 inputs['input']: [features],
                                 inputs['input_lengths']: [num_strides],
                             })

        logits = np.squeeze(logits)
        if fea_format:
            write_fea_file(logits,
                           save_folder,
                           save_filename,
                           stride_size_s=stride_size_s,
                           win_len_s=win_size_s)
        if csv_format:
            np.savetxt(save_folder + '/' + save_filename + '.csv',
                       logits,
                       delimiter=',')
Example #26
def main(_):

    start = stopwatch()
    initialize_globals()

    if FLAGS.one_shot_infer:
        #load the frozen graph as in train(...) or as in https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc
        with tf.gfile.FastGFile("../../models/output_graph.pb", 'rb') as fin:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(fin.read())
        with tf.Graph().as_default() as pretrained_model:
            tf.import_graph_def(graph_def, name="pretrained_")
        """
        for op in pretrained_model.get_operations():
            print(op.name)
        """

        #        print("------------***-------------")

        # https://stackoverflow.com/questions/36883949/in-tensorflow-get-the-names-of-all-the-tensors-in-a-graph?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
        lstTensors = [op.values() for op in pretrained_model.get_operations()]
        input_node = lstTensors[0]
        input_lengths = lstTensors[1]
        output_node = lstTensors[-1]
        """
        print("input node name: ")
        print(input_node[0].name)
        print("input node shape: ")
        print(input_node[0].shape)#V IMP: shape of input node is [x,y,z] where x = batch_size. For one shot infer, batch_size = 1
        print("input lengths name: ")
        print(input_lengths[0].name)
        print("input lengths shape: ")
        print(input_lengths[0].shape) #V IMP: shape of input_lengths node is [x,y] where x = batch_size. For one shot infer, batch_size = 1
        print("output node name: ")
        print(output_node[0].name)
        print("output node shape: ")
        print(output_node[0].shape)
        """

        #        do_single_file_inference(FLAGS.one_shot_infer)
        #        print("n_input = "+repr(n_input))
        #        print("n_context = "+repr(n_context))
        mfcc = audiofile_to_input_vector(FLAGS.one_shot_infer, n_input,
                                         n_context)
        #        print(mfcc.shape)

        #        output_node = pretrained_model.get_tensor_by_name(pretrained_model.get_operations()[-1].name)

        batch_size = 1
        with tf.Session(graph=pretrained_model) as sess:
            output = sess.run(
                output_node,
                feed_dict={
                    input_node:
                    [mfcc.reshape((batch_size, mfcc.shape[0], mfcc.shape[1]))],
                    input_lengths:
                    [np.array(len(mfcc)).reshape((batch_size, ))]
                })
            #            print(output)
            text = ndarray_to_text(output[0][0][0], alphabet)
            print("\n\nResult:")
            print(text)
    else:
        print(
            "Correct usage: python3 _this.py --one_shot_infer <<path-of-input-wav-file>>"
        )

    delta = stopwatch(start)
    print("Net execution time including loading of the graph = " +
          format_duration(delta))
Example #27
    result = np.asarray([
        SPACE_INDEX if xt == SPACE_TOKEN else ord(xt) - FIRST_INDEX
        for xt in result
    ])
    return result
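
This fragment uses the classic CTC character-indexing scheme. A self-contained version under the usual assumptions (SPACE_TOKEN = '<space>', SPACE_INDEX = 0, FIRST_INDEX = ord('a') - 1, numpy imported as np):

    SPACE_TOKEN = '<space>'
    SPACE_INDEX = 0
    FIRST_INDEX = ord('a') - 1  # 'a' -> 1, 'b' -> 2, ...

    def encode_chars(tokens):
        return np.asarray([SPACE_INDEX if t == SPACE_TOKEN else ord(t) - FIRST_INDEX
                           for t in tokens])
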


train = pd.read_csv('./real_batch/clean-test_dev-combined.csv')
train = train.head(3)  # overfit on just 3 files
print(train.shape)

inputs_encoder = []
inputs_decoder = []
outputs_decoder = []
for ind, row in train.iterrows():
    inputs_encoder.append(audiofile_to_input_vector(row['wav_filename'], 26,
                                                    0))

for ind, row in train.iterrows():
    inputs_decoder.append(np.append([0],
                                    text_to_char_array(row['transcript'])))

for ind, row in train.iterrows():
    outputs_decoder.append(
        np.append(text_to_char_array(row['transcript']), [0]))

xt_decoder_input, xlen_decoder_input = helpers2.batch(inputs_decoder)

xt_encoder, xlen_encoder = helpers.batch(inputs_encoder)

xt_decoder_output, xlen_decoder_output = helpers2.batch(outputs_decoder)
Example #28
    new_input = tf.placeholder(tf.float32, [None, None, None],
                               name="new_input")
    # Load the saved .pb into the current graph to let us grab
    # access to the weights.
    logits, = tf.import_graph_def(graph_def,
                                  input_map={"input_node:0": new_input},
                                  return_elements=['logits:0'],
                                  name="newname",
                                  op_dict=None,
                                  producer_op_list=None)

    # Now let's dump these weights into a new copy of the network.
    with tf.Session(graph=graph) as sess:
        # Sample sentence, to make sure we've done it right
        # TODO: this file was substituted
        mfcc = audiofile_to_input_vector("LDC93S1.wav", 26, 9)

        # Okay, so this is ugly again.
        # We just want it to not crash.
        tf.app.flags.FLAGS.alphabet_config_path = "DeepSpeech/data/alphabet.txt"
        DeepSpeech.initialize_globals()
        logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

        # Here's where all the work happens. Copy the variables
        # over from the .pb to the session object.
        for var in tf.global_variables():
            sess.run(var.assign(sess.run('newname/' + var.name)))

        # Test to make sure we did it right.
        res = (sess.run(logits, {
            new_input: [mfcc],