Example #1
0
    def run(self):
        """Prepare the dataset: create output dirs, collect (audio, text)
        pairs, shuffle them and convert any FLAC files to WAV.

        Returns the shuffled list of (audio_file_name, transcribed_text)
        pairs, or None when processing is skipped (already done) or no
        data was found.
        """
        # First make output directories
        self.setupProcessedDataDirs()

        # Skip processing entirely if both master text files already exist
        if os.path.exists(os.path.join(self.train_dir, "master_text_file.txt"))\
            and os.path.exists(os.path.join(self.test_dir, "master_text_file.txt")):
            print("No need to run data processor...")
            return

        # Next check which data folders are present
        self.data_dirs = self.checkWhichDataFoldersArePresent()
        if len(self.data_dirs) == 0:
            print("Something went wrong, no data detected, check data directory..")
            return

        # Get pairs of (audio_file_name, transcribed_text)
        print("Figuring out which files need to be processed...")
        audio_file_text_pairs, will_convert = self.getFileNameTextPairs()
        print("Using {0} files in total dataset...".format(
            len(audio_file_text_pairs)))
        # Shuffle pairs so later train/test usage is not biased by file order
        shuffle(audio_file_text_pairs)

        if will_convert:
            # Max sequence length of 1 is enough here: the processor is used
            # only for FLAC -> WAV conversion, not feature extraction
            audio_processor = audioprocessor.AudioProcessor(1)
            for audio_file_name in audio_file_text_pairs:
                audio_processor.convertAndDeleteFLAC(
                    audio_file_name[0].replace(".wav", ".flac"))

        return audio_file_text_pairs
Example #2
0
 def _read_audio_and_transcode_label(filename_label):
     """Decode one (filename, label) pair handed over by tf.py_func.

     tf.py_func delivers numpy byte arrays, so both fields are converted
     back to str before use. Returns (features, feature_length,
     transcoded_label) as numpy arrays ready to feed the graph.
     """
     audio_path = str(filename_label[0], encoding='UTF-8')
     text_label = str(filename_label[1], encoding='UTF-8')
     processor = audioprocessor.AudioProcessor(max_input_seq_length, signal_processing)
     feature_vec, feature_len = processor.process_audio_file(audio_path)
     transcoded = dataprocessor.DataProcessor.get_str_labels(char_map, text_label)
     return (np.array(feature_vec, dtype=np.float32),
             np.array(feature_len, dtype=np.int32),
             np.array(transcoded, dtype=np.int32))
Example #3
0
    def evaluate_full(self, sess, eval_dataset, input_seq_length, signal_processing, char_map,
                      run_options=None, run_metadata=None):
        """Run the model over a whole evaluation dataset and score it.

        Samples are accumulated into batches of self.batch_size; the last,
        partial batch is padded with zero feature vectors and empty labels
        so the network always receives a full batch.

        Parameters:
            sess: session forwarded to self.process_input to run the network
            eval_dataset: iterable of (file, label, _) triples; must support len()
            input_seq_length: max input sequence length given to the AudioProcessor
            signal_processing: signal processing mode given to the AudioProcessor
            char_map: character map used to turn predictions back into text
            run_options, run_metadata: optional, forwarded to self.process_input

        Returns:
            (wer, cer): word and character error rates in percent, averaged
            over all scored (non-padding, non-skipped) samples.

        NOTE(review): if every sample is rejected as too long, wer_list and
        cer_list stay empty and the final divisions raise ZeroDivisionError.
        """
        # Create an audio_processor
        audio_processor = audioprocessor.AudioProcessor(input_seq_length, signal_processing)

        wer_list = []
        cer_list = []
        file_number = 0
        input_feat_vecs = []
        input_feat_vec_lengths = []
        labels = []
        for file, label, _ in eval_dataset:
            feat_vec, feat_vec_length = audio_processor.process_audio_file(file)
            file_number += 1
            label_data_length = len(label)
            # Skip samples whose audio or text exceed the model's fixed sizes
            if (label_data_length > self.max_target_seq_length) or\
               (feat_vec_length > self.max_input_seq_length):
                logging.warning("Warning - sample too long : %s (input : %d / text : %s)",
                                file, feat_vec_length, label_data_length)
            else:
                logging.debug("Processed file %d / %d", file_number, len(eval_dataset))
                input_feat_vecs.append(feat_vec)
                input_feat_vec_lengths.append(feat_vec_length)
                labels.append(label)

            # If we reached the last file then pad the lists to obtain a full batch
            if file_number == len(eval_dataset):
                for i in range(self.batch_size - len(input_feat_vecs)):
                    input_feat_vecs.append(np.zeros([self.max_input_seq_length,
                                                     audio_processor.feature_size]))
                    input_feat_vec_lengths.append(0)
                    # Empty label marks a padding entry; it is excluded from
                    # scoring below via the len(true_label) > 0 check
                    labels.append("")

            if len(input_feat_vecs) == self.batch_size:
                # Run the batch
                logging.debug("Running a batch")
                # Swap batch and time axes before feeding the network
                # (NOTE(review): presumably time-major [time, batch, features]
                # is what self.process_input expects — confirm)
                input_feat_vecs = np.swapaxes(input_feat_vecs, 0, 1)
                predictions = self.process_input(sess, input_feat_vecs, input_feat_vec_lengths,
                                                 run_options=run_options, run_metadata=run_metadata)
                for index, prediction in enumerate(predictions):
                    transcribed_text = dataprocessor.DataProcessor.get_labels_str(char_map, prediction)
                    true_label = labels[index]
                    if len(true_label) > 0:
                        # Normalize each edit distance by the reference length
                        # to get per-word / per-character error rates
                        nb_words = len(true_label.split())
                        nb_chars = len(true_label.replace(" ", ""))
                        wer_list.append(self.calculate_wer(transcribed_text, true_label) / float(nb_words))
                        cer_list.append(self.calculate_cer(transcribed_text, true_label) / float(nb_chars))
                # Reset the lists
                input_feat_vecs = []
                input_feat_vec_lengths = []
                labels = []

        # Average over samples and convert to percentages
        wer = (sum(wer_list) * 100) / float(len(wer_list))
        cer = (sum(cer_list) * 100) / float(len(cer_list))
        return wer, cer
Example #4
0
def main():
    """Entry point: load hyper parameters, then either train or process a file."""
    prog_params = parse_args()
    param_handler = hyperparams.HyperParameterHandler(prog_params['config_file'])
    hparams = param_handler.getHyperParams()
    processor = audioprocessor.AudioProcessor(
        hparams["max_input_seq_length"],
        hparams["load_save_input_vec"])

    if prog_params['train'] is True:
        train_rnn(hparams, prog_params)
    else:
        process_file(processor, hparams, prog_params['file'])
Example #5
0
def main():
    """Entry point: build the audio processor, then dispatch on the parsed args."""
    prog_params = parse_args()
    param_handler = hyperparams.HyperParameterHandler(prog_params['config_file'])
    hparams = param_handler.getHyperParams()
    processor = audioprocessor.AudioProcessor(
        hparams["max_input_seq_length"],
        hparams["signal_processing"])
    # The RNN input dimension depends on the chosen signal processing mode
    hparams["input_dim"] = processor.feature_size

    if prog_params['train'] is True:
        train_rnn(processor, hparams, prog_params)
    elif prog_params['file'] is not None:
        process_file(processor, hparams, prog_params['file'])
    elif prog_params['record'] is True:
        record_and_write(processor, hparams)
Example #6
0
def main():
    """Parse args, set up hyper parameters and char map, then run the requested mode."""
    prog_params = parse_args()
    param_handler = hyperparams.HyperParameterHandler(prog_params['config_file'])
    hparams = param_handler.get_hyper_params()
    processor = audioprocessor.AudioProcessor(
        hparams["max_input_seq_length"], hparams["signal_processing"])
    # The RNN input dimension depends on the chosen signal processing mode
    hparams["input_dim"] = processor.feature_size

    recognizer = SpeechRecognizer(hparams["language"])
    hparams["char_map"] = recognizer.get_char_map()
    hparams["char_map_length"] = recognizer.get_char_map_length()

    if prog_params['start_ps'] is True:
        start_ps_server(prog_params)
    if (prog_params['train_acoustic'] is True) or (prog_params['dtrain_acoustic'] is True):
        # 'True' and 'First_run_only' both enable size ordering of the dataset
        ordered = hparams["dataset_size_ordering"] in ('True', 'First_run_only')
        train_set, test_set = recognizer.load_acoustic_dataset(
            hparams["training_dataset_dirs"],
            hparams["test_dataset_dirs"],
            hparams["training_filelist_cache"], ordered,
            hparams["train_frac"])
        if prog_params['train_acoustic'] is True:
            train_acoustic_rnn(train_set, test_set, hparams, prog_params)
        else:
            distributed_train_acoustic_rnn(train_set, test_set, hparams,
                                           prog_params)
    elif prog_params['train_language'] is True:
        train_set, test_set = load_language_dataset(hparams)
        train_language_rnn(train_set, test_set, hparams, prog_params)
    elif prog_params['file'] is not None:
        process_file(processor, hparams, prog_params['file'])
    elif prog_params['record'] is True:
        record_and_write(processor, hparams)
    elif prog_params['evaluate'] is True:
        evaluate(hparams)
    elif prog_params['generate_text'] is True:
        generate_text(hparams)
Example #7
0
    def run(self):
        """Collect (audio_file, transcribed_text) pairs for the configured
        dataset type, shuffle them, and convert FLAC files to WAV when needed.

        Returns the final list of (wav_file_name, transcribed_text) pairs.
        """
        if self.data_type == "Shtooka":
            pairs, will_convert = self.getFileNameTextPairs_Shtooka(
                self.raw_data_path)
        elif self.data_type == "LibriSpeech":
            # Check which data folders are present
            present_dirs = self.checkWhichDataFoldersArePresent()
            if not present_dirs:
                raise Exception(
                    "ERROR : something went wrong, no data detected, check data directory."
                )
            # Get pairs of (audio_file_name, transcribed_text)
            pairs, will_convert = self.getFileNameTextPairs_LibriSpeech(
                present_dirs)
        else:
            raise Exception("ERROR : unknown training_dataset_type")

        # Check that there is data
        if not pairs:
            raise Exception("ERROR : no data found in directory {0}".format(
                self.raw_data_path))

        # Shuffle pairs
        shuffle(pairs)

        if not will_convert:
            return pairs

        # Convert every FLAC file in place and point the pair at the new WAV
        converter = audioprocessor.AudioProcessor(1)
        converted_pairs = []
        for audio_file, text in pairs:
            if audio_file.endswith(".flac"):
                converter.convertAndDeleteFLAC(audio_file)
                converted_pairs.append((audio_file.replace(".flac", ".wav"), text))
            else:
                converted_pairs.append((audio_file, text))
        return converted_pairs
 def initializeAudioProcessor(self, max_input_seq_length, load_save_input_vec):
     """Create this object's AudioProcessor with the given settings."""
     self.audio_processor = audioprocessor.AudioProcessor(
         max_input_seq_length, load_save_input_vec)
Example #9
0
 def initializeAudioProcessor(self, max_input_seq_length):
     """Create this object's AudioProcessor, capped at the given input length."""
     self.audio_processor = audioprocessor.AudioProcessor(max_input_seq_length)
    def setUp(self):
        """Build an on-disk fixture tree with tiny samples of each supported
        dataset layout (LibriSpeech, Shtooka, Vystadial_2013, TEDLIUM).

        Raises if ./test_directory already exists so leftovers from a previous
        run are never silently reused.
        """
        # Processor under test; 1000 is the max input sequence length
        self.audio_processor = audioprocessor.AudioProcessor(1000)
        # Create a temp dir for testing purpose
        cwd = os.getcwd()
        self.directory = cwd + "/test_directory/"
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        else:
            # Test self.directory already exist, throw an error
            raise Exception('test_directory already exists')
        # Setup LibriSpeech files: speaker/chapter hierarchy plus a .trans.txt
        # listing utterance ids and their transcriptions
        os.makedirs(self.directory + "Libri/")
        os.makedirs(self.directory + "Libri/train-clean-100/")
        os.makedirs(self.directory + "Libri/train-clean-100/" + "19/")
        os.makedirs(self.directory + "Libri/train-clean-100/" + "19/" + "198/")
        text_file = self.directory + "Libri/train-clean-100/19/198/19-198.trans.txt"
        with open(text_file, "w") as f:
            f.write("19-198-0000 NORTHANGER ABBEY\n")
            f.write("19-198-0001 THIS LITTLE WORK...\n")
            f.write("19-198-0002 NEITHER THE...\n")
        # Create empty audio files
        # NOTE(review): only two of the three transcribed utterances get an
        # audio file — presumably to exercise missing-audio handling; confirm
        open(self.directory + "Libri/train-clean-100/19/198/19-198-0000.flac", 'a').close()
        open(self.directory + "Libri/train-clean-100/19/198/19-198-0001.flac", 'a').close()

        # Setup Shtooka files: an index.tags.txt with SWAC metadata sections
        os.makedirs(self.directory + "Shtooka/")
        os.makedirs(self.directory + "Shtooka/flac/")
        text_file = self.directory + "Shtooka/flac/index.tags.txt"
        with open(text_file, "w") as f:
            f.write("\Swac_Index_Tags\n\n")
            f.write("[GLOBAL]\n")
            f.write("SWAC_LANG = eng\n")
            f.write("SWAC_SPEAK_LANG = eng\n\n")
            f.write("[eng - I_arose.flac]\n")
            f.write("SWAC_TEXT = I arose\n")
            f.write("SWAC_ALPHAIDX = arise\n")
            f.write("SWAC_BASEFORM = arise\n")
            f.write("SWAC_FORM_NAME = Simple Past\n\n")
            f.write("[eng - I_ate.flac]\n")
            f.write("SWAC_TEXT = I ate\n")
            f.write("SWAC_ALPHAIDX = eat\n")
            f.write("SWAC_BASEFORM = eat\n")
            f.write("SWAC_FORM_NAME = Simple Past\n\n")
            # This last entry deliberately uses "KEY=value" without spaces
            f.write("[eng - I_awoke.flac]\n")
            f.write("SWAC_TEXT=I awoke\n")
            f.write("SWAC_ALPHAIDX=awake\n")
            f.write("SWAC_BASEFORM=awake\n")
            f.write("SWAC_FORM_NAME=Simple Past\n")
        # Create empty audio files (no audio for I_awoke.flac)
        open(self.directory + "Shtooka/flac/eng - I_arose.flac", 'a').close()
        open(self.directory + "Shtooka/flac/eng - I_ate.flac", 'a').close()

        # Setup Vystadial files: one .wav.trn transcription per audio file
        os.makedirs(self.directory + "Vystadial_2013/")
        os.makedirs(self.directory + "Vystadial_2013/data_voip_en/")
        os.makedirs(self.directory + "Vystadial_2013/data_voip_en/dev/")
        text_file = self.directory + "Vystadial_2013/data_voip_en/dev/jurcic-028-121024_234433_0013625_0013836.wav.trn"
        with open(text_file, "w") as f:
            f.write("ALRIGHT THANK YOU AND GOODBYE\n")
        text_file = self.directory + "Vystadial_2013/data_voip_en/dev/jurcic-028-121015_000550_0026689_0027040.wav.trn"
        with open(text_file, "w") as f:
            f.write("FILE WITH NO AUDIO...\n")
        # Create empty audio file (the second .trn has no matching .wav)
        open(self.directory + "Vystadial_2013/data_voip_en/dev/jurcic-028-121024_234433_0013625_0013836.wav",
             'a').close()

        # Setup TEDLIUM files: .stm transcript plus a matching .sph audio file
        os.makedirs(self.directory + "TEDLIUM/")
        os.makedirs(self.directory + "TEDLIUM/test/")
        os.makedirs(self.directory + "TEDLIUM/test/stm/")
        text_file = self.directory + "TEDLIUM/test/stm/AimeeMullins_2009P.stm"
        with open(text_file, "w") as f:
            f.write("AimeeMullins_2009P 1 inter_segment_gap 0 17.82 <o,,unknown> ignore_time_segment_in_scoring\n")
            f.write("AimeeMullins_2009P 1 AimeeMullins 17.82 28.81 <o,f0,female> i 'd like to share ...\n")
        # Create empty audio file
        os.makedirs(self.directory + "TEDLIUM/test/sph/")
        open(self.directory + "TEDLIUM/test/sph/AimeeMullins_2009P.sph", 'a').close()
Example #11
0
def main():
    """Parse args and hyper parameters, then run the selected pipeline step."""
    all_params, prog_params = parse_args()
    param_handler = hyperparams.HyperParameterHandler(
        prog_params['config_file'],
        checkpoint_dir=prog_params['train_dir'],
        program_params=all_params)
    hparams = param_handler.get_hyper_params()
    processor = audioprocessor.AudioProcessor(
        hparams["max_input_seq_length"], hparams["signal_processing"])
    # The RNN input dimension depends on the chosen signal processing mode
    hparams["input_dim"] = processor.feature_size

    recognizer = SpeechRecognizer(hparams["language"])
    hparams["char_map"] = recognizer.get_char_map()
    hparams["char_map_length"] = recognizer.get_char_map_length()

    if prog_params['start_ps'] is True:
        start_ps_server(prog_params)
    if prog_params['save_acoustic'] is True:
        # 'True' and 'First_run_only' both enable size ordering of the dataset
        ordered = hparams["dataset_size_ordering"] in ('True', 'First_run_only')
        train_set, test_set = recognizer.load_acoustic_dataset(
            hparams["training_dataset_dirs"],
            hparams["test_dataset_dirs"],
            hparams["training_filelist_cache"],
            ordered,
            hparams["train_frac"])
        logging.info("Save datasets...")
        save_acoustic_rnn(train_set, "train", hparams, prog_params)
        save_acoustic_rnn(test_set, "test", hparams, prog_params)
        # Push the serialized datasets to the workspace's dataset catalog
        kl = client.Client()
        kl.datasets.push(os.environ.get('WORKSPACE_NAME'),
                         'librispeech-dev',
                         '1.0.' + os.environ.get('BUILD_ID') + '-tfrecords',
                         prog_params["train_dir"],
                         create=True)
    elif (prog_params['train_acoustic'] is True) or (prog_params['dtrain_acoustic'] is True):
        # 'True' and 'First_run_only' both enable size ordering of the dataset
        ordered = hparams["dataset_size_ordering"] in ('True', 'First_run_only')
        # Prefer a pre-built dataset from the command line when provided
        if prog_params['train_set'] is not None:
            train_set = prog_params['train_set']
            test_set = prog_params['test_set']
        else:
            train_set, test_set = recognizer.load_acoustic_dataset(
                hparams["training_dataset_dirs"],
                hparams["test_dataset_dirs"],
                hparams["training_filelist_cache"],
                ordered,
                hparams["train_frac"])
        if prog_params['train_acoustic'] is True:
            train_acoustic_rnn(train_set, test_set, hparams, prog_params)
        else:
            distributed_train_acoustic_rnn(train_set, test_set, hparams, prog_params)
    elif prog_params['train_language'] is True:
        train_set, test_set = load_language_dataset(hparams)
        train_language_rnn(train_set, test_set, hparams, prog_params)
    elif prog_params['file'] is not None:
        process_file(processor, hparams, prog_params['file'])
    elif prog_params['record'] is True:
        record_and_write(processor, hparams)
    elif prog_params['evaluate'] is True:
        evaluate(hparams)
    elif prog_params['generate_text'] is True:
        generate_text(hparams)