def _maybe_split_stm_dataset(extracted_dir, data_set):
    """Explode every stm file of a data set into per-segment txt files.

    Each ``*.stm`` file found in ``<extracted_dir>/<data_set>/stm`` is parsed
    with ``parse_stm_file``. For every segment a text file named
    ``<stm base name>-<start_time>-<stop_time>.txt`` holding the segment's
    transcript is written next to the stm file, unless a file with that name
    already exists. Once all segments of an stm file have been handled, the
    stm file itself is deleted.

    Args:
        extracted_dir: Directory that contains the data set directories.
        data_set: Name of the data set sub-directory to process.
    """
    stm_dir = path.join(extracted_dir, data_set, "stm")

    for stm_file in glob(path.join(stm_dir, "*.stm")):
        segments = parse_stm_file(stm_file)

        # All segment files of one stm file share its base name as prefix.
        base_name = path.splitext(path.basename(stm_file))[0]

        for segment in segments:
            begin = segment.start_time
            end = segment.stop_time
            txt_name = "-".join([base_name, str(begin), str(end)]) + ".txt"
            txt_file = path.join(stm_dir, txt_name)

            # Leave segment files from an earlier run untouched.
            if gfile.Exists(txt_file):
                continue

            with open(txt_file, "w+") as handle:
                handle.write(segment.transcript)

        # The stm file is no longer needed once it has been split.
        remove(stm_file)
def _maybe_split_dataset(extracted_dir, data_set):
    """Split each recording of a data set into one wav file per stm segment.

    For every ``*.stm`` file in ``<extracted_dir>/<data_set>/stm`` the wav
    file with the same base name in ``<extracted_dir>/<data_set>/wav`` is
    opened and cut into files named
    ``<base name>-<start_time>-<stop_time>.wav``, one per segment returned by
    ``parse_stm_file``. Segment files that already exist are not recreated.
    After all segments of a recording were processed successfully, the
    original wav file is deleted.

    Args:
        extracted_dir: Directory that contains the data set directories.
        data_set: Name of the data set sub-directory to process.

    Returns:
        A ``pandas.DataFrame`` with the columns ``wav_filename``,
        ``wav_filesize`` (size on disk as reported by ``path.getsize``) and
        ``transcript``, holding one row per segment.
    """
    stm_dir = path.join(extracted_dir, data_set, "stm")
    wav_dir = path.join(extracted_dir, data_set, "wav")

    files = []

    for stm_file in glob(path.join(stm_dir, "*.stm")):
        stm_segments = parse_stm_file(stm_file)

        # The recording and all of its segment files share the stm base name.
        base_name = path.splitext(path.basename(stm_file))[0]
        wav_file = path.join(wav_dir, base_name + ".wav")

        orig_audio = wave.open(wav_file, "r")
        try:
            for stm_segment in stm_segments:
                start_time = stm_segment.start_time
                stop_time = stm_segment.stop_time
                new_wav_filename = (
                    base_name + "-" + str(start_time) + "-" + str(stop_time) + ".wav"
                )
                new_wav_file = path.join(wav_dir, new_wav_filename)

                # Only cut the segment if an earlier run has not done so.
                if not gfile.Exists(new_wav_file):
                    _split_wav(orig_audio, start_time, stop_time, new_wav_file)

                new_wav_filesize = path.getsize(new_wav_file)
                files.append(
                    (new_wav_file, new_wav_filesize, stm_segment.transcript)
                )
        finally:
            # Release the reader even when splitting a segment fails, so the
            # file handle is not leaked.
            orig_audio.close()

        # Reached only when every segment was handled without an error.
        remove(wav_file)

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"]
    )
def _maybe_split_dataset(extracted_dir, data_set):
    """Split each recording of a data set into one wav file per stm segment.

    For every ``*.stm`` file in ``<extracted_dir>/<data_set>/stm`` the wav
    file with the same base name in ``<extracted_dir>/<data_set>/wav`` is
    opened and cut into files named
    ``<base name>-<start_time>-<stop_time>.wav``, one per segment returned by
    ``parse_stm_file``. Segment files that already exist are not recreated.
    The original wav file is kept.

    Args:
        extracted_dir: Directory that contains the data set directories.
        data_set: Name of the data set sub-directory to process.

    Returns:
        A ``pandas.DataFrame`` with the columns ``wav_filename`` (absolute
        path of the segment file), ``wav_filesize`` (size on disk as reported
        by ``path.getsize``) and ``transcript``, holding one row per segment.
    """
    stm_dir = path.join(extracted_dir, data_set, "stm")
    wav_dir = path.join(extracted_dir, data_set, "wav")

    files = []

    for stm_file in glob(path.join(stm_dir, "*.stm")):
        stm_segments = parse_stm_file(stm_file)

        # The recording and all of its segment files share the stm base name.
        base_name = path.splitext(path.basename(stm_file))[0]
        wav_file = path.join(wav_dir, base_name + ".wav")

        orig_audio = wave.open(wav_file, "r")
        try:
            for stm_segment in stm_segments:
                start_time = stm_segment.start_time
                stop_time = stm_segment.stop_time
                new_wav_filename = (
                    base_name + "-" + str(start_time) + "-" + str(stop_time) + ".wav"
                )
                new_wav_file = path.join(wav_dir, new_wav_filename)

                # Only cut the segment if an earlier run has not done so.
                if not gfile.Exists(new_wav_file):
                    _split_wav(orig_audio, start_time, stop_time, new_wav_file)

                new_wav_filesize = path.getsize(new_wav_file)
                files.append(
                    (
                        path.abspath(new_wav_file),
                        new_wav_filesize,
                        stm_segment.transcript,
                    )
                )
        finally:
            # Release the reader even when splitting a segment fails, so the
            # file handle is not leaked.
            orig_audio.close()

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"]
    )