def convert_chapter(path: pathlib.Path, sink: pathlib.Path, prefix: str, transformer: sox.Transformer):
    """Convert one chapter's FLAC files to WAV and write one label file each.

    The chapter's transcription file is looked up with
    ``locate_transcriptions(path)``. Every non-empty line is read as
    ``<utterance id> <transcript>``: ``<utterance id>.flac`` inside ``path``
    is converted to ``<prefix>-<utterance id>.wav`` inside ``sink`` and the
    transcript is written next to it as ``<prefix>-<utterance id>.lab``.

    Args:
        path: directory holding the chapter's FLAC files.
        sink: output directory; created (with parents) when missing.
        prefix: string prepended to every output file name.
        transformer: object whose ``build_file`` performs the conversion.
    """
    transcription_file = locate_transcriptions(path)
    # mkdir(exist_ok=True) avoids the check-then-create race of
    # ``if not is_dir(): makedirs()``.
    sink.mkdir(parents=True, exist_ok=True)
    if not transcription_file:
        return

    # A separate name for the handle keeps the path variable intact.
    with open(str(transcription_file), "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no utterance id; nothing to convert.
                continue

            # partition() keeps the whole line as the id when it has no
            # space; the former find() == -1 sliced the last character off.
            file_name, _, label = line.partition(" ")
            file_name = file_name.strip()
            label = label.strip()

            audio_input_file = path / (file_name + ".flac")
            audio_output_file = sink / f"{prefix}-{file_name}.wav"
            label_output_file = sink / f"{prefix}-{file_name}.lab"

            transformer.build_file(input_filepath=str(audio_input_file),
                                   output_filepath=str(audio_output_file))
            with open(str(label_output_file), "w") as label_file:
                label_file.write(label)
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC files to WAV and write a tab-separated transcript index.

    Args:
        input_dir: directory holding the dataset.
        source_name: sub-directory of ``input_dir`` that is walked for
            ``*.trans.txt`` files.
        target_name: sub-directory of ``input_dir`` receiving the WAV files.
        output_dir: directory in which the csv file is written.
        output_file: name of the csv file.
    """
    print(f"Pre-processing audio and transcript for {source_name}")

    flac_root = os.path.join(input_dir, source_name)
    wav_root = os.path.join(input_dir, target_name)
    if not tf.io.gfile.exists(wav_root):
        tf.io.gfile.makedirs(wav_root)

    converter = Transformer()
    rows = []
    for folder, _, names in tf.io.gfile.walk(flac_root):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            with codecs.open(os.path.join(folder, trans_name), "r", "utf-8") as reader:
                for entry in reader:
                    # Each line is "<sequence id> <transcript>".
                    seq_id, raw_text = entry.split(" ", 1)

                    # Decompose, drop every non-ASCII code point, then
                    # normalise whitespace and case.
                    decomposed = unicodedata.normalize("NFKD", raw_text)
                    ascii_text = decomposed.encode("ascii", "ignore").decode(
                        "ascii", "ignore")
                    transcript = ascii_text.strip().lower()

                    flac_path = os.path.join(folder, seq_id + ".flac")
                    wav_path = os.path.join(wav_root, seq_id + ".wav")
                    if not tf.io.gfile.exists(wav_path):
                        converter.build(flac_path, wav_path)

                    rows.append((os.path.abspath(wav_path),
                                 os.path.getsize(wav_path),
                                 transcript))

    csv_file_path = os.path.join(output_dir, output_file)
    table = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_filesize", "transcript"])
    table.to_csv(csv_file_path, index=False, sep="\t")
    print(f"Successfully generated csv file {csv_file_path}")
def convert_cv(transformer: sox.Transformer, max_per_speaker: int, tsv: pathlib.Path, clips: pathlib.Path, sink: pathlib.Path, prefix: str):
    """Convert clips listed in a TSV into per-speaker WAV and label files.

    The TSV is grouped by ``client_id``; for each speaker only the last
    ``max_per_speaker`` rows are used. For every row the clip named in
    ``path`` is converted to ``<prefix>-<stem>.wav`` and the normalized
    ``sentence`` is written to ``<prefix>-<stem>.lab``, both under
    ``sink / <client_id>``.

    Failures on a single clip are reported on stdout and do not stop the run.

    Args:
        transformer: object whose ``build_file`` performs the conversion.
        max_per_speaker: number of trailing rows kept per speaker.
        tsv: tab-separated metadata file with ``client_id``, ``path`` and
            ``sentence`` columns.
        clips: directory containing the source audio clips.
        sink: root output directory.
        prefix: string prepended to every output file name.
    """
    meta = pd.read_csv(tsv, delimiter="\t")
    for client, df in tqdm(meta.groupby(by="client_id")):
        df = df.tail(max_per_speaker)
        speaker_sink = sink / client
        speaker_sink.mkdir(parents=True, exist_ok=True)

        for audio, transcription in zip(df["path"], df["sentence"]):
            try:
                # Text before the first dot of the clip file name.
                audio_stem = audio.split(".")[0]
                input_audio_file = clips / audio
                output_audio_file = speaker_sink / f"{prefix}-{audio_stem}.wav"
                output_transcription_file = speaker_sink / f"{prefix}-{audio_stem}.lab"

                # Normalize and convert BEFORE creating the label file: the
                # previous order opened (and truncated) the .lab first, so a
                # failing normalization or conversion left an empty or
                # orphan label behind.
                transcription = normalize_transcription(transcription)
                transformer.build_file(input_filepath=str(input_audio_file),
                                       output_filepath=str(output_audio_file))
                with open(str(output_transcription_file), "w") as o:
                    o.write(transcription)
            except Exception as e:
                # Deliberate best-effort: report the clip and keep going.
                print(
                    f"Failed to convert audio {audio} with sentence {transcription} reason: {str(e)}"
                )
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC to WAV and build a csv index of the transcripts.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
            e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. test-clean.csv
    """
    logging.info("Processing audio and transcript for %s" % source_name)

    flac_root = os.path.join(input_dir, source_name)
    wav_root = os.path.join(input_dir, target_name)
    if not gfile.Exists(wav_root):
        gfile.MakeDirs(wav_root)

    converter = Transformer()
    rows = []
    # One pass: convert each FLAC and collect its csv row.
    for folder, _, names in gfile.Walk(flac_root):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            with codecs.open(os.path.join(folder, trans_name), "r", "utf-8") as reader:
                for entry in reader:
                    seqid, raw_text = entry.split(" ", 1)

                    # encode() yields bytes, so decode() brings the text
                    # back to str after non-ASCII characters are dropped.
                    as_bytes = unicodedata.normalize("NFKD", raw_text).encode(
                        "ascii", "ignore")
                    transcript = as_bytes.decode("ascii", "ignore").strip().lower()

                    flac_path = os.path.join(folder, seqid + ".flac")
                    wav_path = os.path.join(wav_root, seqid + ".wav")
                    if not gfile.Exists(wav_path):
                        converter.build(flac_path, wav_path)

                    wav_length = get_wave_file_length(wav_path)
                    rows.append((os.path.abspath(wav_path), wav_length, transcript))

    # The csv has three columns:
    # "wav_filename", "wav_length_ms", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    table = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_length_ms", "transcript"]
    )
    table.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
def extract_audio():
    """Extract a video-aligned mono WAV next to every video file.

    For each path returned by ``find_all_video_files(output_dir)``:

    1. remux the video into a temporary ``temp.mkv`` with ``mkvmerge``;
    2. dump the timestamps of track 0 and track 1 with ``mkvextract`` and
       read the first timestamp of each (the line after the header);
    3. extract the audio with ``ffmpeg`` (44100 Hz, one channel);
    4. pad the start of the audio when it begins later than the video, or
       trim it when it begins earlier, and write ``<video stem>.wav``.

    All intermediate files are removed, including when a step fails.
    """
    all_videos = find_all_video_files(output_dir)
    for video in tqdm(all_videos):
        work_dir = os.path.dirname(video)
        mkvfile = os.path.join(work_dir, 'temp.mkv')
        video_ts_file = os.path.join(work_dir, 'video_ts.txt')
        audio_ts_file = os.path.join(work_dir, 'audio_ts.txt')
        audio_tmp = os.path.join(work_dir, 'temp.wav')

        try:
            # Argument lists (no shell) keep paths containing spaces or
            # shell metacharacters intact.
            subprocess.call(['mkvmerge', '-o', mkvfile, video])
            subprocess.call(
                ['mkvextract', mkvfile, 'timestamps_v2', '0:' + video_ts_file])
            subprocess.call(
                ['mkvextract', mkvfile, 'timestamps_v2', '1:' + audio_ts_file])

            with open(video_ts_file, 'r') as f:
                f.readline()  # skip header
                video_start = f.readline()
            with open(audio_ts_file, 'r') as f:
                f.readline()  # skip header
                audio_start = f.readline()
            # NOTE(review): int() requires integral timestamps in the
            # dump — confirm against the mkvextract version in use.
            offset_ms = int(audio_start) - int(video_start)

            # extract audio
            subprocess.call(
                ['ffmpeg', '-i', video, '-ar', '44100', '-ac', '1', '-y', audio_tmp])

            # use the offset to pad the audio with zeros, or trim the audio
            audio_name = os.path.splitext(video)[0] + '.wav'
            tfm = Transformer()
            if offset_ms >= 0:
                tfm.pad(start_duration=offset_ms / 1000)
            else:
                tfm.trim(start_time=-offset_ms / 1000)
            tfm.build(audio_tmp, audio_name)
        finally:
            # Clean up whatever was produced, even after a failure; the
            # existence check keeps cleanup from masking the real error.
            for tmp_file in (mkvfile, audio_tmp, video_ts_file, audio_ts_file):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
def preprocess_wav(cls, fpath: Union[str, Path]) -> np.ndarray:
    """Load a waveform through sox and return it as scaled float32 samples.

    The sox chain applies ``norm``, ``silence`` (threshold 1, minimum
    duration 0.1) and an output format of ``cls.sample_rate`` Hz, 16 bits,
    one channel. The resulting samples are divided by 2**15.
    """
    sox_chain = Transformer()
    sox_chain.norm()
    sox_chain.silence(silence_threshold=1, min_silence_duration=0.1)
    sox_chain.set_output_format(rate=cls.sample_rate, bits=16, channels=1)

    samples = sox_chain.build_array(input_filepath=str(fpath))
    scaled = samples / (2**15)
    return scaled.astype(np.float32)
def create_datapoints(transformer: sox.Transformer, writers: Writers, grid: pathlib.Path, audio: pathlib.Path):
    """Cut word utterances out of an audio file using its TextGrid.

    The audio file is ``audio / <parent directory name of grid> /
    <grid stem>.wav``. Every interval of the first TextGrid tier whose mark
    is a key of ``writers.word_counts`` is passed to ``writers.write`` with
    0.1 s of extra context on each side, clamped to the audio bounds.
    A message is printed when the audio file does not exist.
    """
    audio_file = audio / grid.parts[-2] / f"{grid.stem}.wav"
    if not audio_file.is_file():
        print(f"File not found: {audio_file}")
        return

    samples = transformer.build_array(input_filepath=str(audio_file))
    first_tier = textgrid.TextGrid.fromFile(grid)[0]
    for interval in first_tier:
        word = interval.mark
        if word not in writers.word_counts:
            continue

        rate = transformer.output_format["rate"]
        first_sample = int(max((interval.minTime - 0.1) * rate, 0))
        last_sample = int(min((interval.maxTime + 0.1) * rate, samples.size))
        writers.write(word=word,
                      sample_rate=rate,
                      audio=samples[first_sample:last_sample])
def __process_transcript(file_path: str, dst_folder: str):
    """
    Converts flac files to wav from a given transcript, capturing the metadata.

    Each non-blank transcript line is ``<utterance id> <text>``; the flac
    file ``<utterance id>.flac`` is expected next to the transcript.

    Args:
        file_path: path to a source transcript with flac sources
        dst_folder: path where wav files will be stored

    Returns:
        a list of metadata entries for processed files, each a dict with
        ``audio_filepath``, ``duration`` and ``text`` keys.

    Raises:
        ValueError: if a non-blank line contains no space separator.
    """
    entries = []
    root = os.path.dirname(file_path)
    with open(file_path, encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                # A blank line previously crashed the whole run.
                continue
            # ``utt_id`` rather than ``id`` so the builtin is not shadowed.
            utt_id, text = line.split(" ", 1)
            transcript_text = text.lower().strip()

            # Convert FLAC file to WAV
            flac_file = os.path.join(root, utt_id + ".flac")
            wav_file = os.path.join(dst_folder, utt_id + ".wav")
            if not os.path.exists(wav_file):
                Transformer().build(flac_file, wav_file)

            # check duration; an argument list (no shell) keeps paths with
            # spaces or shell metacharacters intact.
            duration = subprocess.check_output(["soxi", "-D", wav_file])

            entries.append({
                'audio_filepath': os.path.abspath(wav_file),
                'duration': float(duration),
                'text': transcript_text,
            })
    return entries
def convert(self):
    """Convert every mp3 under ``self.mp3_directory`` to a wav file.

    Files whose wav already exists are left untouched.

    Return:
        wav_directory: the directory returned by ``self._pre_convert()``,
            into which the wav files are written.
    """
    wav_directory = self._pre_convert()
    for mp3_file in self.mp3_directory.glob('**/*.mp3'):
        base_name = os.path.splitext(os.path.basename(mp3_file))[0]
        wav_file = path.join(wav_directory, base_name + ".wav")

        if path.exists(wav_file):
            _logger.debug("Already converted mp3 file %s to wav file %s" % (mp3_file, wav_file))
            continue

        _logger.debug("Converting mp3 file %s to wav file %s" % (mp3_file, wav_file))
        converter = Transformer()
        converter.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
        converter.build(str(mp3_file), str(wav_file))
    return wav_directory
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Convert FLAC files to WAV and collect their transcripts.

    Walks ``extracted_dir/data_set`` for ``*.trans.txt`` files whose lines
    are ``<sequence id> <transcript>`` and converts ``<sequence id>.flac``
    (next to the transcript file) into ``extracted_dir/dest_dir``.

    Args:
        extracted_dir: root directory of the extracted dataset.
        data_set: sub-directory to read from.
        dest_dir: sub-directory to write WAV files to; created if missing.

    Returns:
        pandas.DataFrame with columns ``wav_filename``, ``wav_filesize``
        and ``transcript``. Utterances whose conversion or size lookup
        raises ``OSError`` are reported on stdout and left out.
    """
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    files = []
    for root, _, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    # Parse each segment line
                    first_space = line.find(" ")
                    if first_space < 0:
                        # No separator: the line cannot be split into an id
                        # and a transcript (slicing with -1 would drop the
                        # last character of the id).
                        continue
                    seqid, transcript = line[:first_space], line[first_space + 1:]

                    transcript = unicodedata.normalize("NFKD", transcript) \
                        .encode("ascii", "ignore") \
                        .decode("ascii", "ignore")
                    transcript = transcript.lower().strip()

                    # Convert corresponding FLAC to a WAV
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    try:
                        if not os.path.exists(wav_file):
                            Transformer().build(flac_file, wav_file)
                        wav_filesize = os.path.getsize(wav_file)
                    except OSError:
                        print("Could not find file:", wav_file, flac_file)
                        continue

                    # Recorded for already-converted files too; appending
                    # only after a fresh conversion made a second run
                    # return an empty frame.
                    files.append((os.path.abspath(wav_file), wav_filesize, transcript))

    return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
def _maybe_convert_wav(data_dir, extracted_data, converted_data):
    """Convert every FLAC under the extracted directory to WAV, once.

    Nothing happens when ``data_dir/converted_data`` already exists.
    Otherwise it is created, every ``*.flac`` found under
    ``data_dir/extracted_data`` is converted into it (flat, by base name),
    and each source FLAC is deleted after its conversion.
    """
    flac_root = os.path.join(data_dir, extracted_data)
    wav_root = os.path.join(data_dir, converted_data)

    # An existing target directory is taken to mean the work is done.
    if gfile.Exists(wav_root):
        return

    os.makedirs(wav_root)
    for folder, _dirs, names in os.walk(flac_root):
        for flac_name in fnmatch.filter(names, '*.flac'):
            flac_path = os.path.join(folder, flac_name)
            stem = os.path.splitext(os.path.basename(flac_path))[0]
            wav_path = os.path.join(wav_root, stem + ".wav")

            Transformer().build(flac_path, wav_path)
            os.remove(flac_path)
def main():
    """Drive one run: open the page, convert the audio, submit the answer.

    Steps, in order: build a browser profile from the ``scraper()`` result,
    read the target address and inputs, automate the page, convert
    ``audio.mp3`` to ``audio.wav`` with sox, obtain the answer for that
    file and submit it through the browser.
    """
    fileName = "audio"
    sx = Transformer()

    proxyPool = scraper()
    prefs = getProfile(proxyPool)
    urlAddr, inputs = getInputs()
    browser = automatePage(fireFoxPath=FIREFOX_PATH, prefs=prefs, address=urlAddr, inputList=inputs)

    ##########################
    ### Convert Audio File ###
    ##########################
    # Call form of print: valid on Python 3 and prints the same text on
    # Python 2, where the bare statement form was Python-2-only syntax.
    print("Converting Audio File")
    sx.build(fileName + ".mp3", fileName + ".wav")

    answer = getAnswer(fileName)
    submitAnswer(browser, answer)
def process(x):
    """Convert one clip to WAV and return ``(wav path, duration, text)``.

    ``x`` is a ``(relative audio path, transcript)`` pair. The audio is
    read from ``audio_clips_path``, resampled to ``args.sample_rate`` with
    ``args.n_channels`` channels and written to ``wav_dir``. The transcript
    is lower-cased and stripped.
    """
    relative_path, sentence = x
    stem, _ext = os.path.splitext(os.path.basename(relative_path))
    sentence = sentence.lower().strip()

    source = os.path.join(audio_clips_path, relative_path)
    target = os.path.join(wav_dir, stem + '.wav')

    converter = Transformer()
    converter.rate(samplerate=args.sample_rate)
    converter.channels(n_channels=args.n_channels)
    converter.build(input_filepath=source, output_filepath=target)

    return target, sox.file_info.duration(target), sentence
def __init__(self, fname):
    '''
    opens .wav audio file with fname

    Stores the wave parameters (nchannels, sampwidth, framerate, nframes,
    comptype, compname), the duration in seconds, the peak value for the
    sample width, the raw frame bytes (``content``), the decoded
    ``samples`` array and a sox ``Transformer`` (``tsm``).

    Raises:
        FileNotFoundError: if ``fname`` cannot be opened; a hint is printed
            first and the original error is re-raised unchanged.
    '''
    self.wav_fname = fname
    try:
        with wave.open(fname, mode="r") as wav:
            (self.nchannels, self.sampwidth, self.framerate,
             self.nframes, self.comptype, self.compname) = wav.getparams()
            self.duration = self.nframes / self.framerate
            self.peak = 256**self.sampwidth / 2
            self.content = wav.readframes(self.nframes)
            # frombuffer replaces the deprecated binary mode of
            # np.fromstring; .copy() keeps the array writable and
            # independent of ``content``, as fromstring's result was.
            self.samples = np.frombuffer(
                self.content, dtype=types[self.sampwidth]).copy()
            self.tsm = Transformer()
    except FileNotFoundError as err:
        print(err)
        print("Try Audio.reload_file function")
        # Bare raise keeps the original message and traceback; raising a
        # fresh FileNotFoundError discarded both.
        raise
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Convert FLAC files to WAV, collect transcripts, delete the FLACs.

    Walks ``extracted_dir/data_set`` for transcription files. The format of
    each file ``1-2.trans.txt`` is::

        1-2-0 transcription of 1-2-0.flac
        1-2-1 transcription of 1-2-1.flac
        ...

    Each referenced FLAC is converted into ``extracted_dir/dest_dir`` in the
    same pass and then removed.

    Args:
        extracted_dir: root directory of the extracted dataset.
        data_set: sub-directory to read from.
        dest_dir: sub-directory to write WAV files to; created if missing.

    Returns:
        pandas.DataFrame with columns ``wav_filename``, ``wav_filesize``
        and ``transcript``.
    """
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    files = []
    for root, _, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    # Parse each segment line
                    first_space = line.find(" ")
                    seqid, transcript = line[:first_space], line[first_space + 1:]

                    # encode() returns bytes on Python 3, so decode() turns
                    # the ASCII-only text back into a str.
                    transcript = unicodedata.normalize("NFKD", transcript) \
                        .encode("ascii", "ignore") \
                        .decode("ascii", "ignore")
                    transcript = transcript.lower().strip()

                    # Convert corresponding FLAC to a WAV
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not os.path.exists(wav_file):
                        Transformer().build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))

                    # On a re-run the WAV exists and the FLAC was already
                    # deleted; an unconditional remove raised
                    # FileNotFoundError at that point.
                    if os.path.exists(flac_file):
                        os.remove(flac_file)

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
def generate_wavs(data_dir):
    """Decode the mp3 files of ``data_dir`` to wav and resample them.

    Four passes, each with its own progress bar:

    1. export every ``*.mp3`` in ``data_dir`` as wav into the ``wavs``
       folder, named after characters ``[-10:-4]`` of the mp3 path;
    2. resample every wav in ``data_dir + '/wavs/'`` to ``sample_rate``,
       writing ``<name>k16.wav``;
    3. delete every wav that does not end in ``k16.wav``;
    4. rename the remaining files with ``k16`` removed from their path.
    """
    print("hello")

    # Pass 1: mp3 -> wav. The export prefix is derived with splitext(),
    # exactly as before.
    export_prefix = path.splitext(data_dir)[0] + "/wavs/"
    for mp3_path in ProgressBar()(glob(path.join(data_dir, '*.mp3'))):
        decoded = pydub.AudioSegment.from_mp3(mp3_path)
        # NOTE(review): fixed slice — presumably the names are six
        # characters plus ".mp3"; verify against the dataset.
        clip_id = mp3_path[-10:-4]
        decoded.export(export_prefix + clip_id + ".wav", format="wav")

    wav_dir = data_dir + '/wavs/'
    wav_pattern = path.join(wav_dir, '*.wav')

    # Pass 2: change audio file to 16k sample rate.
    for wav_path in ProgressBar()(glob(wav_pattern)):
        resampled_path = path.splitext(wav_path)[0] + "k16.wav"
        resampler = Transformer()
        resampler.convert(samplerate=sample_rate)
        resampler.build(wav_path, resampled_path)

    # Pass 3: remove old files.
    for wav_path in ProgressBar()(glob(wav_pattern)):
        if not wav_path.endswith("k16.wav"):
            os.remove(wav_path)

    # Pass 4: rename files to remove k16. replace() acts on the whole path
    # string, directory components included.
    for wav_path in ProgressBar()(glob(wav_pattern)):
        os.rename(wav_path, wav_path.replace('k16', ''))

    print("end")
def _maybe_convert_wav_dataset(extracted_dir, data_set):
    """Convert a data set's sph files to wav, once.

    Nothing happens when ``extracted_dir/data_set/wav`` already exists.
    Otherwise it is created, every ``*.sph`` in
    ``extracted_dir/data_set/sph`` is converted into it and deleted, and
    the ``sph`` directory itself is removed afterwards.
    """
    sph_dir = path.join(extracted_dir, data_set, "sph")
    wav_dir = path.join(extracted_dir, data_set, "wav")

    # An existing wav directory is taken to mean the conversion was done.
    if gfile.Exists(wav_dir):
        return

    makedirs(wav_dir)
    for sph_file in glob(path.join(sph_dir, "*.sph")):
        converter = Transformer()
        stem = path.splitext(path.basename(sph_file))[0]
        converter.build(sph_file, path.join(wav_dir, stem + ".wav"))
        remove(sph_file)

    # The sph files are gone, so drop their directory.
    rmdir(sph_dir)
def __process_data(data_folder: str, dst_folder: str, manifest_file: str):
    """
    Converts flac to wav and build manifests's json

    Every ``*.trans.txt`` under ``data_folder`` is read; each non-blank line
    is ``<utterance id> <text>`` and refers to ``<utterance id>.flac`` next
    to the transcript. The manifest gets one JSON object per line with the
    keys ``audio_filepath``, ``duration`` and ``text``.

    Args:
        data_folder: source with flac files
        dst_folder: where wav files will be stored
        manifest_file: where to store manifest

    Returns:
        None

    Raises:
        ValueError: if a non-blank transcript line has no space separator.
    """
    os.makedirs(dst_folder, exist_ok=True)

    files = []
    entries = []

    for root, _, filenames in os.walk(data_folder):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            files.append((os.path.join(root, filename), root))

    for transcripts_file, root in tqdm(files):
        with open(transcripts_file, encoding="utf-8") as fin:
            for line in fin:
                if not line.strip():
                    # A blank line previously crashed the whole run.
                    continue
                # ``utt_id`` rather than ``id`` so the builtin is not
                # shadowed.
                utt_id, text = line.split(" ", 1)
                transcript_text = text.lower().strip()

                # Convert FLAC file to WAV; an existing WAV is reused.
                flac_file = os.path.join(root, utt_id + ".flac")
                wav_file = os.path.join(dst_folder, utt_id + ".wav")
                if not os.path.exists(wav_file):
                    Transformer().build(flac_file, wav_file)

                # check duration; an argument list (no shell) keeps paths
                # with spaces or shell metacharacters intact.
                duration = subprocess.check_output(["soxi", "-D", wav_file])

                entries.append({
                    'audio_filepath': os.path.abspath(wav_file),
                    'duration': float(duration),
                    'text': transcript_text,
                })

    with open(manifest_file, 'w') as fout:
        for m in entries:
            fout.write(json.dumps(m) + '\n')
def sample_triplet(
    transformer: sox.Transformer, utterances: Utterances
) -> ((np.ndarray, str), (np.ndarray, str), (np.ndarray, str)):
    """Draw a random (anchor, positive, negative) triplet of utterances.

    The anchor and the positive are two different files of the same word;
    the negative is a file of a different word. Each element of the result
    is ``(audio array, word)``, where the word is the stem of the file's
    parent directory.

    NOTE(review): the two rejection loops only end once a different value
    is drawn, so they do not terminate when ``utterances.words`` holds a
    single distinct word or the anchor word has a single file.
    """
    words = utterances.words

    anchor = random.choice(words)
    while True:
        negative = random.choice(words)
        if negative != anchor:
            break

    anchor_files = utterances.word_files[anchor]
    anchor_file = random.choice(anchor_files)
    while True:
        positive_file = random.choice(anchor_files)
        if positive_file != anchor_file:
            break

    negative_file = random.choice(utterances.word_files[negative])

    # Load in anchor, positive, negative order.
    triplet = []
    for clip in (anchor_file, positive_file, negative_file):
        word = clip.parent.stem
        audio = transformer.build_array(input_filepath=str(clip))
        triplet.append((audio, word))
    return tuple(triplet)
def compressed_wav_to_full(source_dir, target_dir):
    """Re-encode every wav in ``source_dir`` as signed-integer mono.

    The output keeps the input's file name inside ``target_dir`` (created
    if missing). The rate is only set explicitly when ``hp.callhome_rate``
    is not 8000.
    """
    assert path.exists(source_dir) is True

    if not path.exists(target_dir):
        makedirs(target_dir)

    for compressed_file in glob(path.join(source_dir, "*.wav")):
        converter = Transformer()

        # Always signed-integer, single channel; resample only on request.
        output_format = {'encoding': 'signed-integer', 'channels': 1}
        if hp.callhome_rate != 8000:
            output_format['rate'] = hp.callhome_rate
        converter.set_output_format(**output_format)

        full_wav = path.join(target_dir, path.basename(compressed_file))
        converter.build(compressed_file, full_wav)
def loadFile_thread_exec(data):
    """Load audio files as scaled tensors, randomly cropped to a maximum.

    Each file goes through sox (``norm``, ``silence``, 16 kHz / 16 bit /
    mono), is divided by 2**15 and converted to a float tensor. A waveform
    longer than the module-level ``max_timestep`` is cropped to a random
    window of that length.

    Returns:
        ``(wavs, lengths)``: the list of waveform tensors and the list of
        their lengths as long tensors.
    """
    wavs, lengths = [], []
    # Index-based access is kept so ``data`` only needs len() and integer
    # indexing, exactly as before.
    for idx in range(len(data)):
        source = data[idx]

        sox_chain = Transformer()
        sox_chain.norm()
        sox_chain.silence(silence_threshold=1, min_silence_duration=0.1)
        sox_chain.set_output_format(rate=16000, bits=16, channels=1)
        samples = sox_chain.build_array(input_filepath=str(source))

        wav = torch.tensor(samples / (2**15)).float()
        n_samples = len(wav)
        if n_samples > max_timestep:
            offset = random.randint(0, int(n_samples - max_timestep))
            wav = wav[offset:offset + max_timestep]
            n_samples = max_timestep

        wavs.append(wav)
        lengths.append(torch.tensor(n_samples).long())
    return wavs, lengths
def process(x):
    """Write one clip's transcript to a txt file and convert it to WAV.

    ``x`` is a ``(relative audio path, transcript)`` pair. The stripped,
    upper-cased transcript goes to ``txt_dir/<stem>.txt``; the audio is
    read from ``audio_clips_path``, resampled to ``args.sample_rate`` and
    written to ``wav_dir/<stem>.wav``.
    """
    relative_path, sentence = x
    stem, _ext = os.path.splitext(os.path.basename(relative_path))

    transcript_path = os.path.join(txt_dir, stem + '.txt')
    normalized = sentence.strip().upper()
    with open(transcript_path, 'w') as transcript_file:
        transcript_file.write(normalized)

    source = os.path.join(audio_clips_path, relative_path)
    target = os.path.join(wav_dir, stem + '.wav')
    resampler = Transformer()
    resampler.rate(samplerate=args.sample_rate)
    resampler.build(input_filepath=source, output_filepath=target)
def process(x):
    """Convert one clip to WAV and return ``(wav path, duration, text)``.

    ``x`` is a ``(relative audio path, transcript)`` pair. An empty source
    file is skipped with a warning and ``('', '', '')`` is returned. An
    existing output WAV is reused instead of being converted again.
    """
    relative_path, sentence = x
    stem, _ext = os.path.splitext(os.path.basename(relative_path))
    sentence = sentence.lower().strip()

    source = os.path.join(audio_clips_path, relative_path)
    if os.path.getsize(source) == 0:
        logging.warning(f'Skipping empty audio file {source}')
        return '', '', ''

    target = os.path.join(wav_dir, stem + '.wav')
    already_converted = os.path.exists(target)
    if not already_converted:
        converter = Transformer()
        converter.rate(samplerate=args.sample_rate)
        converter.channels(n_channels=args.n_channels)
        converter.build(input_filepath=source, output_filepath=target)

    return target, sox.file_info.duration(target), sentence
def loadFile(data, max_timestep):
    """Load one audio file as a scaled tensor, truncated to a maximum.

    The file goes through sox (``norm``, 16 kHz / 16 bit / mono), is
    divided by 2**15 and converted to a float tensor. A waveform longer
    than ``max_timestep`` keeps only its first ``max_timestep`` samples.

    Returns:
        ``(wav, length)``: the waveform tensor and its length as a long
        tensor.
    """
    sox_chain = Transformer()
    sox_chain.norm()
    # Silence trimming is not applied here; the call is left disabled:
    # sox_chain.silence(silence_threshold=1, min_silence_duration=0.1)
    sox_chain.set_output_format(rate=16000, bits=16, channels=1)
    samples = sox_chain.build_array(input_filepath=str(data))

    wav = torch.tensor(samples / (2**15)).float()
    n_samples = len(wav)
    if n_samples > max_timestep:
        wav = wav[:max_timestep]
        n_samples = max_timestep

    return wav, torch.tensor(n_samples).long()
def sph_to_wav(source_dir, target_dir):
    """Convert every ``*.sph`` file in ``source_dir`` to wav.

    Outputs are written to ``target_dir`` (created if missing) under the
    same base name. An explicit signed-integer mono output format with
    ``hp.tedlium_rate`` is only requested when that rate is not 16000.
    """
    assert path.exists(source_dir) is True

    if not path.exists(target_dir):
        makedirs(target_dir)

    for sph_file in glob(path.join(source_dir, "*.sph")):
        converter = Transformer()
        wants_resampling = hp.tedlium_rate != 16000
        if wants_resampling:
            converter.set_output_format(encoding='signed-integer', channels=1,
                                        rate=hp.tedlium_rate)

        stem = path.splitext(path.basename(sph_file))[0]
        converter.build(sph_file, path.join(target_dir, stem + ".wav"))
def pitch_shift(aud_seg: AudioSegment, semi: float, **kwargs):
    """Pitch shift audio sample by semi semitones, without changing
    the speed of the audio segment.

    Arguments:
        aud_seg: audio segment to alter
        semi: Number of semitones to pitch audio
        **kwargs: accepted for call compatibility; not used.

    Returns:
        A new AudioSegment loaded from the pitch-shifted audio.
    """
    # Create a sox transformer
    tfm = Transformer()
    tfm.pitch(semi)

    # Sox requires an input file and an output file to perform the pitch
    # shift. The context managers close (and so delete) both temporary
    # files deterministically; they used to linger until garbage collection.
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        # Make sure everything is on disk before sox opens it by name.
        temp_in_file.flush()
        tfm.build(temp_in_file.name, temp_out_file.name)
        # NOTE(review): relies on from_file reading the whole file before
        # returning, since the file is deleted on exit — confirm for the
        # pydub version in use.
        return AudioSegment.from_file(temp_out_file.name, format='wav')
def read(self, audio_metadata):
    """Read an audio file.

    Non-wav inputs are first converted with sox to a temporary wav file
    using this reader's sample rate, channel count and bit depth. The
    temporary file is always removed, including when reading fails.

    :param audio_metadata: metadata info of an audio
    :return: raw audio data as float32 array and duration in seconds.
    """
    temp_path = None
    try:
        # Convert it to a wav file.
        if not audio_metadata.path.endswith('.wav'):
            original_sample_rate = file_info.sample_rate(audio_metadata.path)
            assert self._sample_rate <= original_sample_rate
            transformer = Transformer()
            transformer.convert(samplerate=self._sample_rate,
                                n_channels=self._channels,
                                bitdepth=self._bits_per_sample)
            fd, temp_path = tempfile.mkstemp(suffix='.wav')
            # Only the path is used from here on, so release the
            # descriptor now rather than holding it for the whole read.
            os.close(fd)
            transformer.build(audio_metadata.path, temp_path)

        wav_path = temp_path if temp_path else audio_metadata.path

        # Read the audio file.
        with SoundFile(wav_path) as soundfile:
            # make sure the audio properties are as expected.
            assert soundfile.samplerate == self._sample_rate
            assert soundfile.channels == self._channels
            duration_sec = len(soundfile) / self._sample_rate
            pcm = soundfile.read(dtype='float32')

            # Add 0.5 second silence to the end of files containing keyword
            # as in occasionally the user stopped recording right after
            # uttering the keyword. If the detector needs some time after
            # seeing the keyword to make a decision (e.g. endpointing) this
            # is going to artificially increase the miss rates.
            # NOTE(review): np.zeros defaults to float64, so the padded
            # array is presumably no longer float32 — confirm whether
            # callers depend on the dtype.
            if audio_metadata.is_keyword:
                pcm = np.append(pcm, np.zeros(self._sample_rate // 2))
    finally:
        # Runs on failed assertions and read errors too, which used to
        # leak both the descriptor and the temporary file.
        if temp_path:
            os.remove(temp_path)

    return pcm, duration_sec
def speedup(aud_seg: AudioSegment, speed: float, **kwargs):
    """Speed up (or slow down) audio segment

    Args:
        aud_seg: audio segment to alter
        speed: new playback speed. Should be thought of as a percentage.
            For example, if we want to speed up aud_seg by 20%, we pass
            in 1.2. To slow it down to 80%, pass in 0.8
        **kwargs: accepted for call compatibility; not used.

    Returns:
        A new AudioSegment loaded from the tempo-adjusted audio.
    """
    tfm = Transformer()
    tfm.tempo(speed)

    # Sox requires an input file and an output file to perform the tempo
    # shift. The context managers close (and so delete) both temporary
    # files deterministically; they used to linger until garbage collection.
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        # Make sure everything is on disk before sox opens it by name.
        temp_in_file.flush()
        tfm.build(temp_in_file.name, temp_out_file.name)
        # NOTE(review): relies on from_file reading the whole file before
        # returning, since the file is deleted on exit — confirm for the
        # pydub version in use.
        return AudioSegment.from_file(temp_out_file.name, format='wav')
def convert(self):
    """Convert the mp3 files of this instance to wav files.

    Every ``*.mp3`` found recursively under ``self.mp3_directory`` gets a
    wav of the same base name in the directory returned by
    ``self._pre_convert()``. Existing wav files are not converted again.

    Return:
        wav_directory: the directory holding the wav files.
    """
    wav_directory = self._pre_convert()

    for source_mp3 in self.mp3_directory.glob('**/*.mp3'):
        stem, _ext = os.path.splitext(os.path.basename(source_mp3))
        target_wav = path.join(wav_directory, stem + ".wav")
        needs_conversion = not path.exists(target_wav)

        if needs_conversion:
            _logger.debug("Converting mp3 file %s to wav file %s" %
                          (source_mp3, target_wav))
            sox_chain = Transformer()
            sox_chain.convert(samplerate=SAMPLE_RATE,
                              n_channels=N_CHANNELS,
                              bitdepth=BITDEPTH)
            sox_chain.build(str(source_mp3), str(target_wav))
        else:
            _logger.debug("Already converted mp3 file %s to wav file %s" %
                          (source_mp3, target_wav))

    return wav_directory
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    Audio files are converted from FLAC to WAV with sox.Transformer.
    Transcript files hold one utterance per line, the sequence id and its
    transcript separated by a space:

        1-2-0 transcript_of_1-2-0.flac
        1-2-1 transcript_of_1-2-1.flac
        ...

    Each sequence id has a corresponding .flac file. The transcripts are
    parsed into a csv file with three columns:

        "wav_filename": the absolute path to a wav file.
        "wav_filesize": the size of the corresponding wav file.
        "transcript": the transcript for this audio segment.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
            e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. test-clean.csv
    """
    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)

    flac_root = os.path.join(input_dir, source_name)
    wav_root = os.path.join(input_dir, target_name)
    if not tf.gfile.Exists(wav_root):
        tf.gfile.MakeDirs(wav_root)

    converter = Transformer()
    rows = []
    # Single pass: convert each FLAC and collect its csv row.
    for folder, _, names in tf.gfile.Walk(flac_root):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            with codecs.open(os.path.join(folder, trans_name), "r", "utf-8") as reader:
                for entry in reader:
                    seqid, raw_text = entry.split(" ", 1)

                    # encode() yields bytes; decode() turns the ASCII-only
                    # result back into a str.
                    decomposed = unicodedata.normalize("NFKD", raw_text)
                    ascii_text = decomposed.encode("ascii", "ignore").decode(
                        "ascii", "ignore")
                    transcript = ascii_text.strip().lower()

                    # Convert FLAC to WAV unless it is already there.
                    flac_path = os.path.join(folder, seqid + ".flac")
                    wav_path = os.path.join(wav_root, seqid + ".wav")
                    if not tf.gfile.Exists(wav_path):
                        converter.build(flac_path, wav_path)

                    rows.append((os.path.abspath(wav_path),
                                 os.path.getsize(wav_path),
                                 transcript))

    # Columns: "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    table = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_filesize", "transcript"])
    table.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Convert SPH to WAV and split the transcript.

    For every usable transcript line, the matching SPH file is decoded
    (one channel per speaker) to a temporary WAV with the bundled
    ``sph2pipe`` tool, the utterance's time span is cut out with sox into
    ``directory/<subset>/wav``, and a row is added to
    ``output_dir/<subset>.csv`` (tab-separated).

    Args:
        directory: the directory which holds the input dataset.
        subset: the name of the specified dataset. supports train
            (switchboard+fisher), switchboard, fisher, hub500 and rt03s.
        output_dir: the directory to place the newly generated csv files.

    Raises:
        ValueError: if ``subset`` is not one of the supported names, or if
            an indexed SPH file no longer exists.
    """
    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    # sph2pipe is resolved relative to this source file.
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    # Corpus directories, by catalog id, under ``directory``.
    swd_audio_trans_dir = [os.path.join(directory, "LDC97S62")]
    fisher_audio_dirs = [
        os.path.join(directory, "LDC2004S13"),
        os.path.join(directory, "LDC2005S13"),
    ]
    fisher_trans_dirs = [
        os.path.join(directory, "LDC2004T19"),
        os.path.join(directory, "LDC2005T19"),
    ]
    hub_audio_dir = [os.path.join(directory, "LDC2002S09")]
    hub_trans_dir = [os.path.join(directory, "LDC2002T43")]
    rts_audio_trans_dir = [os.path.join(directory, "LDC2007S10")]

    if subset == "train":
        # Combination of switchboard corpus and fisher corpus.
        audio_dir = swd_audio_trans_dir + fisher_audio_dirs
        trans_dir = swd_audio_trans_dir + fisher_trans_dirs
    elif subset == "switchboard":
        audio_dir = swd_audio_trans_dir
        trans_dir = swd_audio_trans_dir
    elif subset == "fisher":
        audio_dir = fisher_audio_dirs
        trans_dir = fisher_trans_dirs
    elif subset == "hub500":
        audio_dir = hub_audio_dir
        trans_dir = hub_trans_dir
    elif subset == "rt03s":
        audio_dir = rts_audio_trans_dir
        trans_dir = rts_audio_trans_dir
    else:
        # NOTE(review): two positional arguments, so the exception carries
        # a (subset, text) tuple rather than one formatted message.
        raise ValueError(subset, " is not in switchboard_fisher")

    subset_dir = os.path.join(directory, subset)
    if not gfile.Exists(subset_dir):
        gfile.MakeDirs(subset_dir)
    output_wav_dir = os.path.join(directory, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)
    tmp_dir = os.path.join(directory, "tmp")
    if not gfile.Exists(tmp_dir):
        gfile.MakeDirs(tmp_dir)

    # Build SPH dict: file name without extension -> full path. A key seen
    # in more than one directory keeps the last path found.
    files = []
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            for filename in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    # Full-length per-speaker WAVs live in a temporary directory that is
    # discarded when the block exits; only the cut utterances are kept.
    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            # Pick the transcript file pattern and line parser per corpus.
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts

            for root, _, filenames in gfile.Walk(sub_trans_dir):
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    # Skip files whose directory path contains any of these
                    # substrings. ``1 in [...]`` works because True == 1.
                    if 1 in [
                        ele in root
                        for ele in [
                            "doc",
                            "DOC",
                            "mandarin",
                            "arabic",
                            "concatenated",
                            "bnews",
                        ]
                    ]:
                        continue
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for line in fin:
                            line = line.strip()
                            (
                                sph_key,
                                speaker,
                                time_start,
                                time_end,
                                norm_trans,
                            ) = split_and_norm_func(line, filename)

                            # Too short, skip the wave file
                            if time_end - time_start <= 0.1:
                                continue
                            if norm_trans == "":
                                continue
                            # Speaker "A" is read from channel 1, any other
                            # speaker from channel 2.
                            if speaker == "A":
                                channel = 1
                            else:
                                channel = 2

                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir, sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "the sph file {} is not exists".format(
                                        sph_file))

                            # Utterance file name: key, speaker, and start /
                            # end times scaled by 100, zero-padded to six
                            # digits.
                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")

                            if not gfile.Exists(sub_wav_file):
                                # Decode the speaker's channel once per
                                # conversation, then reuse it for every
                                # utterance of that speaker.
                                if not gfile.Exists(wav_file):
                                    # NOTE(review): the command is built by
                                    # string concatenation and run through
                                    # the shell; paths with spaces would
                                    # break it, and the exit status is
                                    # ignored.
                                    sph2pipe_cmd = (sph2pip +
                                                    " -f wav -c {} -p ".format(
                                                        str(channel)) +
                                                    sph_file + " " + wav_file)
                                    os.system(sph2pipe_cmd)
                                tfm = Transformer()
                                tfm.trim(time_start, time_end)
                                tfm.build(wav_file, sub_wav_file)

                            # wav_filesize = os.path.getsize(sub_wav_file)
                            wav_length = get_wave_file_length(sub_wav_file)
                            speaker_name = sph_key + "-" + speaker
                            files.append(
                                (os.path.abspath(sub_wav_file), wav_length,
                                 norm_trans, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def _maybe_convert_wav(mp3_filename, wav_filename): if not path.exists(wav_filename): transformer = Transformer() transformer.convert(samplerate=SAMPLE_RATE) transformer.build(mp3_filename, wav_filename)
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    For audio files, convert the format from FLAC to WAV using the
    sox.Transformer library.
    For transcripts, each line contains the sequence id and the corresponding
    transcript (separated by space):
    Input data format: seq-id transcript_of_seq-id
    For example:
      1-2-0 transcript_of_1-2-0.flac
      1-2-1 transcript_of_1-2-1.flac
      ...

    Each sequence id has a corresponding .flac file.
    Parse the transcript files and generate a new tab-separated csv file which
    has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segment.

    Blank (whitespace-only) lines in a transcript file are skipped. A WAV file
    that already exists in the target directory is not converted again.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
            e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. test-clean.csv

    Raises:
        ValueError: if a non-blank transcript line contains no space
            separating the sequence id from the transcript.
    """
    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not tf.gfile.Exists(target_dir):
        tf.gfile.MakeDirs(target_dir)

    files = []
    tfm = Transformer()
    # Convert all FLAC files into WAV format. At the same time, collect the
    # rows of the csv file.
    for root, _, filenames in tf.gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    # A blank line (e.g. a trailing empty line) carries no
                    # sequence id; splitting it would either fail to unpack
                    # or produce a bogus ".flac" path, so skip it.
                    if not line.strip():
                        continue
                    seqid, transcript = line.split(" ", 1)
                    # Strip non-ASCII characters. encode() returns bytes, so
                    # decode() is needed to get a str back.
                    transcript = (
                        unicodedata.normalize("NFKD", transcript)
                        .encode("ascii", "ignore")
                        .decode("ascii", "ignore")
                        .strip()
                        .lower()
                    )

                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not tf.gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info(
        "Successfully generated csv file {}".format(csv_file_path))