def load_mfa_align():
    converter = MfaTextGridConverter(
        use_phones=SETTINGS.training.token_type == 'phone')
    id_align_map = {}
    for tg_path in args.align_folder.glob('**/*.TextGrid'):
        tg = TextGrid(str(tg_path.absolute()))
        audio_id = tg_path.name.split('.', 1)[0]
        id_align_map[audio_id] = converter.convert(tg)
    return id_align_map
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)
            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # Trim the last segment's duration, as it can exceed the
                        # actual duration of the recording. This is safe: if we end
                        # up with a zero/negative duration, validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
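# A minimal usage sketch for the function above (hedged: assumes lhotse's
# manifest serialization API; the corpus path and output file names are
# placeholders): build the SameCloseMic manifests and persist them.
from pathlib import Path

manifests = prepare_same_close_mic(Path("/data/nsc/PART3"))
manifests["recordings"].to_file("nsc_recordings_same_close_mic.jsonl.gz")
manifests["supervisions"].to_file("nsc_supervisions_same_close_mic.jsonl.gz")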
def read_phonemes(textgrid_fname, mfcc_len, phone_inventory):
    tg = TextGrid(textgrid_fname)
    # One frame per 10 ms (100 frames per second); initialize to -1 (unaligned).
    phone_ids = np.zeros(int(tg['phones'][-1].xmax * 100), dtype=np.int64)
    phone_ids[:] = -1
    for interval in tg['phones']:
        phone = interval.text.lower()
        if phone in ['', 'sp', 'spn']:
            phone = 'sil'
        if phone[-1] in string.digits:
            phone = phone[:-1]  # drop the stress marker, e.g. 'aa1' -> 'aa'
        ph_id = phone_inventory.index(phone)
        phone_ids[int(interval.xmin * 100):int(interval.xmax * 100)] = ph_id
    assert (phone_ids >= 0).all(), 'missing aligned phones'
    # MFCCs are 2-3 frames shorter than the alignment due to edge effects.
    phone_ids = phone_ids[1:mfcc_len + 1]
    return phone_ids
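# A minimal usage sketch (hedged): the phone inventory and TextGrid file name
# below are placeholders, and mfcc_len would normally come from the MFCC
# extractor's output length.
phone_inventory = ['sil', 'aa', 'ae', 'b', 'd', 'iy', 'k', 'n', 's', 't']
frame_labels = read_phonemes('utt0001.TextGrid', mfcc_len=500,
                             phone_inventory=phone_inventory)
assert frame_labels.shape == (500,)  # one phone id per 10 ms MFCC frame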
def prepare_separate_phone_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / 'AudioSeparateIVR').rglob('*.wav'),
        desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)
            tg = TextGrid(
                part3_path / f'ScriptsSeparate/{recording_id}.TextGrid',
                coding='utf-16'
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # Trim the last segment's duration, as it can exceed the
                        # actual duration of the recording. This is safe: if we end
                        # up with a zero/negative duration, validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin
                        ),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            supervisions.extend(segments)
            recordings.append(recording)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / 'AudioSameCloseMic').glob('*.wav'),
        desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)
            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-folder', '-i', dest='align_folder', type=Path, required=True)
    args = parser.parse_args()
    converter = MfaTextGridConverter()
    ds_kwargs = dict(sr=SETTINGS.audio.sample_rate, mono=SETTINGS.audio.use_mono, words=[])
    ds_path = SETTINGS.dataset.dataset_path
    train_ds, dev_ds, test_ds = AudioClipDatasetLoader().load_splits(ds_path, **ds_kwargs)
    id_align_map = {}
    for tg_path in args.align_folder.glob('*.TextGrid'):
        tg = TextGrid(str(tg_path.absolute()))
        audio_id = tg_path.name.split('.', 1)[0]
        id_align_map[audio_id] = converter.convert(tg)
    for ds in (train_ds, dev_ds, test_ds):
        with AudioClipDatasetMetadataWriter(ds_path, ds.set_type, 'aligned-', mode='w') as writer:
            for ex in tqdm(ds, total=len(ds)):
                try:
                    transcription = id_align_map[ex.metadata.path.name.split('.', 1)[0]]
                    writer.write(AlignedAudioClipMetadata(path=ex.metadata.path, transcription=transcription))
                except KeyError:
                    # No alignment available for this clip; skip it.
                    pass
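# Hedged usage note: the script above would be invoked along these lines
# (the module name and MFA output folder are placeholders):
#
#   python -m training.align --input-folder /path/to/mfa_textgrids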
txtgrd_lst = [
    line.rstrip() for line in open("src/PhonationModeling/data/filelists/textgrid.lst")
]
cnt = 1
for wf, tf in zip(wav_lst, txtgrd_lst):
    print(f"Processing {wf}")
    # Read wav
    sample_rate, wav_raw = wavfile.read(os.path.join(data_root, "wavs", wf))
    # Convert from 16-bit int to 32-bit float
    wav_data = (wav_raw / pow(2, 15)).astype("float32")
    # Read textgrid
    txtgrd = TextGrid(os.path.join(data_root, tf))
    tier_phone = txtgrd[f"s{cnt} - phone"]  # tier containing phones
    tier_c = txtgrd["ipp"]  # tier containing creaky voices
    cnt = cnt + 1
    # Get creaky phone segments
    ph_intvls = dict_phone_interval(tier_phone)["AA1"]  # NOTE: phone
    ph_segs = get_phone_segments(tier_c, ph_intvls, from_creaky=False)
    wav_segs = get_wav_segments(wav_data, sample_rate, ph_segs)
    # Save each extracted segment to its own wav file
    for i, w_seg in enumerate(wav_segs):
        wavfile.write(
            # strip the ".wav" extension before appending the segment suffix
            os.path.join(save_dir, os.path.splitext(wf)[0] + f"_phone_AA1_{i:d}.wav"),
            sample_rate,
            w_seg,
        )
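# Hedged sketch of the dict_phone_interval helper used above (its real
# implementation lives elsewhere in PhonationModeling): group a TextGrid
# tier's intervals by label, so the intervals for a phone such as "AA1"
# can be looked up directly.
from collections import defaultdict

def dict_phone_interval(tier):
    """Map each interval label in `tier` to a list of (xmin, xmax) pairs."""
    intervals = defaultdict(list)
    for interval in tier:
        intervals[interval.text].append((interval.xmin, interval.xmax))
    return dict(intervals)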