Example #1
    def load_mfa_align():
        # `args` and `SETTINGS` are provided by the enclosing scope.
        converter = MfaTextGridConverter(
            use_phones=SETTINGS.training.token_type == 'phone')
        id_align_map = {}

        for tg_path in args.align_folder.glob('**/*.TextGrid'):
            tg = TextGrid(str(tg_path.absolute()))
            # Key the map on the audio ID: the filename up to the first dot.
            audio_id = tg_path.name.split('.', 1)[0]
            id_align_map[audio_id] = converter.convert(tg)
        return id_align_map
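A hedged usage sketch for the map built above: entries are keyed by audio ID (the filename up to the first dot). The clip path below is hypothetical, and `load_mfa_align` is assumed to be callable from the enclosing scope.

    id_align_map = load_mfa_align()
    audio_id = Path('clips/utt-0001.wav').name.split('.', 1)[0]  # hypothetical clip
    alignment = id_align_map.get(audio_id)  # None if MFA produced no TextGrid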
Example #2
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            # Catch Exception instead of a bare `except:` so Ctrl-C still aborts.
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
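A minimal sketch of consuming the returned manifests, assuming the lhotse helpers these snippets build on (`validate_recordings_and_supervisions` lives in `lhotse.qa`, and manifest sets provide `.to_file`); the data path and output names are hypothetical.

    from pathlib import Path
    from lhotse.qa import validate_recordings_and_supervisions

    manifests = prepare_same_close_mic(Path("/data/nsc/PART3"))  # hypothetical path
    validate_recordings_and_supervisions(
        manifests["recordings"], manifests["supervisions"]
    )
    manifests["recordings"].to_file("nsc_recordings_same_close_mic.jsonl.gz")
    manifests["supervisions"].to_file("nsc_supervisions_same_close_mic.jsonl.gz")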
Example #3
def read_phonemes(textgrid_fname, mfcc_len, phone_inventory):
    # Requires: import string; import numpy as np; from textgrids import TextGrid
    tg = TextGrid(textgrid_fname)
    # One label per 10 ms frame (100 frames per second); -1 marks unassigned frames.
    phone_ids = np.zeros(int(tg['phones'][-1].xmax * 100), dtype=np.int64)
    phone_ids[:] = -1
    for interval in tg['phones']:
        phone = interval.text.lower()
        if phone in ['', 'sp', 'spn']:  # treat pauses and unknown tokens as silence
            phone = 'sil'
        if phone[-1] in string.digits:  # strip stress digits, e.g. 'aa1' -> 'aa'
            phone = phone[:-1]
        ph_id = phone_inventory.index(phone)
        phone_ids[int(interval.xmin * 100):int(interval.xmax * 100)] = ph_id
    assert (phone_ids >= 0).all(), 'missing aligned phones'

    # The MFCC sequence is 2-3 frames shorter than the alignment due to edge effects.
    phone_ids = phone_ids[1:mfcc_len + 1]
    return phone_ids
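A usage sketch, assuming an inventory of lowercase, stress-stripped phones that includes 'sil'; the TextGrid name and frame count are hypothetical.

    phone_inventory = ['sil', 'aa', 'b', 'k', 't']  # hypothetical; normally the full phone set
    phone_ids = read_phonemes('utt-0001.TextGrid', mfcc_len=998,
                              phone_inventory=phone_inventory)
    # phone_ids holds one int64 label per 10 ms MFCC frame, e.g. shape (998,).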
Example #4
def prepare_separate_phone_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('**/*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8), recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            supervisions.extend(segments)
            recordings.append(recording)
        except Exception:
            # Catch Exception instead of a bare `except:` so Ctrl-C still aborts.
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
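The manifests from Examples #2 and #4 can be combined into one pair; a sketch assuming lhotse manifest sets support `+` concatenation (recent versions do):

    same = prepare_same_close_mic(part3_path)
    separate = prepare_separate_phone_mic(part3_path)
    recordings = same['recordings'] + separate['recordings']
    supervisions = same['supervisions'] + separate['supervisions']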
Example #5
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSameCloseMic').glob('*.wav'),
            desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            # Catch Exception instead of a bare `except:` so Ctrl-C still aborts.
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-folder',
                        '-i',
                        dest='align_folder',
                        type=Path,
                        required=True)
    args = parser.parse_args()

    converter = MfaTextGridConverter()
    ds_kwargs = dict(sr=SETTINGS.audio.sample_rate,
                     mono=SETTINGS.audio.use_mono,
                     words=[])
    ds_path = SETTINGS.dataset.dataset_path
    train_ds, dev_ds, test_ds = AudioClipDatasetLoader().load_splits(
        ds_path, **ds_kwargs)
    id_align_map = {}

    for tg_path in args.align_folder.glob('*.TextGrid'):
        tg = TextGrid(str(tg_path.absolute()))
        audio_id = tg_path.name.split('.', 1)[0]
        id_align_map[audio_id] = converter.convert(tg)

    for ds in (train_ds, dev_ds, test_ds):
        with AudioClipDatasetMetadataWriter(ds_path,
                                            ds.set_type,
                                            'aligned-',
                                            mode='w') as writer:
            for ex in tqdm(ds, total=len(ds)):
                try:
                    # Look up the alignment by audio ID: the filename up to the first dot.
                    audio_id = ex.metadata.path.name.split('.', 1)[0]
                    transcription = id_align_map[audio_id]
                    writer.write(
                        AlignedAudioClipMetadata(path=ex.metadata.path,
                                                 transcription=transcription))
                except KeyError:
                    # MFA produced no alignment for this clip; skip it.
                    pass
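For reference, a sketch of the join key this script relies on: a TextGrid and its source audio match when their filenames agree up to the first dot. Both names are hypothetical.

    tg_path = Path('mfa_output/speaker1-0001.TextGrid')
    wav_name = 'speaker1-0001.wav'
    assert tg_path.name.split('.', 1)[0] == wav_name.split('.', 1)[0]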
Example #7
    txtgrd_lst = [
        line.rstrip()
        for line in open("src/PhonationModeling/data/filelists/textgrid.lst")
    ]

    cnt = 1
    for wf, tf in zip(wav_lst, txtgrd_lst):
        print(f"Processing {wf}")

        # Read wav
        sample_rate, wav_raw = wavfile.read(os.path.join(
            data_root, "wavs", wf))
        # Convert from 16-bit int to 32-bit float
        wav_data = (wav_raw / pow(2, 15)).astype("float32")

        # Read textgrid
        txtgrd = TextGrid(os.path.join(data_root, tf))
        tier_phone = txtgrd[f"s{cnt} - phone"]  # tier containing phones
        tier_c = txtgrd["ipp"]  # tier containing creaky voices
        cnt = cnt + 1

        # Get creaky phone segments
        ph_intvls = dict_phone_interval(tier_phone)["AA1"]  # NOTE: phone
        ph_segs = get_phone_segments(tier_c, ph_intvls, from_creaky=False)
        wav_segs = get_wav_segments(wav_data, sample_rate, ph_segs)

        # Save each extracted segment to its own wav file.
        for i, w_seg in enumerate(wav_segs):
            # splitext avoids the str.rstrip(".wav") pitfall, which strips any
            # trailing '.', 'w', 'a', 'v' characters rather than the suffix.
            stem = os.path.splitext(wf)[0]
            wavfile.write(
                os.path.join(save_dir, stem + f"_phone_AA1_{i:d}.wav"),
                sample_rate,
                w_seg,
            )
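The helpers above are repo-specific; a plausible sketch of get_wav_segments, under the assumption that ph_segs is a list of (start_seconds, end_seconds) tuples, might look like this:

    def get_wav_segments(wav_data, sample_rate, ph_segs):
        # Slice the waveform at each (start, end) boundary, converting seconds to samples.
        segments = []
        for start, end in ph_segs:
            segments.append(wav_data[int(start * sample_rate):int(end * sample_rate)])
        return segments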