Example 1
def main():
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(recordings=combine(
                    part['recordings'] for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=Fbank(),
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
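Note: `get_executor` is a shared helper that none of these examples define. The original recipes typically detect a cluster scheduler and may yield a distributed executor, or None; a plausible local stand-in (an assumption, not the recipes' actual implementation) is:

import os
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager


@contextmanager
def get_executor():
    # Local stand-in: yield a process pool. The callers above also handle
    # ex being None, in which case lhotse parallelizes feature extraction
    # internally with its own process pool.
    with ProcessPoolExecutor(os.cpu_count()) as pool:
        yield pool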
Example 2
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))

    output_dir = Path('exp/data')
    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(recordings=combine(
                    part['recordings'] for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
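`locate_corpus` is another helper used throughout these examples but never defined. A minimal sketch of what it could look like follows; the real helper may differ, and note the call sites vary (most pass several Path arguments, while Example 9 passes a single tuple plus a `msg` keyword), so this sketch accepts both forms.

import sys
from pathlib import Path


def locate_corpus(*candidates, msg=None):
    # Accept either several Path arguments or a single iterable of Paths.
    if len(candidates) == 1 and not isinstance(candidates[0], (str, Path)):
        candidates = tuple(candidates[0])
    # Return the first candidate directory that exists on this machine.
    for d in candidates:
        if Path(d).is_dir():
            return Path(d)
    print(msg or f"Could not find the corpus in any of: {candidates}")
    sys.exit(1)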
Example 3
def main():
    dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        )
    ]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                executor=ex,
                storage=LilcomFilesWriter(f'{output_dir}/feats_{partition}'))
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
Example 4
def main():
    args = get_parser().parse_args()
    dataset_parts = ('devtest', 'test', 'train')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/mnt/corpora/LDC2006S37/data'),
    )

    output_dir = Path('exp/data')
    print('Heroico manifest preparation:')
    transcripts_dir = corpus_dir / 'transcripts'
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in heroico_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            heroico_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
Example 5
def main():
    args = get_parser().parse_args()

    corpus_dir = locate_corpus(
        Path("/export/corpora5/AMI/amicorpus"),
    )
    annotations_dir = Path("/export/c07/draj")

    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")

    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in ami_manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests["recordings"],
                supervisions=manifests["supervisions"],
            ).cut_into_windows(duration=5)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else min(80, len(cut_set)),
                executor=ex,
                storage_type=LilcomHdf5Writer,
            ).pad(duration=5.0)
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
Example 6
def main():
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                executor=ex,
                storage=LilcomFilesWriter(f'{output_dir}/feats_{partition}'))
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
Example 7
def export_to_kaldi(recordings: RecordingSet, supervisions: SupervisionSet,
                    output_dir: Pathlike):
    """
    Export a pair of ``RecordingSet`` and ``SupervisionSet`` to a Kaldi data directory.
    Currently, it only supports single-channel recordings that have a single ``AudioSource``.

    The ``RecordingSet`` and ``SupervisionSet`` must be compatible, i.e. it must be possible to create a
    ``CutSet`` out of them.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    assert all(len(r.sources) == 1 for r in recordings), "Kaldi export of Recordings with multiple audio sources " \
                                                         "is currently not supported."
    assert all(r.num_channels == 1 for r in recordings), "Kaldi export of multi-channel Recordings is currently " \
                                                         "not supported."

    # Create a simple CutSet that ties together the recording <-> supervision information.
    cuts = CutSet.from_manifests(
        recordings=recordings,
        supervisions=supervisions).trim_to_supervisions()

    # wav.scp
    save_kaldi_text_mapping(data={
        recording.id:
        f'{source.source} |' if source.type == 'command' else source.source
        for recording in recordings
        for source in recording.sources
    },
                            path=output_dir / 'wav.scp')
    # segments
    save_kaldi_text_mapping(data={
        cut.supervisions[0].id: f'{cut.recording_id} {cut.start} {cut.end}'
        for cut in cuts
    },
                            path=output_dir / 'segments')
    # text
    save_kaldi_text_mapping(data={
        cut.supervisions[0].id: cut.supervisions[0].text
        for cut in cuts
    },
                            path=output_dir / 'text')
    # utt2spk
    save_kaldi_text_mapping(data={
        cut.supervisions[0].id: cut.supervisions[0].speaker
        for cut in cuts
    },
                            path=output_dir / 'utt2spk')
    # utt2dur
    save_kaldi_text_mapping(
        data={cut.supervisions[0].id: cut.duration
              for cut in cuts},
        path=output_dir / 'utt2dur')
    # reco2dur
    save_kaldi_text_mapping(
        data={recording.id: recording.duration
              for recording in recordings},
        path=output_dir / 'reco2dur')
    # utt2lang [optional]
    if all(s.language is not None for s in supervisions):
        save_kaldi_text_mapping(data={
            cut.supervisions[0].id: cut.supervisions[0].language
            for cut in cuts
        },
                                path=output_dir / 'utt2lang')
    # utt2gender [optional]
    if all(s.gender is not None for s in supervisions):
        save_kaldi_text_mapping(data={
            cut.supervisions[0].id: cut.supervisions[0].gender
            for cut in cuts
        },
                                path=output_dir / 'utt2gender')
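A short usage sketch for `export_to_kaldi` above. The manifest file names are assumptions following lhotse's conventions, not paths taken from these examples:

from lhotse import RecordingSet, SupervisionSet

# Hypothetical manifest locations; point these at whatever prepare_* wrote.
recordings = RecordingSet.from_json('exp/data/recordings_train.json.gz')
supervisions = SupervisionSet.from_json('exp/data/supervisions_train.json.gz')
export_to_kaldi(recordings, supervisions, output_dir='data/train_kaldi')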
Example 8
def memmap_raw_audio(wav_scp,
                     f_memmapped,
                     utt_list,
                     dtype=np.float32,
                     sampling_rate=16000,
                     do_normalize=True):
    '''
        Maps the wav.scp file from kaldi to a memory-mapped numpy object.
        This allows for fast I/O when creating windowed minibatches from
        slices of training data.

        input args: wav_scp, f_memmapped
        output:
            utt_lens = {'utt_n': # utt_n frames, ...}
            offsets = {'utt_n': utt_n offset in memory mapped numpy file}
            data_shape = (#frames, feature_dimension)
    '''
    import os
    dataset = os.path.dirname(wav_scp)

    print(dataset)
    if not os.path.exists(os.path.join(dataset, 'reco2dur')):
        p = subprocess.Popen(['./utils/data/get_reco2dur.sh', dataset],
                             stdout=subprocess.PIPE)
        out = p.communicate()

    # Import lhotse and install if not available
    try:
        from lhotse import kaldi, CutSet
    except ImportError:
        from pip._internal import main as pip
        pip(['install', 'lhotse'])
        from lhotse import kaldi, CutSet
    from lhotse.utils import compute_num_samples

    data = kaldi.load_kaldi_data_dir(dataset, sampling_rate)
    cuts = CutSet.from_manifests(data[0], data[1])
    dim = 1

    utt_lens = {}
    for cut in cuts:
        sr = cut.recording.sampling_rate
        for sup in cut.supervisions:
            if sup.id not in utt_list:
                continue
            utt_lens[sup.id.encode()] = compute_num_samples(sup.duration, sr)
    data_shape = (sum(utt_lens.values()), dim)

    f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape)
    offsets = {}
    offset = 0
    for cut in cuts:
        x_ = cut.recording.load_audio().T
        # Mean and variance normalize
        if do_normalize:
            x = (x_ - x_.mean()) / x_.std()
        else:
            x = x_
        sr = cut.recording.sampling_rate
        for i, supervision in enumerate(cut.supervisions):
            k = supervision.id
            print('Utterance ', i, ' : ', k, ' : ', sr)
            start, dur = supervision.start, supervision.duration
            if k not in utt_list:
                continue
            start_sample = compute_num_samples(start, sr)
            end_sample = start_sample + utt_lens[k.encode()]
            m = x[start_sample:end_sample]
            offsets[k.encode()] = offset
            utt_lens[k.encode()] = m.shape[0]
            new_offset = offset + utt_lens[k.encode()]
            f[offset:new_offset, :] = m
            offset = new_offset
        print()
    del f
    return utt_lens, offsets, data_shape
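A hedged usage sketch for `memmap_raw_audio`: the wav.scp path, memmap file, and utterance IDs below are placeholders. It also shows how the returned offsets and lengths slice utterances back out of the memory-mapped file.

import numpy as np

utt_list = ['spk1-utt001', 'spk1-utt002']  # hypothetical utterance IDs
utt_lens, offsets, data_shape = memmap_raw_audio(
    'data/train/wav.scp', 'exp/train_audio.mmap', utt_list)

# Re-open the file read-only and slice one utterance back out.
data = np.memmap('exp/train_audio.mmap', mode='r',
                 dtype=np.float32, shape=data_shape)
k = utt_list[0].encode()  # the returned dicts are keyed by encoded IDs
audio = data[offsets[k]:offsets[k] + utt_lens[k]]  # shape: (num_samples, 1)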
Example 9
def main():
    corpus_dir = locate_corpus(
        (Path('/mnt/cfs2/asr/database/AM/aishell'),
         Path('/root/fangjun/data/aishell'),
         Path(
             '/home/storage04/zhuangweiji/data/open-source-data/SLR33-aishell/data'
         )),
        msg='Please specify the directory to the AIShell dataset')

    musan_dir = locate_corpus(
        (Path('/export/corpora5/JHU/musan'),
         Path('/export/common/data/corpora/MUSAN/musan'),
         Path('/root/fangjun/data/musan')),
        msg='Please specify the directory to the MUSAN dataset')

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(
        corpus_dir=corpus_dir,
        output_dir=output_dir
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_dir,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise')
    )

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(lambda c: c.duration > 5).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            musan_cuts.to_json(musan_cuts_path)
Example 10
def main():
    args = get_parser().parse_args()
    if args.full_libri:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100', 'train-clean-360',
                         'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        ), Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'))
    musan_dir = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(recordings=combine(
                    part['recordings'] for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
Example 11
if use_data_augmentation:
    # With augmentation enabled, stick to a single job.
    num_jobs = 1
else:
    num_jobs = os.cpu_count()
    # Keep torch single-threaded per process to avoid oversubscribing CPUs.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)

for partition, manifests in librispeech_manifests.items():
    print(partition)
    with LilcomFilesWriter(f'{output_dir}/feats_{partition}') as storage, \
            ProcessPoolExecutor(num_jobs) as ex:
        cut_set = CutSet.from_manifests(
            recordings=manifests['recordings'],
            supervisions=manifests['supervisions']).compute_and_store_features(
                extractor=Fbank(),
                storage=storage,
                augmenter=augmenter if 'train' in partition else None,
                executor=ex)
    librispeech_manifests[partition]['cuts'] = cut_set
    cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

cuts_train = SpeechRecognitionDataset(
    librispeech_manifests['train-clean-100']['cuts'])
cuts_test = SpeechRecognitionDataset(
    librispeech_manifests['test-clean']['cuts'])

sample = cuts_train[0]
print('Transcript:', sample['text'])
print('Supervisions mask:', sample['supervisions_mask'])
print('Feature matrix:', sample['features'])
Example 12
def main():
    if full_libri:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100',
                         'train-clean-360', 'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        )
    ]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(recordings=combine(
                    part['recordings'] for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=Fbank(),
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
Example 13
def export_to_kaldi(
    recordings: RecordingSet,
    supervisions: SupervisionSet,
    output_dir: Pathlike,
    map_underscores_to: Optional[str] = None,
):
    """
    Export a pair of ``RecordingSet`` and ``SupervisionSet`` to a Kaldi data
    directory. Recordings with multiple channels are supported, but each
    recording must still have a single ``AudioSource``.

    The ``RecordingSet`` and ``SupervisionSet`` must be compatible, i.e. it must
    be possible to create a ``CutSet`` out of them.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    :param map_underscores_to: optional string with which we will replace all
        underscores. This helps avoid issues with Kaldi data dir sorting.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    assert all(len(r.sources) == 1 for r in recordings), (
        "Kaldi export of Recordings with multiple audio sources "
        "is currently not supported.")

    if map_underscores_to is not None:
        supervisions = supervisions.map(lambda s: fastcopy(
            s,
            id=s.id.replace("_", map_underscores_to),
            speaker=s.speaker.replace("_", map_underscores_to),
        ))

    # Create a simple CutSet that ties together
    # the recording <-> supervision information.
    cuts = CutSet.from_manifests(
        recordings=recordings,
        supervisions=supervisions).trim_to_supervisions()

    if all(r.num_channels == 1 for r in recordings):
        # If all the recordings are single-channel, we don't add the channel
        # id suffix; this keeps backward compatibility and lets the same
        # utterances be recovered when re-importing the exported directory.
        # wav.scp
        save_kaldi_text_mapping(
            data={
                recording.id: make_wavscp_channel_string_map(
                    source, sampling_rate=recording.sampling_rate)[0]
                for recording in recordings for source in recording.sources
            },
            path=output_dir / "wav.scp",
        )
        # segments
        save_kaldi_text_mapping(
            data={
                cut.supervisions[0].id:
                f"{cut.recording_id} {cut.start} {cut.end}"
                for cut in cuts
            },
            path=output_dir / "segments",
        )
        # reco2dur
        save_kaldi_text_mapping(
            data={
                recording.id: recording.duration
                for recording in recordings
            },
            path=output_dir / "reco2dur",
        )

    else:
        # wav.scp
        save_kaldi_text_mapping(
            data={
                f"{recording.id}_{channel}": make_wavscp_channel_string_map(
                    source, sampling_rate=recording.sampling_rate)[channel]
                for recording in recordings for source in recording.sources
                for channel in source.channels
            },
            path=output_dir / "wav.scp",
        )
        # segments
        save_kaldi_text_mapping(
            data={
                cut.supervisions[0].id:
                f"{cut.recording_id}_{cut.channel} {cut.start} {cut.end}"
                for cut in cuts
            },
            path=output_dir / "segments",
        )
        # reco2dur
        save_kaldi_text_mapping(
            data={
                f"{recording.id}_{channel}": recording.duration
                for recording in recordings
                for channel in recording.sources[0].channels
            },
            path=output_dir / "reco2dur",
        )
    # text
    save_kaldi_text_mapping(
        data={
            cut.supervisions[0].id: cut.supervisions[0].text
            for cut in cuts
        },
        path=output_dir / "text",
    )
    # utt2spk
    save_kaldi_text_mapping(
        data={
            cut.supervisions[0].id: cut.supervisions[0].speaker
            for cut in cuts
        },
        path=output_dir / "utt2spk",
    )
    # utt2dur
    save_kaldi_text_mapping(
        data={cut.supervisions[0].id: cut.duration
              for cut in cuts},
        path=output_dir / "utt2dur",
    )
    # utt2lang [optional]
    if all(s.language is not None for s in supervisions):
        save_kaldi_text_mapping(
            data={
                cut.supervisions[0].id: cut.supervisions[0].language
                for cut in cuts
            },
            path=output_dir / "utt2lang",
        )
    # utt2gender [optional]
    if all(s.gender is not None for s in supervisions):
        save_kaldi_text_mapping(
            data={
                cut.supervisions[0].id: cut.supervisions[0].gender
                for cut in cuts
            },
            path=output_dir / "utt2gender",
        )
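For instance, to remap underscores as the docstring suggests (reusing `recordings` and `supervisions` from the earlier usage sketch):

# '-' is one possible replacement; per the docstring, remapping '_' helps
# keep utterance and speaker IDs sorted the way Kaldi expects.
export_to_kaldi(recordings, supervisions,
                output_dir='data/train_kaldi',
                map_underscores_to='-')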
Example 14
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]
    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
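A usage sketch for `prepare_gigaspeech`; the corpus path is a placeholder, and `speechcolab` must be installed (and the corpus downloaded) beforehand:

manifests = prepare_gigaspeech(
    corpus_dir='/path/to/gigaspeech',  # placeholder
    output_dir='exp/data',
    dataset_parts=('DEV', 'TEST'),
    num_jobs=4,
)
dev_cuts = manifests['DEV']['cuts']  # a lazily-opened CutSet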
Example 15
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping feature extraction."
                )
            else:
                # Note this step makes the recipe different from LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping cutting into sub-segments."
                )
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note this step makes the recipe different from LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(
                    f"About to split {partition} raw cuts into smaller chunks."
                )
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None
                    if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    #       `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    #       data augmentation and feature computation for long recordings yet.
                    #       Therefore, we sacrifice some storage for the ability to precompute
                    #       features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=
                        f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether it needs to
                # load the raw cuts from disk or not.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(recordings=combine(
                    part["recordings"]
                    for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f"{output_dir}/feats_musan",
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer,
                ))
            musan_cuts.to_file(musan_cuts_path)