Example 1
def process(args):
    root = Path(args.data_root).absolute() / args.src_lang
    if not root.is_dir():
        raise NotADirectoryError(f"{root} does not exist")
    # Extract features
    feature_root = root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f"Fetching split {split}...")
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        print("Extracting log mel filter bank features...")
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   feature_root / f"{utt_id}.npy")
    # Pack features into ZIP
    zip_path = root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    task = f"asr_{args.src_lang}"
    if args.tgt_lang is not None:
        task = f"st_{args.src_lang}_{args.tgt_lang}"
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest["id"].append(utt_id)
            manifest["audio"].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(
                src_utt if args.tgt_lang is None else tgt_utt)
            manifest["speaker"].append(speaker_id)
        is_train_split = split.startswith("train")
        if is_train_split:
            train_text.extend(manifest["tgt_text"])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, root / f"{split}_{task}.tsv")
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(Path(f.name), root / spm_filename_prefix, args.vocab_type,
                  args.vocab_size)
    # Generate config YAML
    gen_config_yaml(
        root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{task}.yaml",
        specaugment_policy="lb",
    )
    # Clean up
    shutil.rmtree(feature_root)
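
A minimal, hypothetical driver for process() above, assuming argparse flag names that mirror the attributes it reads (data_root, src_lang, tgt_lang, vocab_type, vocab_size); the original script's CLI may differ.

# Sketch of a CLI entry point; flag names and defaults are assumptions,
# not the original script's interface.
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", required=True)
    parser.add_argument("--src-lang", required=True)
    parser.add_argument("--tgt-lang", default=None)  # None -> ASR manifests
    parser.add_argument("--vocab-type", default="unigram",
                        choices=["bpe", "unigram", "char"])
    parser.add_argument("--vocab-size", type=int, default=1000)
    process(parser.parse_args())

if __name__ == "__main__":
    main()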
Example 2
def process(args):
    root = op.join(args.data_root, args.src_lang)
    os.makedirs(root, exist_ok=True)
    # Extract features
    feature_root = op.join(root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f'Fetching split {split}...')
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang,
                         download=True)
        print('Extracting log mel filter bank features...')
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   op.join(feature_root, f'{utt_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.data_root,
                                    f'{args.src_lang}/{zip_filename}')
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    task = f'asr_{args.src_lang}'
    if args.tgt_lang is not None:
        task = f'st_{args.src_lang}_{args.tgt_lang}'
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest['id'].append(utt_id)
            manifest['audio'].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(
                src_utt if args.tgt_lang is None else tgt_utt
            )
            manifest['speaker'].append(speaker_id)
        is_train_split = split.startswith('train')
        if is_train_split:
            train_text.extend(manifest['tgt_text'])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, op.join(root, f'{split}_{task}.tsv'))
    # Generate vocab
    vocab_size_str = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size_str}_{task}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(f.name, op.join(root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(root, spm_filename_prefix + '.model',
                    yaml_filename=f'config_{task}.yaml',
                    specaugment_policy='lb')
    # Clean up
    shutil.rmtree(feature_root)
Example 3
def process_joint(args):
    assert all(
        op.isdir(op.join(args.data_root, f"en-{lang}")) for lang in MUSTC.LANGUAGES
    ), "do not have downloaded data available for all 8 languages"
    cur_root = args.data_root
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in MUSTC.LANGUAGES:
            tsv_path = op.join(cur_root, f"en-{lang}", f"train_{args.task}.tsv")
            df = load_df_from_tsv(tsv_path)
            for t in df["tgt_text"]:
                f.write(t + "\n")
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(
            f.name,
            op.join(cur_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="lb",
        prepend_tgt_lang_tag=(args.task == "st"),
    )
    # Make symbolic links to manifests
    for lang in MUSTC.LANGUAGES:
        for split in MUSTC.SPLITS:
            src_path = op.join(cur_root, f"en-{lang}", f"{split}_{args.task}.tsv")
            desc_path = op.join(cur_root, f"{split}_{lang}_{args.task}.tsv")
            if not op.islink(desc_path):
                os.symlink(src_path, desc_path)
Example 4
def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
Example 5
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, "fbank80")
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f"{sample_id}.npy"))
    # Pack features into ZIP
    zip_filename = "fbank80.zip"
    zip_path = op.join(args.output_root, zip_filename)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(utt)
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f"{split}.tsv"))
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(
            f.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + ".model",
                    specaugment_policy="ld")
    # Clean up
    shutil.rmtree(feature_root)
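
The n_frames arithmetic above reflects the standard fbank configuration of 25 ms analysis windows advanced by a 10 ms hop: one frame for the first window, plus one per additional 10 ms of signal. A small sanity check of that formula, assuming those window/hop sizes:

# Frame count for 25 ms windows with a 10 ms hop (assumed from the
# formula used in the manifests above).
def n_fbank_frames(duration_ms: int) -> int:
    return int(1 + (duration_ms - 25) / 10)

assert n_fbank_frames(25) == 1     # exactly one window fits
assert n_fbank_frames(1000) == 98  # 1 s of audio -> 98 frames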
Example 6
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f'Fetching split {split}...')
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print('Extracting log mel filter bank features...')
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f'{sample_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(args.output_root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            manifest['id'].append(sample_id)
            manifest['audio'].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(utt)
            manifest['speaker'].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f'{split}.tsv'))
        if split.startswith('train'):
            train_text.extend(manifest['tgt_text'])
    # Generate vocab
    vocab_size = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(f.name, op.join(args.output_root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + '.model',
                    specaugment_policy='ld')
    # Clean up
    shutil.rmtree(feature_root)
Example 7
def process_joint(args):
    cur_root = Path(args.data_root)
    assert all((cur_root / f"{lang}").is_dir() for lang in mTEDx.LANGPAIRS), \
        "do not have downloaded data available for all languages"
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in mTEDx.LANGPAIRS:
            tsv_path = cur_root / f"{lang}" / f"train_{args.task}.tsv"
            df = load_df_from_tsv(tsv_path)
            for t in df["tgt_text"]:
                f.write(t + "\n")
        special_symbols = None
        if args.joint:
            # Add tgt_lang tags to dict
            special_symbols = list(
                {f'<lang:{lang.split("-")[1]}>' for lang in mTEDx.LANGPAIRS}
            )
        f.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(
            Path(f.name),
            cur_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
            special_symbols=special_symbols
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename=spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="ld",
        prepend_tgt_lang_tag=(args.joint),
    )
    # Make symbolic links to manifests
    for lang in mTEDx.LANGPAIRS:
        for split in mTEDx.SPLITS:
            src_path = cur_root / f"{lang}" / f"{split}_{args.task}.tsv"
            desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv"
            if not desc_path.is_symlink():
                os.symlink(src_path, desc_path)
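
The special_symbols construction above derives one target-language tag per pair. A quick illustration, assuming mTEDx.LANGPAIRS entries take the "src-tgt" form implied by the split("-")[1] indexing (the concrete list lives in the mTEDx class):

# Illustrative values only; the real pairs come from mTEDx.LANGPAIRS.
langpairs = ["es-en", "fr-en", "es-fr"]
special_symbols = list({f'<lang:{lp.split("-")[1]}>' for lp in langpairs})
# -> ['<lang:en>', '<lang:fr>'] (order arbitrary; the set drops duplicates)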
Example 8
def process(args):
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f"en-{lang}")
        if not op.isdir(cur_root):
            print(f"{cur_root} does not exist. Skipped.")
            continue
        # Extract features
        feature_root = op.join(cur_root, "fbank80")
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(args.data_root, lang, split)
            print("Extracting log mel filter bank features...")
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(waveform, sample_rate,
                                       op.join(feature_root, f"{utt_id}.npy"))
        # Pack features into ZIP
        zip_filename = "fbank80.zip"
        zip_path = op.join(cur_root, zip_filename)
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        print("Fetching ZIP manifest...")
        zip_manifest = get_zip_manifest(args.data_root,
                                        f"en-{lang}/{zip_filename}")
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
                manifest["tgt_text"].append(src_utt if args.task ==
                                            "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, op.join(cur_root, f"{split}_{args.task}.tsv"))
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(
                f.name,
                op.join(cur_root, spm_filename_prefix),
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        gen_config_yaml(
            cur_root,
            spm_filename_prefix + ".model",
            yaml_filename=f"config_{args.task}.yaml",
            specaugment_policy="lb",
        )
        # Clean up
        shutil.rmtree(feature_root)
Example 9
def process(args):
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform,
                        sample_rate,
                        to_mono=True,
                        to_sample_rate=tgt_sample_rate)
                    sf.write((audio_root / f"{utt_id}.flac").as_posix(),
                             _waveform.T.numpy(), tgt_sample_rate)
            else:
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                if split == "train" and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")

                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy")
                    if split == "train" and args.cmvn_type == "global":
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)

                if split == "train" and args.cmvn_type == "global":
                    # Estimate and save global CMVN stats
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(src_utt if args.task ==
                                            "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(cur_root,
                            spm_filename=spm_filename_prefix + ".model",
                            yaml_filename=f"config_{args.task}.yaml",
                            specaugment_policy=None,
                            extra={"use_audio_input": True})
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
                cmvn_type=args.cmvn_type,
                gcmvn_path=(cur_root / "gcmvn.npz"
                            if args.cmvn_type == "global" else None),
            )
        # Clean up
        shutil.rmtree(audio_root)
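
cal_gcmvn_stats is not shown in this example; given that its result is saved as per-dimension "mean" and "std" arrays, a plausible reading is global CMVN statistics pooled over the sampled training features. A minimal sketch under that assumption, where each list element is a (frames, n_mels) numpy array:

import numpy as np

# Sketch only; the real cal_gcmvn_stats may differ in details.
def cal_gcmvn_stats_sketch(feature_list):
    frames = np.concatenate(feature_list, axis=0)  # pool frames of all utterances
    return {"mean": frames.mean(axis=0), "std": frames.std(axis=0)}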
Example 10
def process(args):
    '''
    This version assumes that the fbank features were extracted beforehand,
    e.g., for the purpose of data augmentation.

    Arguments:
     args.feature_root: a list containing the paths of extracted fbanks for zipping
     args.info_dict: a dict with *split* as key and file path as *value*
    '''

    os.makedirs(args.output_root, exist_ok=True)

    if args.feature_root is None:
        # Do not create new zip files
        assert args.path_fbankzip_root is not None, \
            'Please provide zipped filter banks'
        print('Load zipfile')
        zip_manifest = get_zip_manifest(args.path_fbankzip_root, 'fbank80.zip')
    else:
        zip_filename = "fbank80.zip"
        zip_path = op.join(args.output_root, zip_filename)
        print("ZIPing features...")
        # fbanks may be saved under different dirs but are gathered
        # into one zip file
        create_zip_list(args.feature_root, zip_path)
        print("Fetching ZIP manifest...")
        zip_manifest = get_zip_manifest(args.output_root, zip_filename)

    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []

    # Take the info file for each split and generate the .tsv files.
    # Each info line holds six whitespace-separated fields:
    #  1) n_frames, 2) utterance id, 3) transcription tokens,
    #  4) alignment start time, 5) alignment end time, and 6) total time
    for split, info_path in args.info_dict.items():
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        with open(info_path, "r") as fin:
            for l in fin:
                line = l.strip().split()

                # Note: the utt_id here has no extra zeros for padding
                n_frames, uid = line[0], line[1]
                tp = line[2:-3]
                tmin, tmax, total_time = line[-3], line[-2], line[-1]
                _uid = uid.split("-")

                if args.augment:
                    spk_id, chapter_id, utt_id, aug_id = (
                        _uid[0], _uid[1], _uid[2], _uid[3]
                    )
                    sample_id = f"{spk_id}-{chapter_id}-{utt_id}-{aug_id}"
                else:
                    spk_id, chapter_id, utt_id = _uid[0], _uid[1], _uid[2]
                    sample_id = f"{spk_id}-{chapter_id}-{utt_id}"

                manifest["id"].append(sample_id)
                manifest["audio"].append(zip_manifest[sample_id])
                manifest["n_frames"].append(n_frames)
                manifest["tgt_text"].append(" ".join(tp))
                manifest["speaker"].append(spk_id)

                if split.startswith('train'):
                    manifest["align_time_min"].append(tmin)
                    manifest["align_time_max"].append(tmax)
                    manifest["total_time"].append(total_time)

            save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                           op.join(args.output_root, f"{split}.tsv"))

            if split.startswith("train"):
                print(f"Add {split} to train_text")
                train_text.extend(manifest["tgt_text"])
                print(f"length of train_text: {len(train_text)}")

    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f_tmp:
        for t in train_text:
            f_tmp.write(t + "\n")
        f_tmp.flush()  # needed to ensure gen_vocab sees dumped text
        gen_vocab(
            f_tmp.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + ".model",
                    specaugment_policy="ld")
Example 11
def process(args):
    root = Path(args.data_root).absolute()
    for lang in mTEDx.LANGPAIRS:
        cur_root = root / f"{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)
        for split in mTEDx.SPLITS:
            print(f"Fetching split {split}...")
            dataset = mTEDx(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    # transpose to (time, channels), the layout sf.write expects
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _waveform.T.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(zip_path)
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in mTEDx.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            ds = mTEDx(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(spk_id)
                manifest["tgt_lang"].append(tgt_lang)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True}
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
            )
        # Clean up
        shutil.rmtree(audio_root)
Example 12
def process(args):
    assert "train" in args.splits
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)

    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    samples = []
    for s in args.splits:
        for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"):
            e["split"] = s
            samples.append(e)
    sample_ids = [s["id"] for s in samples]

    # Get alignment info
    id_to_alignment = None
    if args.textgrid_zip is not None:
        assert args.id_to_units_tsv is None
        id_to_alignment = get_mfa_alignment(
            args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length
        )
    elif args.id_to_units_tsv is not None:
        # assume identical hop length on the unit sequence
        id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids)

    # Extract features and pack features into ZIP
    feature_name = "logmelspec80"
    zip_path = out_root / f"{feature_name}.zip"
    pitch_zip_path = out_root / "pitch.zip"
    energy_zip_path = out_root / "energy.zip"
    gcmvn_npz_path = out_root / "gcmvn_stats.npz"
    if zip_path.exists() and gcmvn_npz_path.exists():
        print(f"{zip_path} and {gcmvn_npz_path} exist.")
    else:
        feature_root = out_root / feature_name
        feature_root.mkdir(exist_ok=True)
        pitch_root = out_root / "pitch"
        energy_root = out_root / "energy"
        if args.add_fastspeech_targets:
            pitch_root.mkdir(exist_ok=True)
            energy_root.mkdir(exist_ok=True)
        print("Extracting Mel spectrogram features...")
        for sample in tqdm(samples):
            waveform, sample_rate = torchaudio.load(sample["audio"])
            waveform, sample_rate = convert_waveform(
                waveform, sample_rate, normalize_volume=args.normalize_volume,
                to_sample_rate=args.sample_rate
            )
            sample_id = sample["id"]
            target_length = None
            if id_to_alignment is not None:
                a = id_to_alignment[sample_id]
                target_length = sum(a.frame_durations)
                if a.start_sec is not None and a.end_sec is not None:
                    start_frame = int(a.start_sec * sample_rate)
                    end_frame = int(a.end_sec * sample_rate)
                    waveform = waveform[:, start_frame: end_frame]
            extract_logmel_spectrogram(
                waveform, sample_rate, feature_root / f"{sample_id}.npy",
                win_length=args.win_length, hop_length=args.hop_length,
                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
                f_max=args.f_max, target_length=target_length
            )
            if args.add_fastspeech_targets:
                assert id_to_alignment is not None
                extract_pitch(
                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
                extract_energy(
                    waveform, energy_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, n_fft=args.n_fft,
                    log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        get_global_cmvn(feature_root, gcmvn_npz_path)
        shutil.rmtree(feature_root)
        if args.add_fastspeech_targets:
            create_zip(pitch_root, pitch_zip_path)
            shutil.rmtree(pitch_root)
            create_zip(energy_root, energy_zip_path)
            shutil.rmtree(energy_root)

    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
    if args.add_fastspeech_targets:
        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    id_to_cer = None
    if args.cer_threshold is not None:
        assert Path(args.cer_tsv_path).is_file()
        id_to_cer = {
            x["id"]: x["uer"] for x in load_tsv_to_dicts(args.cer_tsv_path)
        }
    manifest_by_split = {split: defaultdict(list) for split in args.splits}
    for sample in tqdm(samples):
        sample_id, split = sample["id"], sample["split"]

        if args.snr_threshold is not None and "snr" in sample \
                and sample["snr"] < args.snr_threshold:
            continue
        if args.cer_threshold is not None \
                and id_to_cer[sample_id] > args.cer_threshold:
            continue

        normalized_utt = sample["tgt_text"]
        if id_to_alignment is not None:
            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
        elif args.ipa_vocab:
            normalized_utt = ipa_phonemize(
                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
            )
        manifest_by_split[split]["id"].append(sample_id)
        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append(sample["speaker"])
        manifest_by_split[split]["src_text"].append(sample["src_text"])
        if args.add_fastspeech_targets:
            assert id_to_alignment is not None
            duration = " ".join(
                str(d) for d in id_to_alignment[sample_id].frame_durations
            )
            manifest_by_split[split]["duration"].append(duration)
            manifest_by_split[split]["pitch"].append(pitch_paths[sample_id])
            manifest_by_split[split]["energy"].append(energy_paths[sample_id])
    for split in args.splits:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            out_root / f"{split}.tsv"
        )
    # Generate vocab
    vocab_name, spm_filename = None, None
    if id_to_alignment is not None or args.ipa_vocab:
        vocab = Counter()
        for t in manifest_by_split["train"]["tgt_text"]:
            vocab.update(t.split(" "))
        vocab_name = "vocab.txt"
        with open(out_root / vocab_name, "w") as f:
            for s, c in vocab.most_common():
                f.write(f"{s} {c}\n")
    else:
        spm_filename_prefix = "spm_char"
        spm_filename = f"{spm_filename_prefix}.model"
        with NamedTemporaryFile(mode="w") as f:
            for t in manifest_by_split["train"]["tgt_text"]:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char")
    # Generate speaker list
    speakers = sorted({sample["speaker"] for sample in samples})
    speakers_path = out_root / "speakers.txt"
    with open(speakers_path, "w") as f:
        for speaker in speakers:
            f.write(f"{speaker}\n")
    # Generate config YAML
    win_len_t = args.win_length / args.sample_rate
    hop_len_t = args.hop_length / args.sample_rate
    extra = {
        "sample_rate": args.sample_rate,
        "features": {
            "type": "spectrogram+melscale+log",
            "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft,
            "window_fn": "hann", "win_length": args.win_length,
            "hop_length": args.hop_length, "sample_rate": args.sample_rate,
            "win_len_t": win_len_t, "hop_len_t": hop_len_t,
            "f_min": args.f_min, "f_max": args.f_max,
            "n_stft": args.n_fft // 2 + 1
        }
    }
    if len(speakers) > 1:
        extra["speaker_set_filename"] = "speakers.txt"
    if args.add_fastspeech_targets:
        pitch_min, pitch_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in pitch_paths.values()]
        )
        energy_min, energy_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in energy_paths.values()]
        )
        extra["features"]["pitch_min"] = pitch_min
        extra["features"]["pitch_max"] = pitch_max
        extra["features"]["energy_min"] = energy_min
        extra["features"]["energy_max"] = energy_max
    gen_config_yaml(
        out_root, spm_filename=spm_filename, vocab_name=vocab_name,
        audio_root=out_root.as_posix(), input_channels=None,
        input_feat_per_channel=None, specaugment_policy=None,
        cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra
    )
Example 13
def gen_voc(train_text, spm_filename_prefix):
    # Close the file before gen_vocab reads it, so the text is actually
    # flushed to disk; note the append mode accumulates across runs.
    txt_path = Path("../data/sound").absolute() / "test.txt"
    with open(txt_path, "a") as f:
        for t in train_text:
            f.write(" ".join(t.split()[0:4]) + "\n")
    print(txt_path)
    gen_vocab(txt_path, Path("../data/sound") / spm_filename_prefix)
Example 14
def process(args):
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f'en-{lang}')
        if not op.isdir(cur_root):
            print(f'{cur_root} does not exist. Skipped.')
            continue
        # Extract features
        feature_root = op.join(cur_root, 'fbank80')
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f'Fetching split {split}...')
            dataset = MUSTC(args.data_root, lang, split)
            print('Extracting log mel filter bank features...')
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(waveform, sample_rate,
                                       op.join(feature_root, f'{utt_id}.npy'))
        # Pack features into ZIP
        zip_filename = 'fbank80.zip'
        zip_path = op.join(cur_root, zip_filename)
        print('ZIPing features...')
        create_zip(feature_root, zip_path)
        print('Fetching ZIP manifest...')
        zip_manifest = get_zip_manifest(args.data_root,
                                        f'en-{lang}/{zip_filename}')
        # Generate TSV manifest
        print('Generating manifest...')
        train_text = {task: [] for task in TASKS}
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith('train')
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            text = {task: [] for task in TASKS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest['id'].append(utt_id)
                manifest['audio'].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
                text['asr'].append(src_utt)
                text['st'].append(tgt_utt)
                manifest['speaker'].append(speaker_id)
            if is_train_split:
                for task in TASKS:
                    train_text[task].extend(text[task])
            for task in TASKS:
                manifest['tgt_text'] = text[task]
                df = pd.DataFrame.from_dict(manifest)
                df = filter_manifest_df(df, is_train_split=is_train_split)
                save_df_to_tsv(df, op.join(cur_root, f'{split}_{task}.tsv'))
        # Generate vocab
        for task in TASKS:
            vocab_type, vocab_size = args.asr_vocab_type, args.asr_vocab_size
            if task == 'st':
                vocab_type, vocab_size = args.st_vocab_type, args.st_vocab_size
            vocab_size_str = '' if vocab_type == 'char' else str(vocab_size)
            spm_filename_prefix = f'spm_{vocab_type}{vocab_size_str}_{task}'
            with NamedTemporaryFile(mode='w') as f:
                for t in train_text[task]:
                    f.write(t + '\n')
                f.flush()  # needed to ensure gen_vocab sees dumped text
                gen_vocab(f.name, op.join(cur_root, spm_filename_prefix),
                          vocab_type, vocab_size)
            # Generate config YAML
            gen_config_yaml(cur_root,
                            spm_filename_prefix + '.model',
                            yaml_filename=f'config_{task}.yaml',
                            specaugment_policy='lb')
        # Clean up
        shutil.rmtree(feature_root)
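
Note the design choice in the manifest loop above: one pass over the dataset fills a single manifest dict, and only the tgt_text column is swapped per task before each TSV is written. A stripped-down illustration with made-up values:

# The id/audio/speaker columns are built once; tgt_text is reassigned
# per task (TASKS is assumed to be ['asr', 'st']).
manifest = {'id': ['utt1'], 'speaker': ['spk1'], 'tgt_text': []}
text = {'asr': ['source transcript'], 'st': ['target translation']}
for task in ['asr', 'st']:
    manifest['tgt_text'] = text[task]
    # each task's DataFrame shares every column except tgt_text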