Ejemplo n.º 1
0
    def __init__(
        self,
        root: str,
        split: str,
        source_language: str,
        target_language: Optional[str] = None,
        version: int = 2,
    ) -> None:
        """Collect the readable clips of one CoVoST split found under ``root``.

        The Common Voice ``validated.tsv`` is inner-joined with the CoVoST
        TSV on the ``path`` column, filtered down to ``split`` and stored,
        ordered by row index, in ``self.data`` as a list of row dicts.

        Args:
            root: Directory holding ``validated.tsv`` and the ``clips``
                folder. The CoVoST archive is downloaded here when it is
                not already present, and its TSV is read from here.
            split: Split to load; must be in ``self.SPLITS``. For
                ``"train"`` the rows tagged ``"train_covost"`` are kept
                as well.
            source_language: Source language code; must not be ``None``.
            target_language: Target language code. ``None`` sets
                ``self.no_translation`` to ``True``.
            version: CoVoST version; must be in ``self.VERSIONS``.

        Raises:
            AssertionError: If an argument fails the checks above, if the
                language pair does not involve ``"en"`` or is not listed
                for ``version``, or if ``validated.tsv`` is missing.
        """
        assert version in self.VERSIONS
        assert split in self.SPLITS
        assert source_language is not None

        translating = target_language is not None
        self.no_translation = not translating
        if translating:
            assert "en" in {source_language, target_language}
            if source_language == "en":
                assert target_language in self.EN_XX_LANGUAGES[version]
            else:
                assert source_language in self.XX_EN_LANGUAGES[version]
        else:
            # Hack: pick some target language anyway so that the "split"
            # column can be read from a CoVoST TSV. The CoVoST train split
            # is used for ASR; it is an extension of the Common Voice train
            # split.
            target_language = "en" if source_language != "en" else "de"

        self.root: Path = Path(root)

        validated_tsv = self.root / "validated.tsv"
        assert validated_tsv.is_file()

        covost_url = self.COVOST_URL_TEMPLATE.format(
            src_lang=source_language, tgt_lang=target_language
        )
        archive_name = Path(covost_url).name
        archive_path = self.root / archive_name
        if not archive_path.is_file():
            download_url(covost_url, self.root.as_posix(), hash_value=None)
        extract_archive(archive_path.as_posix())

        common_voice = load_df_from_tsv(validated_tsv)
        # The TSV to load is named like the archive minus its ".tar.gz" suffix.
        covost = load_df_from_tsv(self.root / archive_name.replace(".tar.gz", ""))
        merged = pd.merge(
            left=common_voice[["path", "sentence", "client_id"]],
            right=covost[["path", "translation", "split"]],
            how="inner",
            on="path",
        )

        keep = merged["split"] == split
        if split == "train":
            keep = keep | (merged["split"] == f"{split}_covost")
        merged = merged[keep]

        rows_by_index = merged.to_dict(orient="index")
        ordered_rows = [rows_by_index[idx] for idx in sorted(rows_by_index)]

        self.data = []
        for entry in ordered_rows:
            clip_path = self.root / "clips" / entry["path"]
            try:
                torchaudio.info(clip_path.as_posix())
            except RuntimeError:
                # Skip clips whose metadata cannot be read.
                continue
            self.data.append(entry)
Ejemplo n.º 2
0
def process_joint(args):
    """Build the joint vocabulary, config YAML and manifest links for MUSTC.

    The ``tgt_text`` column of every ``en-{lang}/train_{task}.tsv`` manifest
    is concatenated into a temporary text file that is handed to
    ``gen_vocab``. A config YAML is then generated in the data root, and each
    per-language manifest is symlinked into the data root as
    ``{split}_{lang}_{task}.tsv``.

    Args:
        args: Parsed options. The attributes read here are ``data_root``,
            ``vocab_type``, ``vocab_size`` and ``task``.

    Raises:
        AssertionError: If an ``en-{lang}`` directory is missing under
            ``args.data_root`` for any language in ``MUSTC.LANGUAGES``.
    """
    assert all(
        op.isdir(op.join(args.data_root, f"en-{lang}")) for lang in MUSTC.LANGUAGES
    ), "do not have downloaded data available for all 8 languages"
    cur_root = args.data_root
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in MUSTC.LANGUAGES:
            tsv_path = op.join(cur_root, f"en-{lang}", f"train_{args.task}.tsv")
            df = load_df_from_tsv(tsv_path)
            f.writelines(t + "\n" for t in df["tgt_text"])
        # gen_vocab only receives the file *name*, so push everything still
        # sitting in the write buffer to disk before it is read back.
        f.flush()
        gen_vocab(
            f.name,
            op.join(cur_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="lb",
        prepend_tgt_lang_tag=(args.task == "st"),
    )
    # Make symbolic links to manifests
    for lang in MUSTC.LANGUAGES:
        for split in MUSTC.SPLITS:
            src_path = op.join(cur_root, f"en-{lang}", f"{split}_{args.task}.tsv")
            desc_path = op.join(cur_root, f"{split}_{lang}_{args.task}.tsv")
            # Existing links are left alone so the function can be re-run.
            if not op.islink(desc_path):
                os.symlink(src_path, desc_path)
Ejemplo n.º 3
0
def get_top_n(root: Path,
              n_speakers: int = 10,
              min_n_tokens: int = 5) -> pd.DataFrame:
    """Return the rows of the speakers with the most audio frames.

    Rows of ``root / "validated.tsv"`` whose ``sentence`` has fewer than
    ``min_n_tokens`` whitespace-separated tokens are dropped. For the
    remaining rows the frame count of ``root / "clips" / path`` is read with
    ``torchaudio.info``, frame counts are summed per ``client_id``, and only
    the rows of the ``n_speakers`` clients with the largest sums are kept.

    Args:
        root: Directory holding ``validated.tsv`` and the ``clips`` folder.
        n_speakers: Number of top clients to keep.
        min_n_tokens: Minimum token count a sentence needs to be considered.

    Returns:
        The filtered frame, with the added columns ``n_tokens``,
        ``n_frames`` and ``id`` (the stem of ``path``).
    """
    df = load_df_from_tsv(root / "validated.tsv")
    df["n_tokens"] = [len(s.split()) for s in df["sentence"]]
    # Take an explicit copy: new columns are assigned below, and assigning
    # onto a filtered frame triggers pandas' SettingWithCopyWarning.
    df = df[df["n_tokens"] >= min_n_tokens].copy()
    df["n_frames"] = [
        torchaudio.info((root / "clips" / p).as_posix()).num_frames
        for p in tqdm(df["path"])
    ]
    df["id"] = [Path(p).stem for p in df["path"]]

    # Per-client totals are sums of frame counts (not milliseconds).
    frames_per_client = df.groupby("client_id")["n_frames"].agg(["sum"])
    frames_per_client = frames_per_client.sort_values("sum", ascending=False)

    top_client_ids = set(frames_per_client.head(n_speakers).index.tolist())
    return df[df["client_id"].isin(top_client_ids)]
Ejemplo n.º 4
0
def process_joint(args):
    """Build the joint vocabulary, config YAML and manifest links for mTEDx.

    The ``tgt_text`` column of every ``{lang}/train_{task}.tsv`` manifest is
    concatenated into a temporary text file that is handed to ``gen_vocab``.
    A config YAML is then generated in the data root, and each per-pair
    manifest is symlinked into the data root as
    ``{split}_{lang}_{task}.tsv``.

    Args:
        args: Parsed options. The attributes read here are ``data_root``,
            ``vocab_type``, ``vocab_size``, ``task`` and ``joint``.

    Raises:
        AssertionError: If a ``{lang}`` directory is missing under
            ``args.data_root`` for any entry of ``mTEDx.LANGPAIRS``.
    """
    cur_root = Path(args.data_root)
    assert all((cur_root / f"{lang}").is_dir() for lang in mTEDx.LANGPAIRS), \
        "do not have downloaded data available for all languages"
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in mTEDx.LANGPAIRS:
            tsv_path = cur_root / f"{lang}" / f"train_{args.task}.tsv"
            df = load_df_from_tsv(tsv_path)
            f.writelines(t + "\n" for t in df["tgt_text"])
        # gen_vocab only receives the file *name*, so push everything still
        # sitting in the write buffer to disk before it is read back.
        f.flush()
        special_symbols = None
        if args.joint:
            # Add tgt_lang tags to dict. The set removes duplicate targets;
            # sorting makes the tag order identical from run to run (plain
            # set iteration order of strings varies with hash randomisation).
            special_symbols = sorted(
                {f'<lang:{lang.split("-")[1]}>' for lang in mTEDx.LANGPAIRS}
            )
        gen_vocab(
            Path(f.name),
            cur_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
            special_symbols=special_symbols
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename=spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="ld",
        prepend_tgt_lang_tag=(args.joint),
    )
    # Make symbolic links to manifests
    for lang in mTEDx.LANGPAIRS:
        for split in mTEDx.SPLITS:
            src_path = cur_root / f"{lang}" / f"{split}_{args.task}.tsv"
            desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv"
            # Existing links are left alone so the function can be re-run.
            if not desc_path.is_symlink():
                os.symlink(src_path, desc_path)