def __init__(self,
             mix_scp: str = "",
             doa_scp: Union[str, List[str]] = "",
             emb_scp: str = "",
             ref_scp: Union[str, List[str]] = "",
             sr: int = 16000) -> None:
    """
    Create the readers for the mixture and the optional reference,
    DoA and embedding inputs.

    Args:
        mix_scp: handed to AudioReader for the mixture (always created)
        doa_scp: one script or a list of scripts; each is read through a
            BaseReader whose values are converted to np.float32
            (presumably direction-of-arrival values -- confirm with caller)
        emb_scp: handed to NumpyReader when non-empty
        ref_scp: one script or a list of scripts, each handed to AudioReader
        sr: forwarded as ``sr`` to every AudioReader

    Attributes set:
        mix, ref, num_ref, doa, num_doa, emb. ``ref``/``doa`` are a list of
        readers for list input, a single reader for a non-empty string and
        None otherwise; ``num_ref``/``num_doa`` hold the matching counts.
    """

    def to_float32(value):
        # value processor shared by all DoA readers
        return np.float32(value)

    self.mix = AudioReader(mix_scp, sr=sr)

    # reference readers: list -> several, non-empty str -> one, else none
    self.ref, self.num_ref = None, 0
    if isinstance(ref_scp, list):
        self.ref = [AudioReader(scp, sr=sr) for scp in ref_scp]
        self.num_ref = len(ref_scp)
    elif ref_scp:
        self.ref = AudioReader(ref_scp, sr=sr)
        self.num_ref = 1

    # DoA readers follow the same three-way rule
    self.doa, self.num_doa = None, 0
    if isinstance(doa_scp, list):
        self.doa = [
            BaseReader(scp, value_processor=to_float32) for scp in doa_scp
        ]
        self.num_doa = len(doa_scp)
    elif doa_scp:
        self.doa = BaseReader(doa_scp, value_processor=to_float32)
        self.num_doa = 1

    # embeddings are optional
    if emb_scp:
        self.emb = NumpyReader(emb_scp)
    else:
        self.emb = None
def _pre_process(self,
                 text: str,
                 utt2dur: str,
                 max_token_num: int = 400,
                 min_token_num: int = 2,
                 skip_utts: str = "",
                 max_dur: float = 3000,
                 min_dur: float = 40) -> List[Dict]:
    """
    Filter the utterances found in ``text`` and return the kept ones
    sorted from the longest to the shortest duration.

    The checks run in this order for every utterance:
        1. token count outside [min_token_num, max_token_num] -> dropped
        2. key missing in ``utt2dur``                          -> dropped
        3. key listed in ``skip_utts``                         -> skipped
           (not counted as dropped)
        4. duration outside [min_dur, max_dur]                 -> dropped
    A warning with the number of dropped utterances is issued if any.

    Args:
        text: handed to BaseReader; iterated as (key, tokens) pairs. When
            ``self.vocab_dict`` is falsy the tokens are converted to int
        utt2dur: handed to BaseReader, values parsed as float
        max_token_num: largest accepted number of tokens
        min_token_num: smallest accepted number of tokens
        skip_utts: optional path of a text file with one key per line
        max_dur: largest accepted duration (same unit as ``utt2dur``;
            presumably frames -- confirm with caller)
        min_dur: smallest accepted duration
    Return:
        list of dicts with the keys "key", "dur", "len" and "tok"
    """
    # A set makes the per-utterance membership test O(1); the previous
    # list made the whole filtering pass O(#utts * #skip_keys).
    skip_keys = set()
    if skip_utts:
        with open(skip_utts, "r") as skip_fd:
            skip_keys = {line.strip() for line in skip_fd}

    dur_reader = BaseReader(utt2dur, value_processor=float)
    if self.vocab_dict:
        # tokens are kept as read
        text_reader = BaseReader(text, num_tokens=-1, restrict=False)
    else:
        # tokens are converted to integers
        text_reader = BaseReader(
            text,
            value_processor=lambda tok: list(map(int, tok)),
            num_tokens=-1,
            restrict=False)

    token_set = []
    drop_utts = 0
    for key, tokens in text_reader:
        num_toks = len(tokens)
        if num_toks > max_token_num or num_toks < min_token_num:
            drop_utts += 1
            continue
        if key not in dur_reader:
            drop_utts += 1
            continue
        # explicitly skipped keys are not reported as dropped
        if key in skip_keys:
            continue
        num_frames = dur_reader[key]
        if num_frames < min_dur or num_frames > max_dur:
            drop_utts += 1
            continue
        token_set.append({
            "key": key,
            "dur": num_frames,
            "len": num_toks,
            "tok": tokens
        })
    # long -> short
    token_set = sorted(token_set, key=lambda d: d["dur"], reverse=True)
    if drop_utts:
        warnings.warn(f"Drop {drop_utts} utterances")
    return token_set
def run(args):
    """
    Pack the audio listed in ``args.wav_scp`` into one archive file.

    For every (key, value) entry the key followed by a space is written to
    ``args.wav_ark``, then the audio bytes. A value ending with "|" is run
    as a shell command and its stdout is archived; any other value is
    opened as a file and copied. When ``args.scp`` is set, a line
    "<key>\\t<wav_ark>:<offset>" is written for each entry, where offset is
    the archive position right after the key.

    Args:
        args: object providing ``wav_scp``, ``wav_ark`` and ``scp``
    Raises:
        RuntimeError: a piped command exited with a non-zero status
    """
    scp_out = open(args.scp, "w") if args.scp else None
    # try/finally: the index file must be closed even when a command fails
    # or an I/O error interrupts the archiving.
    try:
        with open(args.wav_ark, "wb") as wav_ark:
            reader = BaseReader(args.wav_scp, num_tokens=2, restrict=True)
            done = 0
            for key, value in reader:
                wav_ark.write(str.encode(key + " "))
                # position where the audio bytes of this entry start
                offset = wav_ark.tell()
                if value[-1] == "|":
                    # NOTE(security): the command comes straight from wav_scp
                    # and is executed through the shell -- only use trusted
                    # script files.
                    p = subprocess.Popen(value[:-1],
                                         shell=True,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
                    stdout, stderr = p.communicate()
                    if p.returncode != 0:
                        # tolerate non UTF-8 output so that the real failure
                        # is not hidden behind a UnicodeDecodeError
                        stderr = stderr.decode(errors="replace")
                        raise RuntimeError(
                            f"Running command: \"{value[:-1]}\" failed: {stderr}")
                    wav_ark.write(stdout)
                else:
                    with open(value, "rb") as wav:
                        wav_ark.write(wav.read())
                if scp_out:
                    scp_out.write(f"{key}\t{args.wav_ark}:{offset}\n")
                done += 1
                if done % 200 == 0:
                    print(f"Processed {done} utterances...", flush=True)
        print(f"Archive {len(reader)} utterances to {args.wav_ark}")
    finally:
        if scp_out:
            scp_out.close()
def __init__(self,
             spk2class: Optional[str] = None,
             name: str = "UNK",
             unit: str = "UNK") -> None:
    """
    Set up the labels, the value table and the optional class mapping.

    Args:
        spk2class: when given (non-empty), it is loaded through BaseReader
            and kept as ``self.s2c``; otherwise ``self.s2c`` is None
        name: stored as ``self.name``
        unit: stored as ``self.unit``
    """
    self.name = name
    self.unit = unit
    # missing keys start at 0.0
    self.val = defaultdict(float)
    if spk2class:
        self.s2c = BaseReader(spk2class)
    else:
        self.s2c = None
def __init__(self, simu_cfg: str, return_in_egs: List[str] = ["mix"]) -> None:
    """
    Load the simulation configuration and prepare its argument parser.

    Args:
        simu_cfg: handed to BaseReader (``num_tokens=-1``) and kept as
            ``self.simu_cfg``
        return_in_egs: names kept as ``self.return_in_egs``
            (presumably the entries to return per example -- confirm
            with the code that reads this attribute)
    """
    self.simu_cfg = BaseReader(simu_cfg, num_tokens=-1)
    self.parser = make_argparse()
    # Store a copy: the default list object is created once and would
    # otherwise be shared (and mutable) across every instance built with
    # the default, as well as aliased to the caller's own list.
    self.return_in_egs = list(return_in_egs)
def __init__(self, text_descriptor, cer=False):
    """
    Open one reader per comma-separated entry of ``text_descriptor``.

    Args:
        text_descriptor: string with one or more entries separated by ","
            and each entry is handed to BaseReader
        cer: stored as ``self.cer``
    """
    readers = []
    for descriptor in text_descriptor.split(","):
        readers.append(BaseReader(descriptor, num_tokens=-1, restrict=False))
    self.readers = readers
    self.cer = cer