import argparse
import codecs

from joblib import Parallel, delayed

from espnet2.text.cleaner import TextCleaner
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer


def main():
    """Run phoneme conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--g2p", type=str, required=True, help="G2P type.")
    parser.add_argument("--cleaner", type=str, default=None, help="Cleaner type.")
    parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs.")
    parser.add_argument("in_text", type=str, help="Input kaldi-style text.")
    parser.add_argument("out_text", type=str, help="Output kaldi-style text.")
    args = parser.parse_args()

    phoneme_tokenizer = PhonemeTokenizer(args.g2p)
    cleaner = None
    if args.cleaner is not None:
        cleaner = TextCleaner(args.cleaner)

    # Read the kaldi-style text: each line is "<utt_id> <sentence>".
    with codecs.open(args.in_text, encoding="utf8") as f:
        lines = [line.strip() for line in f.readlines()]
    text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines}
    if cleaner is not None:
        text = {k: cleaner(v) for k, v in text.items()}

    # Convert each sentence to a phoneme sequence in parallel.
    phns_list = Parallel(n_jobs=args.nj)(
        [
            delayed(phoneme_tokenizer.text2tokens)(sentence)
            for sentence in text.values()
        ]
    )

    # Write the result in the same kaldi-style format.
    with codecs.open(args.out_text, "w", encoding="utf8") as g:
        for utt_id, phns in zip(text.keys(), phns_list):
            g.write(f"{utt_id} " + " ".join(phns) + "\n")
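# Illustrative usage sketch (not from the source).  A hypothetical invocation of
# the script above might look like:
#
#   python convert_text_to_phn.py --g2p g2p_en --cleaner tacotron --nj 8 \
#       data/train/text data/train/phn_text
#
# The per-utterance transformation it performs is equivalent to:
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer  # assumed espnet2 layout

line = "utt1 hello world"
utt_id, sentence = line.split(maxsplit=1)
phns = PhonemeTokenizer("g2p_en").text2tokens(sentence)
print(f"{utt_id} " + " ".join(phns))  # e.g. "utt1 HH AH0 L OW1 W ER1 L D"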
def __init__(
    self,
    train: bool,
    token_type: str = None,
    token_list: Union[Path, str, Iterable[str]] = None,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    text_cleaner: Collection[str] = None,
    g2p_type: str = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    speech_name: str = "speech",
    text_name: list = ["text"],
):
    super().__init__(train)
    self.train = train
    self.speech_name = speech_name
    self.text_name = text_name

    if token_type is not None:
        if token_list is None:
            raise ValueError("token_list is required if token_type is not None")
        self.text_cleaner = TextCleaner(text_cleaner)
        self.tokenizer = build_tokenizer(
            token_type=token_type,
            bpemodel=bpemodel,
            delimiter=delimiter,
            space_symbol=space_symbol,
            non_linguistic_symbols=non_linguistic_symbols,
            g2p_type=g2p_type,
        )
        self.token_id_converter = TokenIDConverter(
            token_list=token_list,
            unk_symbol=unk_symbol,
        )
    else:
        self.text_cleaner = None
        self.tokenizer = None
        self.token_id_converter = None
def tokenize(
    input: str,
    output: str,
    field: Optional[str],
    delimiter: Optional[str],
    token_type: str,
    space_symbol: str,
    non_linguistic_symbols: Optional[str],
    bpemodel: Optional[str],
    log_level: str,
    write_vocabulary: bool,
    vocabulary_size: int,
    remove_non_linguistic_symbols: bool,
    cutoff: int,
    add_symbol: List[str],
    cleaner: Optional[str],
    g2p: Optional[str],
):
    assert check_argument_types()
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if input == "-":
        fin = sys.stdin
    else:
        fin = Path(input).open("r", encoding="utf-8")
    if output == "-":
        fout = sys.stdout
    else:
        p = Path(output)
        p.parent.mkdir(parents=True, exist_ok=True)
        fout = p.open("w", encoding="utf-8")

    cleaner = TextCleaner(cleaner)
    tokenizer = build_tokenizer(
        token_type=token_type,
        bpemodel=bpemodel,
        delimiter=delimiter,
        space_symbol=space_symbol,
        non_linguistic_symbols=non_linguistic_symbols,
        remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        g2p_type=g2p,
    )

    counter = Counter()
    if field is not None:
        field = field2slice(field)

    for line in fin:
        line = line.rstrip()
        if field is not None:
            # e.g. field="2-"
            # uttidA hello world!! -> hello world!!
            tokens = line.split(delimiter)
            tokens = tokens[field]
            if delimiter is None:
                line = " ".join(tokens)
            else:
                line = delimiter.join(tokens)

        line = cleaner(line)
        tokens = tokenizer.text2tokens(line)
        if not write_vocabulary:
            fout.write(" ".join(tokens) + "\n")
        else:
            for t in tokens:
                counter[t] += 1

    if not write_vocabulary:
        return

    # ======= write_vocabulary mode from here =======
    # Sort by the number of occurrences in descending order
    # and drop words whose count does not exceed the cutoff value.
    words_and_counts = list(
        filter(lambda x: x[1] > cutoff, sorted(counter.items(), key=lambda x: -x[1]))
    )

    # Restrict the vocabulary size
    if vocabulary_size > 0:
        if vocabulary_size < len(add_symbol):
            raise RuntimeError(f"vocabulary_size is too small: {vocabulary_size}")
        words_and_counts = words_and_counts[: vocabulary_size - len(add_symbol)]

    # Parse the values of --add_symbol
    for symbol_and_id in add_symbol:
        # e.g. symbol_and_id="<blank>:0"
        try:
            symbol, idx = symbol_and_id.split(":")
            idx = int(idx)
        except ValueError:
            raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
        symbol = symbol.strip()

        # e.g. idx=0 -> insert as the first symbol
        # e.g. idx=-1 -> append as the last symbol
        if idx < 0:
            idx = len(words_and_counts) + 1 + idx
        words_and_counts.insert(idx, (symbol, None))

    # Write words
    for w, c in words_and_counts:
        fout.write(w + "\n")

    # Logging
    total_count = sum(counter.values())
    invocab_count = sum(c for w, c in words_and_counts if c is not None)
    logging.info(
        f"OOV rate = {(total_count - invocab_count) / total_count * 100} %"
    )
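# Illustrative sketch (not from the source): how the --add_symbol entries are
# folded into the vocabulary list, mirroring the parsing logic above.
words_and_counts = [("the", 120), ("a", 95), ("cat", 7)]
add_symbol = ["<blank>:0", "<unk>:1", "<sos/eos>:-1"]
for symbol_and_id in add_symbol:
    symbol, idx = symbol_and_id.split(":")
    idx = int(idx)
    if idx < 0:
        # Negative indices count from the end, so -1 appends after all words.
        idx = len(words_and_counts) + 1 + idx
    words_and_counts.insert(idx, (symbol.strip(), None))
print([w for w, _ in words_and_counts])
# ['<blank>', '<unk>', 'the', 'a', 'cat', '<sos/eos>']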
def __init__(
    self,
    train: bool,
    token_type: str = None,
    token_list: Union[Path, str, Iterable[str]] = None,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    text_cleaner: Collection[str] = None,
    g2p_type: str = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    rir_scp: str = None,
    rir_apply_prob: float = 1.0,
    noise_scp: str = None,
    noise_apply_prob: float = 1.0,
    noise_db_range: str = "3_10",
    speech_volume_normalize: float = None,
    speech_name: str = "speech",
    text_name: str = "text",
):
    super().__init__(train)
    self.train = train
    self.speech_name = speech_name
    self.text_name = text_name
    self.speech_volume_normalize = speech_volume_normalize
    self.rir_apply_prob = rir_apply_prob
    self.noise_apply_prob = noise_apply_prob

    if token_type is not None:
        if token_list is None:
            raise ValueError("token_list is required if token_type is not None")
        self.text_cleaner = TextCleaner(text_cleaner)
        self.tokenizer = build_tokenizer(
            token_type=token_type,
            bpemodel=bpemodel,
            delimiter=delimiter,
            space_symbol=space_symbol,
            non_linguistic_symbols=non_linguistic_symbols,
            g2p_type=g2p_type,
        )
        self.token_id_converter = TokenIDConverter(
            token_list=token_list,
            unk_symbol=unk_symbol,
        )
    else:
        self.text_cleaner = None
        self.tokenizer = None
        self.token_id_converter = None

    if train and rir_scp is not None:
        # Each line of rir_scp is either "<path>" or "<utt_id> <path>".
        self.rirs = []
        with open(rir_scp, "r", encoding="utf-8") as f:
            for line in f:
                sps = line.strip().split(None, 1)
                if len(sps) == 1:
                    self.rirs.append(sps[0])
                else:
                    self.rirs.append(sps[1])
    else:
        self.rirs = None

    if train and noise_scp is not None:
        self.noises = []
        with open(noise_scp, "r", encoding="utf-8") as f:
            for line in f:
                sps = line.strip().split(None, 1)
                if len(sps) == 1:
                    self.noises.append(sps[0])
                else:
                    self.noises.append(sps[1])
        sps = noise_db_range.split("_")
        if len(sps) == 1:
            self.noise_db_low = self.noise_db_high = float(sps[0])
        elif len(sps) == 2:
            self.noise_db_low, self.noise_db_high = float(sps[0]), float(sps[1])
        else:
            raise ValueError(
                f"Format error: '{noise_db_range}' e.g. -3_4 -> [-3db,4db]"
            )
    else:
        self.noises = None
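# Illustrative sketch (not from the source): how noise_db_range strings are
# interpreted by the parsing above (a single value is used as both the low and
# the high bound, "low_high" gives a range).
for noise_db_range in ("13", "3_10", "-3_4"):
    sps = noise_db_range.split("_")
    if len(sps) == 1:
        noise_db_low = noise_db_high = float(sps[0])
    else:
        noise_db_low, noise_db_high = float(sps[0]), float(sps[1])
    print(noise_db_range, "->", (noise_db_low, noise_db_high))
# 13 -> (13.0, 13.0); 3_10 -> (3.0, 10.0); -3_4 -> (-3.0, 4.0)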
def cleaner(request):
    return TextCleaner(request.param)
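# Illustrative sketch (not from the source): assuming the fixture above is
# registered with @pytest.fixture, request.param can be supplied through
# indirect parametrization; the "tacotron" case additionally needs the
# tacotron_cleaner package installed.
import pytest


@pytest.mark.parametrize("cleaner", [None, "tacotron"], indirect=True)
def test_cleaner_returns_str(cleaner):
    assert isinstance(cleaner("Hello, World!"), str)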
def __init__(
    self,
    train: bool,
    token_type: List[str] = [None],
    token_list: List[Union[Path, str, Iterable[str]]] = [None],
    bpemodel: List[Union[Path, str, Iterable[str]]] = [None],
    text_cleaner: Collection[str] = None,
    g2p_type: str = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    rir_scp: str = None,
    rir_apply_prob: float = 1.0,
    noise_scp: str = None,
    noise_apply_prob: float = 1.0,
    noise_db_range: str = "3_10",
    speech_volume_normalize: float = None,
    speech_name: str = "speech",
    text_name: List[str] = ["text"],
):
    # TODO(jiatong): sync with Kamo and Jing on the preprocessor interface
    super().__init__(
        train=train,
        token_type=token_type[0],
        token_list=token_list[0],
        bpemodel=bpemodel[0],
        text_cleaner=text_cleaner,
        g2p_type=g2p_type,
        unk_symbol=unk_symbol,
        space_symbol=space_symbol,
        non_linguistic_symbols=non_linguistic_symbols,
        delimiter=delimiter,
        speech_name=speech_name,
        text_name=text_name[0],
        rir_scp=rir_scp,
        rir_apply_prob=rir_apply_prob,
        noise_scp=noise_scp,
        noise_apply_prob=noise_apply_prob,
        noise_db_range=noise_db_range,
        speech_volume_normalize=speech_volume_normalize,
    )

    assert (
        len(token_type) == len(token_list) == len(bpemodel) == len(text_name)
    ), "token_type, token_list, bpemodel, and text_name must have the same length"

    self.num_tokenizer = len(token_type)
    self.tokenizer = []
    self.token_id_converter = []

    for i in range(self.num_tokenizer):
        if token_type[i] is not None:
            if token_list[i] is None:
                raise ValueError("token_list is required if token_type is not None")
            self.tokenizer.append(
                build_tokenizer(
                    token_type=token_type[i],
                    bpemodel=bpemodel[i],
                    delimiter=delimiter,
                    space_symbol=space_symbol,
                    non_linguistic_symbols=non_linguistic_symbols,
                    g2p_type=g2p_type,
                )
            )
            self.token_id_converter.append(
                TokenIDConverter(
                    token_list=token_list[i],
                    unk_symbol=unk_symbol,
                )
            )
        else:
            self.tokenizer.append(None)
            self.token_id_converter.append(None)

    self.text_cleaner = TextCleaner(text_cleaner)
    self.text_name = text_name  # override the text_name set by CommonPreprocessor
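# Illustrative sketch (not from the source): the token_type, token_list,
# bpemodel, and text_name lists are index-aligned, so entry i of each
# configures the tokenizer applied to the text field text_name[i].
# The values below are hypothetical examples.
token_type = ["bpe", "char"]
token_list = ["bpe_tokens.txt", "char_tokens.txt"]
bpemodel = ["bpe.model", None]
text_name = ["text", "text_spk2"]
for i, name in enumerate(text_name):
    print(
        f"{name}: token_type={token_type[i]}, "
        f"token_list={token_list[i]}, bpemodel={bpemodel[i]}"
    )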