Beispiel #1
0
def g2p(text, trans_type="char"):
    """Convert Japanese text to a phonetic transcription via pyopenjtalk.

    Args:
        text: Input Japanese text.
        trans_type: "char" for kana output, "phn" for phoneme output.

    Returns:
        The transcription string produced by ``pyopenjtalk.g2p``.

    Raises:
        ValueError: If ``trans_type`` is neither "char" nor "phn".
    """
    # Normalize width/kana variants before running G2P.
    text = jaconv.normalize(text)
    if trans_type == "char":
        text = pyopenjtalk.g2p(text, kana=True)
    elif trans_type == "phn":
        text = pyopenjtalk.g2p(text, kana=False)
    else:
        # Raise instead of `assert False`: asserts are stripped under -O.
        raise ValueError(f"unknown trans_type: {trans_type!r}")
    return text
def pyopenjtalk_g2p(text) -> List[str]:
    """Return the phoneme sequence of *text* as a list of strings."""
    import pyopenjtalk

    # pyopenjtalk emits one space-delimited phoneme string; split it up.
    return pyopenjtalk.g2p(text, kana=False).split(" ")
Beispiel #3
0
def build_vocab(df, vocab_path):
    """Build a phoneme vocabulary from ``df["text"]`` and save it to *vocab_path*.

    Special tokens <unk>/<eos>/<pad> get ids 1-3; phonemes (with "pau"
    removed) follow in alphabetical order starting at 4, matching the
    original numbering.

    Args:
        df: DataFrame with a ``text`` column of Japanese utterances.
        vocab_path: Destination path for the "<token> <id>" lines.

    Returns:
        A ``Vocab`` loaded from the freshly written file.
    """
    print("building vocab ...")

    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}
    # Use a set: membership checks are O(1) vs O(n) on a list.
    phone_set = set()

    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces

        phones = pyopenjtalk.g2p(text, join=False)
        # drop pauses; they carry no lexical information
        phone_set.update(phone for phone in phones if phone != "pau")

    # Assign ids in alphabetical order, continuing after the special tokens.
    for index, phone in enumerate(sorted(phone_set), start=len(vocab_dict) + 1):
        vocab_dict[phone] = index

    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(f"{v} {index:d}\n" for v, index in vocab_dict.items())

    print(f"vocabulary saved to {vocab_path}")

    return Vocab(vocab_path)
Beispiel #4
0
def test_g2p_phone():
    """G2P with kana=False yields the expected space-separated phonemes."""
    cases = {
        "こんにちは": "k o N n i ch i w a",
        "ななみんです": "n a n a m i N d e s U",
        "ハローユーチューブ": "h a r o o y u u ch u u b u",
    }
    for text, expected in cases.items():
        assert pyopenjtalk.g2p(text, kana=False) == expected
Beispiel #5
0
def test_g2p_kana():
    """G2P with kana=True yields the expected katakana readings."""
    cases = {
        "今日もこんにちは": "キョーモコンニチワ",
        "いやあん": "イヤーン",
        "パソコンのとりあえず知っておきたい使い方": "パソコンノトリアエズシッテオキタイツカイカタ",
    }
    for text, expected in cases.items():
        assert pyopenjtalk.g2p(text, kana=True) == expected
Beispiel #6
0
def frontend(text):
    """Clean text and then convert to id sequence.

    Runs G2P on *text*, maps each phoneme to its id via the module-level
    ``char_to_id`` table (falling back to <unk>), appends <eos>, and
    returns a LongTensor on ``device``.
    """
    text = pyopenjtalk.g2p(text, kana=False)
    print(f"Cleaned text: {text}")
    idseq = []
    for c in text.split(" "):
        if c.isspace():
            idseq.append(char_to_id["<space>"])
        else:
            # Single dict lookup with <unk> fallback instead of
            # `c not in char_to_id.keys()` followed by a second lookup.
            idseq.append(char_to_id.get(c, char_to_id["<unk>"]))
    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
Beispiel #7
0
    def text_to_sequence(self, text, inference=False):
        """Map a phoneme string to a list of symbol ids, appending EOS.

        When *inference* is true, *text* is first converted to phonemes
        with pyopenjtalk and the devoiced vowels I/U are lower-cased so
        they match the symbol table.
        """
        # Check for curly braces and treat their contents as ARPAbet:
        if inference:
            text = pyopenjtalk.g2p(text)
            text = text.replace("I", "i").replace("U", "u")
            print(f"phoneme seq: {text}")

        sequence = [self.symbol_to_id[symbol] for symbol in text.split()]

        # add eos tokens
        sequence.append(self.eos_id)
        return sequence
Beispiel #8
0
def parse_label(meta_data):
    """Read a ``file_id|text`` metadata file and print ``file_id|id-seq`` lines.

    Each utterance text is converted to a phoneme sequence; pauses, stray
    spaces, and empty tokens are dropped before mapping to ids with
    ``text_to_sequence``. A trailing " 1" is appended to every id sequence.

    Args:
        meta_data: Path to the pipe-separated metadata file.
    """
    with open(meta_data) as f:
        for line in f:
            file_id, text = line.strip().split('|', 1)

            # g2p returns one space-separated phoneme string.
            phones = pyopenjtalk.g2p(text).split(' ')
            # Drop pauses, stray spaces, and empty tokens in a single pass
            # (the split results are already str; no str() needed).
            phones = [p for p in phones if p not in ('pau', ' ', '')]

            texts = ' '.join(str(i) for i in text_to_sequence(phones)) + ' 1'

            print(f'{file_id}|{texts}')
Beispiel #9
0
def main(args):
    """Add phoneme columns to an utterance TSV and write the result.

    Reads ``args.tsv_path``, builds or loads the vocabulary at
    ``args.vocab``, converts each row's text to phonemes, attaches
    ``phone_text``/``phone_token_id``/``plen`` columns, optionally
    restricts to ``args.cols``, and writes a TSV to ``args.out`` (default:
    the input path with a ``_p2w.tsv`` suffix).

    Raises:
        ValueError: If ``args.cols`` omits a required column.
    """
    df = pd.read_table(args.tsv_path)
    df = df.dropna(subset=["utt_id", "token_id", "text"])

    if os.path.exists(args.vocab):
        vocab = Vocab(args.vocab)
        print(f"load vocab: {args.vocab}")
    else:
        vocab = build_vocab(df, args.vocab)

    phone_texts = []
    phone_token_ids = []
    phone_lens = []

    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces
        phones = pyopenjtalk.g2p(text, join=False)

        phone_texts.append(" ".join(phones))
        phone_token_ids.append(ints2str(vocab.tokens2ids(phones)))
        phone_lens.append(len(phones))

    df["phone_text"] = phone_texts
    df["phone_token_id"] = phone_token_ids
    df["plen"] = phone_lens

    if args.cols is not None:
        columns = args.cols.split(",")  # no copy-comprehension needed
        # Validate user input with an exception, not assert (stripped by -O).
        required = {"utt_id", "phone_text", "phone_token_id"}
        if not required.issubset(columns):
            raise ValueError(f"--cols must include {sorted(required)}")
        df = df[columns]

    out_path = (args.out if args.out is not None
                else args.tsv_path.replace(".tsv", "_p2w.tsv"))
    df.to_csv(out_path, sep="\t", index=False)
Beispiel #10
0
def pyopenjtalk_g2p_kana(text) -> List[str]:
    """Run G2P in kana mode and return the individual kana characters."""
    import pyopenjtalk

    kana_str = pyopenjtalk.g2p(text, kana=True)
    # One kana character per list element.
    return list(kana_str)