Example no. 1
    def frontend(self, text):
        """

        :param text:
        :return:
        """
        text = custom_english_cleaners(text)
        if self.trans_type == "phn":
            text = filter(lambda s: s != " ", self.g2p(text))
            text = " ".join(text)
            # print(f"Cleaned text: {text}")
            charseq = text.split(" ")
        else:
            # print(f"Cleaned text: {text}")
            charseq = list(text)
        idseq = []
        for c in charseq:
            if c.isspace():
                idseq += [self.char_to_id["<space>"]]
            elif c not in self.char_to_id:
                print(f"{c} is unknown!")
                idseq += [self.char_to_id["<unk>"]]
            elif c == ',':
                idseq += [self.char_to_id[c]]
                # the comma id is appended twice in an attempt to create a longer
                # pause after commas; exactly how well this works is not verified
                idseq += [self.char_to_id[c]]
            else:
                idseq += [self.char_to_id[c]]
        idseq += [self.idim - 1]  # <eos>
        return torch.LongTensor(idseq).view(-1).to(self.device)
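
For reference, below is a minimal, self-contained sketch of the character-level branch of this method with a toy vocabulary; the char_to_id table, the to_idseq name, and the sample input are made up for illustration and are not part of the original class (text cleaning and the comma duplication above are omitted).

import torch

# toy vocabulary; a real model ships its own char_to_id mapping
char_to_id = {"<unk>": 0, "<space>": 1, "h": 2, "i": 3, ",": 4}
idim = 6  # vocabulary size; the last id (idim - 1) is reserved for <eos>

def to_idseq(text, device="cpu"):
    """Mirror the id-lookup loop of frontend() for character input."""
    idseq = []
    for c in list(text):
        if c.isspace():
            idseq.append(char_to_id["<space>"])
        elif c not in char_to_id:
            idseq.append(char_to_id["<unk>"])
        else:
            idseq.append(char_to_id[c])
    idseq.append(idim - 1)  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)

print(to_idseq("hi, hi"))  # tensor([2, 3, 4, 1, 2, 3, 5])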
Example no. 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang_tag",
        type=str,
        default=None,
        nargs="?",
        help="language tag (can be used for multi lingual case)",
    )
    parser.add_argument("--spk_tag", type=str, help="speaker tag")
    parser.add_argument("jsons",
                        nargs="+",
                        type=str,
                        help="*_mls.json filenames")
    parser.add_argument("out", type=str, help="output filename")
    parser.add_argument(
        "trans_type",
        type=str,
        default="phn",
        choices=["char", "phn"],
        help="Input transcription type",
    )
    args = parser.parse_args()

    dirname = os.path.dirname(args.out)
    if len(dirname) != 0 and not os.path.exists(dirname):
        os.makedirs(dirname)

    with codecs.open(args.out, "w", encoding="utf-8") as out:
        for filename in sorted(args.jsons):
            with codecs.open(filename, "r", encoding="utf-8") as f:
                js = json.load(f)
            for key in sorted(js.keys()):
                uid = args.spk_tag + "_" + key.replace(".wav", "")

                content = js[key]["clean"]
                text = custom_english_cleaners(content.rstrip())
                if args.trans_type == "phn":
                    clean_content = text.lower()
                    text = g2p(clean_content)

                if args.lang_tag is None:
                    line = "%s %s \n" % (uid, text)
                else:
                    line = "%s <%s> %s\n" % (uid, args.lang_tag, text)
                out.write(line)
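
For orientation, each line written to the output file has the shape below; the speaker/utterance id and the phoneme string are made up to show the format only.

spk1_book_0001 HH AH0 L OW1 W ER1 L D          (without --lang_tag)
spk1_book_0001 <en> HH AH0 L OW1 W ER1 L D     (with --lang_tag en)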
Example no. 3
def frontend(text, g2p, char_to_id, idim):
    """Clean text and then convert it to an id sequence."""
    # NOTE: ``trans_type`` and ``device`` are expected to be defined at module level.
    text = custom_english_cleaners(text)

    if trans_type == "phn":
        text = filter(lambda s: s != " ", g2p(text))
        text = " ".join(text)
        print(f"Cleaned text: {text}")
        charseq = text.split(" ")
    else:
        print(f"Cleaned text: {text}")
        charseq = list(text)
    idseq = []
    for c in charseq:
        if c.isspace():
            idseq += [char_to_id["<space>"]]
        elif c not in char_to_id:
            idseq += [char_to_id["<unk>"]]
        else:
            idseq += [char_to_id[c]]
    idseq += [idim - 1]  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
Example no. 4
def clean_blizzard17(metadata, trans_type, char2index, phn2index):
    g2p = G2p()
    cur_dir = os.path.dirname(__file__)
    filelists_path = os.path.join(cur_dir, "../filelists")

    f_read = open(metadata, "r", encoding="utf-8")
    f_write = open(os.path.join(filelists_path, "data.csv"),
                   "w",
                   encoding="utf-8")

    for line in tqdm(f_read, desc="cleaning and normalizing: "):
        line = line.strip("(")
        line = line.strip(")\n")
        utterence_id, content, _ = line.split("\"")
        utterence_id = utterence_id.strip()
        content = content.strip()
        clean_char = custom_english_cleaners(content)

        if trans_type == "char":
            normalized_char = []
            token_id = []
            for char in clean_char:
                if char in char2index:
                    normalized_char.append(char)
                    token_id.append(char2index[char])
                elif char == " ":
                    normalized_char.append("<space>")
                    token_id.append(char2index["<space>"])
                else:
                    normalized_char.append("<unk>")
                    token_id.append(char2index["<unk>"])
            normalized_char.append("<eos>")
            token_id.append(char2index["<eos>"])

            normalized_char = " ".join(normalized_char)
            token_id = " ".join(token_id)
            f_write.write(utterence_id + "|" + content + "|" +
                          normalized_char + "|" + token_id + "\n")
        elif trans_type == "phn":
            clean_char = clean_char.lower()
            clean_phn = g2p(clean_char)
            normalized_phn = []
            token_id = []
            for phn in clean_phn:
                if phn in phn2index:
                    normalized_phn.append(phn)
                    token_id.append(phn2index[phn])
                elif phn == " ":
                    normalized_phn.append("<space>")
                    token_id.append(phn2index["<space>"])
                else:
                    normalized_phn.append("<unk>")
                    token_id.append(phn2index["<unk>"])
            normalized_phn.append("<eos>")
            token_id.append(phn2index["<eos>"])

            normalized_phn = " ".join(normalized_phn)
            token_id = " ".join(token_id)
            f_write.write(utterence_id + "|" + content + "|" + normalized_phn +
                          "|" + token_id + "\n")
        else:
            raise ValueError("Unknown transcription type: %s" % trans_type)

    f_read.close()
    f_write.close()
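
For reference, each row of the resulting data.csv carries four pipe-separated fields (utterance id, raw content, normalized tokens, token ids); the values below are made up to show the shape only.

utt_001|Hello world|HH AH0 L OW1 <space> W ER1 L D <eos>|12 4 21 30 1 40 19 21 8 2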
Example no. 5
    # NOTE: the required NLTK data must be downloaded on the first run
    nltk.download("punkt")


def g2p(text):
    """Convert grapheme to phoneme."""
    tokens = filter(lambda s: s != " ", f_g2p(text))
    return " ".join(tokens)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("text", type=str, help="text to be cleaned")
    parser.add_argument(
        "trans_type",
        type=str,
        default="kana",
        choices=["char", "phn"],
        help="Input transcription type",
    )
    args = parser.parse_args()
    with codecs.open(args.text, "r", "utf-8") as fid:
        for line in fid.readlines():
            id, content = line.split(" ", 1)
            clean_content = custom_english_cleaners(content.rstrip())
            if args.trans_type == "phn":
                text = clean_content.lower()
                clean_content = g2p(text)

            print("%s %s" % (id, clean_content))
Example no. 6
        help="Input transcription type",
    )
    parser.add_argument("--lowercase",
                        type=bool,
                        default=False,
                        help="Lower case the result or not")
    args = parser.parse_args()

    # clean every line in transcription file first
    with codecs.open(args.transcription_path, "r", "utf-8") as fid:
        for line in fid.read().splitlines():
            segments = line.split(" ")

            # clean contents
            content = " ".join(segments[:-1])
            clean_content = custom_english_cleaners(content)

            # get id by taking off the parentheses
            id = segments[-1][1:-1]

            if args.trans_type == "phn":
                text = clean_content.lower()
                clean_content = g2p(text)

            if args.lowercase:
                clean_content = clean_content.lower()

            if args.lang_tag == "":
                print("{} {}".format(id, clean_content))
            else:
                print("{} {}".format(