Ejemplo n.º 1
0
    def _lexicon(self):
        """Write the phone lists and the lexicon under ``data/local/dict``.

        Reads ``data/local/corpus.txt`` inside ``self.tmp_folder``, takes its
        unique whitespace-separated words in sorted order and converts each
        one with ``PhoneConverter.word2phone``.  Every element obtained by
        iterating the converter's result is treated as one phone symbol.

        Files written (each entry on its own line, newline-terminated):

        * ``nonsilence_phones.txt`` -- sorted unique phone symbols
        * ``silence_phones.txt`` -- ``sil`` and ``spn``
        * ``optional_silence.txt`` -- ``sil``
        * ``lexicon.txt`` -- ``!SIL sil`` and ``<UNK> spn`` followed by one
          ``word phone phone ...`` line per corpus word
        """
        dict_folder = "{}/data/local/dict".format(self.tmp_folder)

        def write_lines(filename, lines):
            # The context manager guarantees the data is flushed and the
            # handle released before the next step runs.
            with open("{}/{}".format(dict_folder, filename), "w") as f:
                f.write("\n".join(lines) + "\n")

        corpus_file = "{}/data/local/corpus.txt".format(self.tmp_folder)
        with open(corpus_file) as f:
            texts = f.read()
        words = sorted(set(texts.split()))
        phones = [list(PhoneConverter.word2phone(item)) for item in words]

        nonsilence_phones = sorted(
            {item for sublist in phones for item in sublist})
        write_lines("nonsilence_phones.txt", nonsilence_phones)

        write_lines("silence_phones.txt", ["sil", "spn"])
        write_lines("optional_silence.txt", ["sil"])

        # The two fixed entries come first, ahead of the corpus words.
        lexicon = ["!SIL sil", "<UNK> spn"] + [
            "{} {}".format(word, " ".join(phone))
            for word, phone in zip(words, phones)
        ]
        write_lines("lexicon.txt", lexicon)
Ejemplo n.º 2
0
 def _make_dictionary(self):
     """Write ``etc/tmp.dic`` and ``etc/tmp.phone`` under ``self.tmp_folder``.

     Reads the first ``self.N_TRAIN`` lines of ``train/text`` inside
     ``self.corpus_folder``.  Each line must have the form
     ``fileid|word``; the text part is converted with
     ``PhoneConverter.word2phone`` and split on whitespace into tokens.

     * ``tmp.dic`` gets one line per unique token (sorted): the token
       left-aligned in a 20-character column, followed by its characters
       separated by spaces.
     * ``tmp.phone`` gets the sorted unique characters of all tokens, one
       per line, followed by ``SIL``.

     Neither file ends with a trailing newline.

     Raises:
         ValueError: if a line does not contain exactly one ``|``.
     """
     text_file = "{}/train/text".format(self.corpus_folder)
     with open(text_file) as f:
         lines = f.read().splitlines()[:self.N_TRAIN]

     phones = set()
     for line in lines:
         # Only the text part is needed here; the file id is discarded.
         _, word = line.split("|")
         phones.update(PhoneConverter.word2phone(word).split())
     phones = sorted(phones)

     # create .dic files
     dic_lines = ["{:20s}{}".format(p, " ".join(p)) for p in phones]
     with open("{}/etc/tmp.dic".format(self.tmp_folder), "w") as f:
         f.write("\n".join(dic_lines))

     # Every distinct character of the tokens is a unit; "SIL" goes last.
     phone_units = sorted({unit for p in phones for unit in p})
     phone_units.append("SIL")
     with open("{}/etc/tmp.phone".format(self.tmp_folder), "w") as f:
         f.write("\n".join(phone_units))
Ejemplo n.º 3
0
 def _make_cleaned_text(self):
     """Write the converted training text to ``etc/text``.

     Reads the first ``self.N_TRAIN`` lines of ``train/text`` inside
     ``self.corpus_folder``.  Each line must have the form
     ``fileid|word``; the text part is converted with
     ``PhoneConverter.word2phone`` and written to ``etc/text`` inside
     ``self.tmp_folder`` as ``<s> converted text </s>``, one entry per
     line with no trailing newline.  The file id is not written.

     Raises:
         ValueError: if a line does not contain exactly one ``|``.
     """
     in_file = "{}/train/text".format(self.corpus_folder)
     out_file = "{}/etc/text".format(self.tmp_folder)
     with open(in_file) as f:
         lines = f.read().splitlines()[:self.N_TRAIN]

     output = []
     for line in lines:
         # Unpacking still validates the "fileid|word" shape of the line.
         _, word = line.split("|")
         phone = PhoneConverter.word2phone(word)
         output.append("<s> {} </s>".format(phone))

     with open(out_file, "w") as f:
         f.write("\n".join(output))