def _lexicon(self):
    """Generate the phone lists and lexicon under ``data/local/dict``.

    Reads ``{tmp_folder}/data/local/corpus.txt``, takes its unique
    whitespace-separated tokens as the vocabulary, and converts every
    word with ``PhoneConverter.word2phone``.  Four files are written to
    ``{tmp_folder}/data/local/dict`` (the directory must already exist),
    each with one entry per line and a trailing newline:

    * ``nonsilence_phones.txt`` -- sorted unique phones over all words
    * ``silence_phones.txt``    -- ``sil`` and ``spn``
    * ``optional_silence.txt``  -- ``sil``
    * ``lexicon.txt``           -- ``!SIL sil``, ``<UNK> spn``, then
      ``<word> <phone> <phone> ...`` for every word in sorted order

    NOTE(review): the file names follow what looks like a Kaldi-style
    dict directory -- confirm against the recipe that consumes them.
    """
    dict_folder = "{}/data/local/dict".format(self.tmp_folder)

    def write_lines(filename, entries):
        # Use a context manager so the handle is flushed and closed
        # deterministically instead of whenever the object is collected.
        path = "{}/{}".format(dict_folder, filename)
        with open(path, "w") as handle:
            handle.write("\n".join(entries) + "\n")

    corpus_file = "{}/data/local/corpus.txt".format(self.tmp_folder)
    with open(corpus_file) as handle:
        texts = handle.read()

    words = sorted(set(texts.split()))
    # Each element obtained by iterating the word2phone() result is
    # treated as one phone of the word.
    phones = [list(PhoneConverter.word2phone(item)) for item in words]

    nonsilence_phones = sorted(
        {phone for word_phones in phones for phone in word_phones})
    write_lines("nonsilence_phones.txt", nonsilence_phones)

    write_lines("silence_phones.txt", ["sil", "spn"])
    write_lines("optional_silence.txt", ["sil"])

    lexicon = ["!SIL sil", "<UNK> spn"]
    lexicon.extend(
        "{} {}".format(word, " ".join(word_phones))
        for word, word_phones in zip(words, phones))
    write_lines("lexicon.txt", lexicon)
def _make_dictionary(self):
    """Write ``etc/tmp.dic`` and ``etc/tmp.phone`` from the training text.

    Reads the first ``N_TRAIN`` lines of ``{corpus_folder}/train/text``.
    Every line must have the form ``<fileid>|<text>``; a line that does
    not split into exactly two ``|``-separated fields raises
    ``ValueError``.  The text is converted with
    ``PhoneConverter.word2phone`` and split on whitespace into tokens.

    * ``tmp.dic`` gets one line per unique token (sorted): the token
      left-aligned and padded to 20 columns, followed by its characters
      separated by single spaces.
    * ``tmp.phone`` gets the sorted unique characters of all tokens, one
      per line, with ``SIL`` appended as the last entry.

    Neither file ends with a trailing newline.
    """
    text_file = "{}/train/text".format(self.corpus_folder)
    with open(text_file) as handle:
        lines = handle.read().splitlines()[:self.N_TRAIN]

    # Collect the unique tokens produced by the converter; the file id
    # in front of the "|" is not needed here.
    unique_phones = set()
    for line in lines:
        _, word = line.split("|")
        unique_phones.update(PhoneConverter.word2phone(word).split())
    phones = sorted(unique_phones)

    # create .dic files
    dic_lines = []
    unit_set = set()
    for p in phones:
        units = list(p)
        unit_set.update(units)
        dic_lines.append("{:20s}{}".format(p, " ".join(units)))
    with open("{}/etc/tmp.dic".format(self.tmp_folder), "w") as handle:
        handle.write("\n".join(dic_lines))

    # SIL is appended after sorting so it always comes last.
    phone_units = sorted(unit_set)
    phone_units.append("SIL")
    with open("{}/etc/tmp.phone".format(self.tmp_folder), "w") as handle:
        handle.write("\n".join(phone_units))
def _make_cleaned_text(self):
    """Write the phone-level training sentences to ``etc/text``.

    Reads the first ``N_TRAIN`` lines of ``{corpus_folder}/train/text``.
    Every line must have the form ``<fileid>|<text>``; a line that does
    not split into exactly two ``|``-separated fields raises
    ``ValueError``.  Each text is converted with
    ``PhoneConverter.word2phone`` and emitted as ``<s> ... </s>``, one
    sentence per line, to ``{tmp_folder}/etc/text`` (no trailing
    newline).
    """
    in_file = "{}/train/text".format(self.corpus_folder)
    out_file = "{}/etc/text".format(self.tmp_folder)

    with open(in_file) as handle:
        lines = handle.read().splitlines()[:self.N_TRAIN]

    output = []
    for line in lines:
        # The file id is parsed only to validate the line shape; it is
        # intentionally not part of the written sentence.
        _, word = line.split("|")
        phone = PhoneConverter.word2phone(word)
        output.append("<s> {} </s>".format(phone))

    with open(out_file, "w") as handle:
        handle.write("\n".join(output))