def __generate_simple_poly_dict():
    data_path = '/data1/liujshi/yunlv_research/total_zhuiyi_corup/' \
                'total_metadata_new'
    total_lines = read_lines(data_path)[0:]
    print(total_lines[0:5])

    # Collect every pronunciation observed in the corpus for each character.
    total_dict, poly_dict = defaultdict(list), defaultdict(list)
    for line in total_lines:
        phone, chars = line.split("|")[1], line.split("|")[2]
        chars = clean_sentence(chars.replace(" ", ""))
        phone = __change_tone_format(phone)
        try:
            phone_pairs = phone2pairs(chars, phone)
            for c, p in phone_pairs:
                total_dict[c].append(p)
                total_dict[c] = list(set(total_dict[c]))
        except TypeError:
            pass
        except IndexError:
            print("Index Error:", phone, chars)

    for line in read_lines("../../other_files/poly_dict"):
        key = line.split(":")[0]
        value = line.split(":")[1].split(",")
        poly_dict[key] = value

    map_phone = dict()
    for line in read_lines("../../other_files/phone_map_merge.txt"):
        key = line.split(":")[0]
        value = line.split(":")[1]
        map_phone[key] = value

    new_lines = []
    for char in poly_dict.keys():
        if char not in total_dict.keys():
            # Drop polyphonic characters that never appear in the corpus.
            continue
        values = total_dict[char]
        value_saved = []
        for value in values:
            # Convert lexicon pinyin to standard pinyin before comparison.
            map_value = map_phone[value.split()[0]] + \
                        map_phone[value.split()[1] + value.split()[2][-1]]
            if map_value in poly_dict[char]:
                value_saved.append(value)
        if len(value_saved) > 1:
            new_line = "{}:{}".format(char, ",".join(value_saved))
            new_lines.append(new_line)
            print("save:", new_line)
        else:
            # Drop polyphonic characters observed with only one reading.
            pass
    write_lines("../../other_files/simple_poly_dict", new_lines)
    return None
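# A minimal sketch of the "char:pinyin1,pinyin2,..." line format that the
# poly_dict parsing above assumes. The sample entry is illustrative only and is
# not copied from the real ../../other_files/poly_dict file.
from collections import defaultdict

sample_poly_dict = defaultdict(list)
for line in ["的:de5,di2,di4"]:
    key = line.split(":")[0]
    value = line.split(":")[1].split(",")
    sample_poly_dict[key] = value
print(sample_poly_dict["的"])  # -> ['de5', 'di2', 'di4']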
def __get_total_dict():
    with open("../../other_files/in_baiduhanyu.txt") as fr:
        lines = fr.readlines()[0:]
    new_lines = []
    for line in lines:
        line = line.replace("[", "").replace("]", "")
        chars = line.split(":")[0]
        print(chars)
        phones = line.split(":")[1].split(",")
        # Keep only single characters that have more than one pronunciation.
        if len(chars) == 1 and len(phones) > 1:
            new_lines.append(line.strip())
    write_lines("../../other_files/poly_dict", new_lines)
def build_vocab(self, data, token_limits, files):
    """Build word and char vocabularies with limited sizes and write them to files.

    :param data: list of lines
    :param token_limits: (word_limit_size, char_limit_size)
    :param files: (word_file_path, char_file_path)
    :return: None
    """
    self._set_vocab(data, token_limits[0], token_limits[1])
    utils.write_lines(files[0], self.words)
    utils.verbose(
        'words have been dumped in {}'.format(os.path.abspath(files[0])))
    utils.write_lines(files[1], self.chars)
    utils.verbose(
        'chars have been dumped in {}'.format(os.path.abspath(files[1])))
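# Hypothetical usage sketch for build_vocab(): the class that owns it is not
# shown in this excerpt, so "vocab_builder" and the file names below are
# placeholders, kept as a comment rather than runnable code.
# vocab_builder.build_vocab(
#     data=utils.read_lines("train_corpus.txt"),
#     token_limits=(30000, 4000),          # (word_limit_size, char_limit_size)
#     files=("words.txt", "chars.txt"))    # (word_file_path, char_file_path)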
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/psd_v1"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    # data_path = os.path.join(model_dir, "metadata_dev.txt")
    data_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/" \
                "dev_psd.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [rm_prosody(line.split("|")[6].replace(" ", "")).upper()
              for line in metadata]
    write_lines(text_path, corpus)

    sub_count = 0
    # truth_path = "output.txt"
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(meta.split("|")[6].replace(" ", "").upper())
            fr.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent, phone) in enumerate(zip(sub_sentences,
                                                         sub_phones)):
                fr.write("split-id:{} | {}\n{}\n".format(split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fr.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
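# Shape of each block written to truth.txt above (the "id:", "split-id:" and
# "split-end" framing comes from the format strings in the code; the sentence
# and phone content shown here is only a placeholder):
#
#   id:0
#   <cleaned sentence>
#   split-id:0 | <sub-sentence>
#   <phone sequence for the sub-sentence>
#   split-id:1 | <sub-sentence>
#   <phone sequence for the sub-sentence>
#   split-end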
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v3"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = os.path.join(model_dir, "metadata_dev.txt")
    metadata = read_lines(data_path)
    print(metadata[0])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [rm_prosody(line.split("|")[2].replace(" ", "")).upper()
              for line in metadata]
    print(corpus[0])
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            meta = rm_prosody(meta)
            sentence, phone = meta.split("|")[2].replace(" ", ""), \
                              meta.split("|")[1]
            sentence = clean_sentence(sentence).upper()
            print(sentence)
            fr.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent, phone) in enumerate(zip(sub_sentences,
                                                         sub_phones)):
                fr.write("split-id:{} | {}\n{}\n".format(split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fr.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write files to {}".format(truth_path))
def main():
    data_path = '/data1/liufeng/synthesis/TACOTRON-2-refined/data/data_0306/' \
                'metadata_tot.csv'
    total_lines = read_lines(data_path)[0:]
    print(total_lines[0])

    poly_dict = load_poly_dict()

    # Keep only the lines whose text field contains a polyphonic character.
    new_lines = []
    for line in total_lines:
        if has_poly_char(line.split("|")[2], poly_dict):
            new_lines.append(line)
    print("there are {} lines with poly char".format(len(new_lines)))

    random.shuffle(new_lines)
    dev_lines = new_lines[0:5000]
    train_lines = new_lines[5000:]

    # Note: this count runs over the whole metadata line, not only the text field.
    poly_chars, tot_chars = 0, 0
    for line in new_lines:
        for char in line:
            tot_chars += 1
            if char in poly_dict.keys():
                poly_chars += 1
    print("there are {} poly chars in total {} chars ({})".format(
        poly_chars, tot_chars, poly_chars / tot_chars))

    train_x, train_y, data_lines = __extract_feature(train_lines, poly_dict)
    write_lines("metadata_train.txt", data_lines)
    dev_x, dev_y, data_lines = __extract_feature(dev_lines, poly_dict)
    write_lines("metadata_dev.txt", data_lines)

    with open('/data1/liufeng/synthesis/frontend/models/feature.pkl',
              'wb') as fw:
        pickle.dump((train_x, train_y, dev_x, dev_y), fw)
    print("save {}/{} train/dev items".format(len(train_x), len(dev_x)))
    return
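# Minimal sketch for reading the feature pickle written above; the tuple order
# mirrors the pickle.dump() call, and the path is the same hard-coded one.
import pickle

with open('/data1/liufeng/synthesis/frontend/models/feature.pkl', 'rb') as fr:
    train_x, train_y, dev_x, dev_y = pickle.load(fr)
print("loaded {}/{} train/dev items".format(len(train_x), len(dev_x)))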
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v2"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    # data_path = os.path.join(model_dir, "metadata_dev.txt")
    data_path = "/data1/liufeng/synthesis/feature/feature_prosody/" \
                "bzn/dev.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])
    # metadata = [line for line in metadata if "bzn" in line]

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [rm_prosody(line.split("|")[6].replace(" ", ""))
              for line in metadata]
    write_lines(text_path, corpus)

    sub_count = 0
    # truth_path = os.path.join(eval_dir, "bc_dev.txt")
    truth_path = "output.txt"
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(meta.split("|")[6].replace(" ", ""))
            _ss = sentence
            print(phone, sentence)
            fr.write("\nid:{}\n{}\n".format(sent_id, _ss))
            print("\nid:{}\n{}".format(sent_id, _ss))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent, phone) in enumerate(zip(sub_sentences,
                                                         sub_phones)):
                # split_psd() separates the text from its prosody labels;
                # the labels are then re-attached to each (char, phone) pair.
                x, y = split_psd(sent)
                sent = "".join(x)
                print(sent, phone)
                pairs = phone2pairs(sent, phone)
                new_pairs = [(_x[0], _x[1], _y) for _x, _y in zip(pairs, y)]
                new_phone = [_y + " #" + _z for _x, _y, _z in new_pairs]
                new_phone = " ".join(new_phone).replace(" #0", "")
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, new_phone))
                print("split-id:{} | {} | {}".format(split_id, sent, new_phone))
                sub_count += 1
            fr.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('yaml_path', help='config path for frontend')
    parser.add_argument('input_path', help='input path (txt)')
    parser.add_argument('output_path', help='output path (txt)')
    args = parser.parse_args()

    # todo: add sil label
    hparams = __load_hparams(args.yaml_path)
    text_path = args.input_path
    frontend_path = args.output_path
    flag_psd = hparams.flag_psd

    if hparams.norm_text:
        raw_file_lines = read_lines(text_path)
        sentences = []
        print("text normalize:")
        for line in raw_file_lines:
            new_line = Text().normalize(line)
            sentences.append(new_line.replace(" ", ""))
            if not new_line == line:
                print("{}->{}".format(line, new_line))
    else:
        sentences = read_lines(text_path)
    write_lines("norm.txt", sentences)

    trans = TranscriptToPinyin(
        dic_path=hparams.dict_path,
        eng_dic_path=hparams.eng_dict_path,
    )

    if hparams.nnet_psd and hparams.flag_psd:
        psd_predict, bert_psd_result = __compute_psd_result(
            hparams, sentences, hparams.load_memory_psd)
    else:
        psd_predict, bert_psd_result = None, None

    if hparams.nnet_phone:
        phone_predictor, bert_phone_result = __compute_nnet_phone_result(
            hparams, sentences, hparams.load_memory_phone)
    else:
        phone_predictor, bert_phone_result = None, None

    sub_count, count = 0, 0
    with open(frontend_path, "w", encoding="utf-8") as frontend_file:
        for sent_id, sentence in enumerate(sentences[0:]):
            # sentence = num2hanzi(clean_sentence(sentence))
            frontend_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            for split_id, sub_sentence in enumerate(sub_sentences):
                sub_count += 1
                phone_pairs = trans.get_phone_pairs(
                    sub_sentence, change_eng_symbol=hparams.eng_symbol)

                # Optionally overwrite the dictionary phones with the nnet result.
                new_ = []
                if hparams.nnet_phone:
                    bert_phone = bert_phone_result[count]
                    if len(sub_sentence) == len(bert_phone):
                        phone = phone_predictor.modify_result(
                            bert_phone, phone_pairs)
                        for i, (c, ph, p) in enumerate(phone_pairs):
                            new_.append((c, phone[i], p))
                        phone_pairs = new_
                    else:
                        print("Error for bert result")

                if flag_psd and not hparams.nnet_psd:
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in phone_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in phone_pairs])
                    sub_sentence = sub_sentence.replace("#0", "").replace("#5", "")
                elif flag_psd and hparams.nnet_psd:
                    new_pairs = []
                    for new_psd, (char, ph, _) in zip(bert_psd_result[count],
                                                      phone_pairs):
                        new_pairs.append((char, ph, new_psd))
                    new_pairs = psd_predict.change_by_rules(new_pairs)
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in new_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in new_pairs])
                    sub_sentence = sub_sentence.replace("#0", "").replace("#5", "")
                else:
                    phone = " ".join([ph for _, ph, _ in phone_pairs])
                    sub_sentence = "".join([c for c, _, _ in phone_pairs])
                count += 1

                frontend_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sub_sentence, phone))
                print("split-id:{} | {} | {}".format(split_id, sub_sentence,
                                                     phone))
            frontend_file.write("split-end\n")

    # todo: improve pause prediction.
    # todo: refactor: drop kashgari and use keras-bert.
    print("\nsub count:{}".format(sub_count))
    print("write output data to {}".format(frontend_path))
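# Example invocation (the script name is a placeholder; only the three
# positional arguments come from the argparse definition above):
#   python frontend_infer.py frontend.yaml input_sentences.txt frontend_output.txt
#
# Fields read from the yaml via hparams in main() (key names are taken from the
# code above; the values shown are illustrative only):
#   norm_text: true
#   flag_psd: true
#   nnet_psd: true
#   nnet_phone: true
#   load_memory_psd: false
#   load_memory_phone: false
#   eng_symbol: <value passed as change_eng_symbol>
#   dict_path: <pinyin dictionary path>
#   eng_dict_path: <English dictionary path>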