def process(args):
    """Train a retrieval model (LDA or keyword-filtered TF-IDF) on train_x.

    Reads the train_x/train_y corpora, optionally extracts keywords (TF-IDF
    branch), tokenizes train_x, and fits the chosen model.

    :param args: config object with .path (dict of file paths), .tokenizer,
        .problem ('lda' or other), .num_keywords.
    """
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_x = utils.read_lines(args.path['train_x'])
    train_y = utils.read_lines(args.path['train_y'])
    dataset = train_x + train_y
    keywords = None
    if args.problem == 'lda':
        model = LDAModel(args)
    else:
        trainset = [tokenizer.encode_line_into_words(i) for i in dataset]
        train_keywords(trainset, args.path['model'])
        keywords = load_keywords(args.path['model'])
        model = TFIDFModel(args)
    # Fix: the original evaluated keywords[:args.num_keywords] (a fresh list
    # slice) for EVERY token of EVERY line and scanned it linearly. Hoist the
    # loop-invariant slice once and use a set for O(1) membership tests.
    keyword_set = None if keywords is None else set(keywords[:args.num_keywords])
    list_toks = []
    for n, line in enumerate(train_x):
        if not n % 10000 and n:
            utils.verbose('Tokenizing {} lines for {}'.format(n, args.problem))
        toks = tokenizer.encode_line_into_words(line)
        if keyword_set is None:
            list_toks.append([str(s) for s in toks])
        else:
            list_toks.append([str(s) for s in toks if s in keyword_set])
    model.fit(list_toks)
def __generate_simple_poly_dict():
    """Build the reduced polyphone dictionary from corpus observations.

    Collects every pronunciation actually observed for each character in the
    corpus metadata, maps lexicon pinyin to the standard form, and keeps only
    polyphonic characters that still occur with more than one valid
    pronunciation. Writes the result to ../../other_files/simple_poly_dict.
    """
    data_path = '/data1/liujshi/yunlv_research/total_zhuiyi_corup/' \
                'total_metadata_new'
    total_lines = read_lines(data_path)
    print(total_lines[0:5])
    # char -> set of observed pronunciations. Fix: the original rebuilt
    # list(set(...)) after every single append, which is quadratic; a set
    # deduplicates once, in place.
    observed = defaultdict(set)
    for line in total_lines:
        fields = line.split("|")
        phone, chars = fields[1], fields[2]
        chars = clean_sentence(chars.replace(" ", ""))
        phone = __change_tone_format(phone)
        try:
            for c, p in phone2pairs(chars, phone):
                observed[c].add(p)
        except TypeError:
            pass  # phone2pairs signals a malformed line this way; skip it.
        except IndexError:
            print("Index Error:", phone, chars)
    poly_dict = defaultdict(list)
    for line in read_lines("../../other_files/poly_dict"):
        key, _, raw = line.partition(":")
        poly_dict[key] = raw.split(",")
    map_phone = dict()
    for line in read_lines("../../other_files/phone_map_merge.txt"):
        key, _, value = line.partition(":")
        map_phone[key] = value
    new_lines = []
    for char in poly_dict.keys():
        if char not in observed:
            continue  # polyphone never seen in the corpus: drop it
        value_saved = []
        for value in observed[char]:
            # Convert lexicon pinyin into standard pinyin before comparing
            # against the reference poly_dict entries.
            parts = value.split()
            map_value = map_phone[parts[0]] + map_phone[parts[1] + parts[2][-1]]
            if map_value in poly_dict[char]:
                value_saved.append(value)
        if len(value_saved) > 1:
            new_line = "{}:{}".format(char, ",".join(value_saved))
            new_lines.append(new_line)
            print("save:", new_line)
        # Characters observed with only one pronunciation are dropped too.
    write_lines("../../other_files/simple_poly_dict", new_lines)
    return None
def __init__(self):
    """Load the simple polyphone dictionary and reset model placeholders."""
    super().__init__()
    poly_dict_path = "/data1/liufeng/synthesis/frontend/data/simple_poly_dict"
    self.poly_dict = dict()
    # Each line looks like "char:py1,py2,..."; spaces and '*' markers are noise.
    for raw in read_lines(poly_dict_path):
        entry = raw.replace(" ", "").replace("*", "")
        name = entry.split(":")[0]
        phones = entry.split(":")[1].split(",")
        self.poly_dict[name] = phones
    self.model = None
    self.model_dir = None
    self.sess = None
def main():
    """Extract prosody (x, y) pairs for train/dev sets and pickle them."""
    train_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/train.txt"
    # Only lines carrying prosody marks ("#") are usable for training.
    train_lines = [line.split("|")[6] for line in read_lines(train_path)
                   if "#" in line]
    train_data = [split_psd(line, pairs=False) for line in train_lines]
    print(train_data[0])
    train_x = [feat for feat, _ in train_data]
    train_y = [label for _, label in train_data]
    dev_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/dev_psd.txt"
    dev_data = [split_psd(line.split("|")[6], pairs=False)
                for line in read_lines(dev_path)]
    dev_x = [feat for feat, _ in dev_data]
    dev_y = [label for _, label in dev_data]
    pkl_path = '/data1/liufeng/synthesis/frontend/models/feature_psd.pkl'
    with open(pkl_path, 'wb') as fw:
        pickle.dump((train_x, train_y, dev_x, dev_y), fw)
    print("save {}/{} train/dev items".format(len(train_x), len(dev_x)))
    return
def process(hparam):
    """Concatenate the four corpus files and build word/char vocabularies."""
    utils.raise_inexistence(hparam.tmp_dir)
    tokenizer = vocab.Tokenizer()
    word_path = join(hparam.tmp_dir, '{}.vcb'.format(hparam.word_size))
    char_path = join(hparam.tmp_dir, '{}.vcb'.format(hparam.char_size))
    corpus = []
    for name in ['train_q.txt', 'train_a.txt', 'dev_q.txt', 'dev_a.txt']:
        path = join(hparam.tmp_dir, name)
        utils.raise_inexistence(path)
        corpus += utils.read_lines(path)
    tokenizer.build_vocab(corpus,
                          [hparam.word_size, hparam.char_size],
                          [word_path, char_path])
def __init__(self, files=None):
    """Char-based tokenizer augmented with a word vocabulary.

    :param files: [word_file_path, char_file_path]; when None, vocabularies
        start empty and must be built later.
    """
    self.word_counter = {}
    self.char_counter = {}
    if files is None:
        self.words = []
        self.chars = []
    else:
        word_file, char_file = files
        self.words = utils.read_lines(word_file)
        self.chars = utils.read_lines(char_file)
        utils.verbose('loading words from file {} with word size {}'.format(
            word_file, self.word_size))
        utils.verbose('loading chars from file {} with char size {}'.format(
            char_file, self.char_size))
    self.cutter = SubCutter()
    self.word_dict = dict()
    self.char_dict = dict()
    self._set_dict()
    # Reserved vocabulary ids.
    self.PAD_ID = 0
    self.UNK_ID = 1
    self.EOS_ID = 2
def __read_from_result(data_path):
    """Parse a result file into one prosody-pair list per sub-sentence.

    Recognized line prefixes: "id:" (skip two lines), "split-id:" (record the
    pairs for the sub-sentence and skip two lines), "split-end" and anything
    else (skip one line), "end"/"END" (stop).
    """
    lines = read_lines(data_path)
    result_pairs = []
    idx, total = 0, len(lines)
    while idx < total:
        current = lines[idx]
        if current.startswith("id:"):
            idx += 2
        elif current.startswith("split-id:"):
            sub_sentence = current.split("|")[1]
            result_pairs.append(split_psd(sub_sentence.replace(" ", "")))
            idx += 2
        elif current.startswith("split-end"):
            idx += 1
        elif current.lower().startswith("end"):
            break
        else:
            idx += 1
    return result_pairs
def main():
    """Emit corpus.txt and truth.txt (aligned sentence/phone splits) for eval."""
    model_dir = "/data1/liufeng/synthesis/frontend/models/psd_v1"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    data_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/" \
                "dev_psd.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])
    # Field 6 is the annotated sentence; strip spaces/prosody for the corpus.
    text_path = os.path.join(eval_dir, "corpus.txt")
    write_lines(text_path, [
        rm_prosody(item.split("|")[6].replace(" ", "")).upper()
        for item in metadata
    ])
    truth_path = os.path.join(eval_dir, "truth.txt")
    sub_count = 0
    with open(truth_path, "w", encoding="utf-8") as out_file:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(meta.split("|")[6].replace(" ", "").upper())
            out_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))
            pieces = zip(split_sentence(sentence),
                         split_sentence(phone, split_type="phone"))
            for split_id, (sent, sub_phone) in enumerate(pieces):
                out_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, sub_phone))
                print("split-id:{} | {} | {}".format(split_id, sent, sub_phone))
                sub_count += 1
            out_file.write("split-end\n")
    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
def main():
    """Emit corpus.txt and truth.txt for the v3 model's dev metadata."""
    model_dir = "/data1/liufeng/synthesis/frontend/models/v3"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    metadata = read_lines(os.path.join(model_dir, "metadata_dev.txt"))
    print(metadata[0])
    # Field 2 is the annotated sentence; field 1 the phone string.
    corpus = [rm_prosody(item.split("|")[2].replace(" ", "")).upper()
              for item in metadata]
    print(corpus[0])
    text_path = os.path.join(eval_dir, "corpus.txt")
    write_lines(text_path, corpus)
    truth_path = os.path.join(eval_dir, "truth.txt")
    sub_count = 0
    with open(truth_path, "w", encoding="utf-8") as out_file:
        for sent_id, meta in enumerate(metadata):
            meta = rm_prosody(meta)
            fields = meta.split("|")
            phone = fields[1]
            sentence = clean_sentence(fields[2].replace(" ", "")).upper()
            print(sentence)
            out_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))
            aligned = zip(split_sentence(sentence),
                          split_sentence(phone, split_type="phone"))
            for split_id, (sent, sub_phone) in enumerate(aligned):
                out_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, sub_phone))
                print("split-id:{} | {} | {}".format(split_id, sent, sub_phone))
                sub_count += 1
            out_file.write("split-end\n")
    print("\nsub count:{}".format(sub_count))
    print("write files to {}".format(truth_path))
def main():
    """Select corpus lines containing polyphones, split train/dev, pickle features."""
    data_path = '/data1/liufeng/synthesis/TACOTRON-2-refined/data/data_0306/' \
                'metadata_tot.csv'
    total_lines = read_lines(data_path)
    print(total_lines[0])
    poly_dict = load_poly_dict()
    new_lines = [line for line in total_lines
                 if has_poly_char(line.split("|")[2], poly_dict)]
    print("there are {} lines with poly char".format(len(new_lines)))
    random.shuffle(new_lines)
    dev_lines = new_lines[0:5000]
    train_lines = new_lines[5000:]
    # NOTE(review): this ratio counts characters of the WHOLE metadata line
    # (separators and other fields included), not just the text field —
    # confirm that is intended.
    poly_chars, tot_chars = 0, 0
    for line in new_lines:
        for char in line:
            tot_chars += 1
            if char in poly_dict.keys():
                poly_chars += 1
    print("there are {}chars in total {}chars ({})".format(
        poly_chars, tot_chars, poly_chars/tot_chars))
    train_x, train_y, data_lines = __extract_feature(train_lines, poly_dict)
    write_lines("metadata_train.txt", data_lines)
    dev_x, dev_y, data_lines = __extract_feature(dev_lines, poly_dict)
    write_lines("metadata_dev.txt", data_lines)
    with open('/data1/liufeng/synthesis/frontend/models/feature.pkl', 'wb') as fw:
        pickle.dump((train_x, train_y, dev_x, dev_y), fw)
    print("save {}/{} train/dev items ".format(len(train_x), len(dev_x)))
    return
def __init__(self, args):
    """Assemble the search ensemble and load the QA corpus into memory."""
    # Three complementary retrieval back-ends over the same corpus.
    self.dual_encoder_searcher = searcher_lib.DualEncoderSearcher(args)
    self.lda_searcher = searcher_lib.LDASearcher(args)
    self.tfidf_searcher = searcher_lib.TFIDFSearcher(args)
    paths = args.path
    self.questions = utils.read_lines(paths['train_x'])
    self.answers = utils.read_lines(paths['train_y'])
def _retrieve_linked_ids(ids, already_seen_ids: Set[int], groups_definitions: GroupDefinitions) -> List[int]: linked_ids = [ linked_id for _id in ids for linked_id in groups_definitions[_id] ] not_seen_ids = [i for i in linked_ids if i not in already_seen_ids] if not_seen_ids: already_seen_ids = already_seen_ids.union(not_seen_ids) return not_seen_ids + _retrieve_linked_ids( not_seen_ids, already_seen_ids, groups_definitions) else: return [] def parse_row(row): start_program, end_programs = row.split(' <-> ') start_program = int(start_program) end_programs = [int(p.strip()) for p in end_programs.split(',')] return start_program, end_programs if __name__ == '__main__': raw_puzzle_input = read_lines('digital_plumber.txt') puzzle_input = dict(parse_row(r) for r in raw_puzzle_input) group_size = len(set(retrieve_linked_programs(0, puzzle_input))) print(f'Result for Part 1: {group_size}') groups_number = count_total_groups(puzzle_input) print(f'Result for Part 1: {groups_number}')
def main():
    """Build eval files for the v2 prosody model.

    Reads dev metadata (pipe-separated; field 5 = phone string, field 6 =
    prosody-annotated text), writes cleaned sentences to eval/corpus.txt,
    then writes per-sub-sentence phone sequences with prosody labels merged
    in as " #n" suffixes to output.txt.
    """
    model_dir = "/data1/liufeng/synthesis/frontend/models/v2"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    # data_path = os.path.join(model_dir, "metadata_dev.txt")
    data_path = "/data1/liufeng/synthesis/feature/feature_prosody/" \
                "bzn/dev.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])
    # dev_corpus = [line.split("|")[6] for line in read_lines(data_path)]
    # dev_phones = [line.split("|")[5] for line in read_lines(data_path)]
    # print(dev_corpus[0])
    # line = dev_corpus[0]
    # x, y = split_psd(line)
    # print(x, y)
    # # exit()
    # metadata = [line for line in metadata if "bzn" in line]
    # print(metadata[0:3])
    text_path = os.path.join(eval_dir, "corpus.txt")
    # Field 6 holds the annotated sentence; drop spaces and prosody marks
    # for the plain corpus file.
    corpus = [
        rm_prosody(line.split("|")[6].replace(" ", "")) for line in metadata
    ]
    # print(corpus[0:3])
    write_lines(text_path, corpus)
    # exit()
    sub_count = 0
    # truth_path = os.path.join(eval_dir, "bc_dev.txt")
    truth_path = "output.txt"
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            phone = (meta.split("|")[5])
            sentence = clean_sentence(meta.split("|")[6].replace(" ", ""))
            _ss = sentence  # keep the annotated form for the "id:" header
            print(phone, sentence)
            # x, y = split_psd(sentence)
            # sentence = "".join(x)
            # assert len(y) == len(sentence)
            # if not check_exist_eng(sentence):
            #     continue
            fr.write("\nid:{}\n{}\n".format(sent_id, _ss))
            print("\nid:{}\n{}".format(sent_id, _ss))
            # Split the sentence and its phone string into aligned
            # sub-sentences (presumably on punctuation — TODO confirm).
            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            # print(len(y), len(sub_phones), len(sub_sentences))
            # NOTE: the loop variable `phone` below shadows the full-sentence
            # phone string bound above; original behavior is preserved.
            for split_id, (sent, phone) in enumerate(zip(sub_sentences,
                                                         sub_phones)):
                # Separate text from prosody labels, align char/phone pairs,
                # then re-attach each prosody label as a " #n" suffix.
                x, y = split_psd(sent)
                sent = "".join(x)
                print(sent, phone)
                pairs = phone2pairs(sent, phone)
                new_pairs = [(_x[0], _x[1], _y) for _x, _y in zip(pairs, y)]
                new_phone = [_y + " #" + _z for _x, _y, _z in new_pairs]
                # " #0" markers are stripped (0 appears to mean "no break"
                # — confirm against the prosody labelling scheme).
                new_phone = " ".join(new_phone).replace(" #0", "")
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, new_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     new_phone))
                sub_count += 1
            fr.write("split-end\n")
    # exit()
    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
def main():
    """Frontend CLI: convert raw text to phone sequences with prosody tags.

    Reads a text file, optionally normalizes each line, then for every
    sub-sentence emits "split-id:{} | sentence\nphone" records to the output
    file. Phone/prosody predictions from the neural models (when enabled)
    override the lexicon-based results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('yaml_path', help='config path for frontend')
    parser.add_argument('input_path', help='input path(txt)')
    parser.add_argument('output_path', help='output path(txt)')
    args = parser.parse_args()
    # todo: add sil label
    hparams = __load_hparams(args.yaml_path)
    text_path = args.input_path
    frontend_path = args.output_path
    flag_psd = hparams.flag_psd
    if hparams.norm_text:
        raw_file_lines = read_lines(text_path)
        sentences = []
        print("text normalize:")
        for line in raw_file_lines:
            new_line = Text().normalize(line)
            sentences.append(new_line.replace(" ", ""))
            if not new_line == line:
                print("{}->{}".format(line, new_line))
    else:
        sentences = read_lines(text_path)
    # Side effect: always dumps the (possibly normalized) input to norm.txt.
    write_lines("norm.txt", sentences)
    # exit()
    trans = TranscriptToPinyin(
        dic_path=hparams.dict_path,
        eng_dic_path=hparams.eng_dict_path,
    )
    # Neural prosody predictions, only when both the model and the prosody
    # flag are enabled.
    if hparams.nnet_psd and hparams.flag_psd:
        psd_predict, bert_psd_result = __compute_psd_result(
            hparams, sentences, hparams.load_memory_psd)
    else:
        psd_predict, bert_psd_result = None, None
    # Neural phone predictions (polyphone disambiguation).
    if hparams.nnet_phone:
        phone_predictor, bert_phone_result = __compute_nnet_phone_result(
            hparams, sentences, hparams.load_memory_phone)
    else:
        phone_predictor, bert_phone_result = None, None
    # `count` indexes bert_*_result per sub-sentence; it must advance in the
    # same order the nnet results were computed.
    sub_count, count = 0, 0
    with open(frontend_path, "w", encoding="utf-8") as frontend_file:
        for sent_id, sentence in enumerate(sentences[0:]):
            # sentence = num2hanzi(clean_sentence(sentence))
            frontend_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))
            sub_sentences = split_sentence(sentence)
            for split_id, sub_sentence in enumerate(sub_sentences):
                sub_count += 1
                # Lexicon-based (char, phone, prosody) triples as baseline.
                phone_pairs = trans.get_phone_pairs(
                    sub_sentence, change_eng_symbol=hparams.eng_symbol)
                new_ = []
                if hparams.nnet_phone:
                    bert_phone = bert_phone_result[count]
                    # Only override when the nnet output aligns 1:1 with the
                    # sub-sentence characters.
                    if len(sub_sentence) == len(bert_phone):
                        phone = phone_predictor.modify_result(
                            bert_phone, phone_pairs)
                        for i, (c, ph, p) in enumerate(phone_pairs):
                            new_.append((c, phone[i], p))
                        phone_pairs = new_
                    else:
                        print("Error for bert result")
                if flag_psd and not hparams.nnet_psd:
                    # Lexicon prosody: append "#n" tags; "#0"/"#5" appear to
                    # be no-break markers and are stripped — TODO confirm.
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in phone_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in phone_pairs])
                    sub_sentence = sub_sentence.replace("#0", "").replace("#5", "")
                elif flag_psd and hparams.nnet_psd:
                    # Neural prosody: replace the lexicon labels with the
                    # nnet's, then apply rule-based post-edits.
                    new_pairs = []
                    for new_psd, (char, ph, _) in zip(bert_psd_result[count],
                                                      phone_pairs):
                        new_pairs.append((char, ph, new_psd))
                    new_pairs = psd_predict.change_by_rules(new_pairs)
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in new_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in new_pairs])
                    sub_sentence = sub_sentence.replace("#0", "").replace("#5", "")
                else:
                    # No prosody requested: plain phones and plain text.
                    phone = " ".join([ph for _, ph, _ in phone_pairs])
                    sub_sentence = "".join([c for c, _, _ in phone_pairs])
                count += 1
                frontend_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sub_sentence, phone))
                print("split-id:{} | {} | {}".format(split_id, sub_sentence,
                                                     phone))
            frontend_file.write("split-end\n")
    # todo: improve pause/break prediction
    # todo: refactor — drop kashgari, switch to keras-bert
    print("\nsub count:{}".format(sub_count))
    print("write output data to {}".format(frontend_path))
def process(args):
    """Train the dual-encoder model, then build an Annoy index of questions.

    Phase 1: mini-batch training with periodic dev evaluation (show_steps)
    and checkpointing (save_steps), progress tracked by a Recorder.
    Phase 2: one full inference pass over train_x to encode every question,
    then the vectors are indexed with Annoy and saved to args.path['ann'].
    """
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)
    # Pin the visible GPU and cap per-process GPU memory before the session.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()
        for i in range(args.max_steps):
            # The recorder carries the batch cursor between iterations.
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)
            # Periodic dev evaluation on one batch (dropout disabled).
            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r' step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()
            # Periodic checkpoint; Recorder.stats() decides whether to save.
            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
        utils.write_result(args, recorder.lowest_loss)
        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        # Exactly one epoch over the training set: encode every question.
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
        # Keep only the first data_size rows (trims rows added by batch
        # padding in the last batch — presumably; verify against args.batch).
        vectors = np.reshape(np.array(vectors),
                             [-1, args.hidden])[:infer_batch.data_size]
        vec_dim = vectors.shape[-1]
        ann = AnnoyIndex(vec_dim)
        for n, ii in enumerate(vectors):
            ann.add_item(n, ii)
        ann.build(args.num_trees)
        ann.save(args.path['ann'])
        utils.verbose('Annoy has been dump in {}'.format(args.path['ann']))