def tok():
    # Word-tokenize the Twitter parallel corpus into each domain's
    # dif_models_* directory. Note the six Twitter input files are the
    # same for every domain; only the output directory varies with
    # `domain` (`domains`, `tokenizer`, and the file helpers are
    # module-level names defined elsewhere in the repo).
    ori_path = 'training_data/'
    twitter_dir = ori_path + 'ori/Twitter_thresholds/'
    in_names = ['Twitter_110k_file1_0.4.txt', 'Twitter_110k_test1_5k.txt',
                'Twitter_110k_val1_1k.txt', 'Twitter_110k_file2_0.4.txt',
                'Twitter_110k_test2_5k.txt', 'Twitter_110k_val2_1k.txt']
    out_names = ['source.train.tok', 'source.test.tok', 'source.val.tok',
                 'target.train.tok', 'target.test.tok', 'target.val.tok']
    in_files, out_files = [], []
    for domain in domains:
        in_files += [twitter_dir + name for name in in_names]
        out_files += [ori_path + 'dif_models_' + domain + '/' + name
                      for name in out_names]
    for f_in, f_out in zip(in_files, out_files):
        sens = read_file_lines(f_in)
        s_tok = tokenizer(sens, type='word', join=True)
        write_file_lines(path=f_out, lines=s_tok)
def tok():
    # Variant of tok() for the formal/informal (GYAFC-style) splits:
    # word-tokenize each split in place, writing a .tok file next to
    # every original.
    ori_path = '../training_data/'
    names = ['train/formal', 'train/informal', 'tune/formal.ref0',
             'tune/informal', 'test/formal.ref0', 'test/informal',
             'tune/formal', 'test/formal']
    in_files, out_files = [], []
    for domain in domains:
        base = ori_path + 'ori/' + domain + '/'
        in_files += [base + name for name in names]
        out_files += [base + name + '.tok' for name in names]
    for f_in, f_out in zip(in_files, out_files):
        sens = read_file_lines(f_in)
        s_tok = tokenizer(sens, type='word', join=True)
        write_file_lines(path=f_out, lines=s_tok)
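# --- Assumed helpers (not defined in this listing) ----------------------
# Both tok() variants call read_file_lines / write_file_lines without
# defining them. The sketch below is a minimal guess at plausible
# line-based implementations so the code above reads self-contained; the
# repo's real versions may differ (write_file_lines, for instance, also
# takes tokenizer/max_len keywords in cat_files below, whose semantics
# are assumed here).
def read_file_lines(path):
    # Return the file's lines with trailing newlines stripped.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def write_file_lines(path, lines, tokenizer=None, max_len=None):
    # tokenizer/max_len mirror the keyword arguments used by cat_files
    # below; how the repo applies them is unknown, so this sketch simply
    # writes the lines verbatim, one per line.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')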
def ensemble_test(domain='fr',
                  model_type=['ori', 'rule'],
                  beam_size=4,
                  max_dec_len=60,
                  dec_alpha=0.6):
    # Decode the eval set with an ensemble of fine-tuned GPT-2 models,
    # one per entry in model_type, combining their predictions at each
    # beam-search step.
    model_dir = ['./models_' + domain + '/' + t + '/formality_infer/'
                 for t in model_type]
    input_path = ['../training_data/dif_models_' + domain + '/eval.' + t
                  for t in model_type]
    output_path = ('../evaluate/gyafc_model_outputs/' + domain +
                   '_out/formal.gpt.' + '_'.join(model_type) + '.ens')
    gpt2 = GPT2(config_path)
    generator = ensemble_beam_search_generator(gpt2,
                                               beam_size=beam_size,
                                               model_directorys=model_dir,
                                               max_dec_len=max_dec_len,
                                               dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0],
        model_num=len(model_type))
    lines = [read_file_lines(p) for p in input_path]
    # All per-model input files must be line-aligned.
    line_len = [len(l) for l in lines]
    max_l, min_l = max(line_len), min(line_len)
    assert max_l == min_l
    result = []
    for i in range(max_l):
        result.append(generator.generate(
            sess, [lines[j][i] for j in range(len(model_type))]))
    sess.close()
    write_file_lines(output_path, result)
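# ensemble_beam_search_generator is repo code and its combination rule is
# not shown in this listing. A common choice (an assumption, not confirmed
# here) is to average the per-model next-token distributions in
# probability space at every decoding step. Self-contained sketch:
import numpy as np


def combine_model_log_probs(per_model_log_probs):
    # per_model_log_probs: list of [vocab_size] arrays, one per model.
    # Average in probability space, then return to log space; taking a
    # plain mean of the log-probs (product of experts) is the usual
    # alternative.
    probs = np.mean([np.exp(lp) for lp in per_model_log_probs], axis=0)
    return np.log(probs + 1e-12)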
def cat_files(f_list, out_path, tokenizer, max_len):
    # Join the i-th lines of all files in f_list with tabs and write the
    # result as a single parallel file.
    f_lines = [read_file_lines(f) for f in f_list]
    f_len = [len(x) for x in f_lines]
    print(f_len)
    # The input files must be line-aligned.
    assert max(f_len) == min(f_len)
    new_lines = ['\t'.join(texts) for texts in zip(*f_lines)]
    return write_file_lines(path=out_path, lines=new_lines,
                            tokenizer=tokenizer, max_len=max_len)
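# Worked example (illustrative file contents, not from the repo): if
# a.txt contains the line "how r u" and b.txt contains "How are you?",
# then
#     cat_files(['a.txt', 'b.txt'], 'ab.txt', tokenizer=None, max_len=60)
# writes the single line
#     how r u\tHow are you?
# i.e., one tab-separated source per input file, which matches the
# multi-input eval format the second test() below appears to consume.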
def test(model_dir, input_path, output_path,
         beam_size=4, max_dec_len=60, dec_alpha=0.6):
    # Decode input_path line by line with a single fine-tuned GPT-2 model
    # and write the beam-search outputs to output_path.
    gpt2 = GPT2(config_path)
    generator = beam_search_generator(gpt2,
                                      beam_size=beam_size,
                                      model_directory=model_dir,
                                      max_dec_len=max_dec_len,
                                      dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0])
    lines = read_file_lines(input_path)
    result = [generator.generate(sess, line) for line in lines]
    sess.close()
    write_file_lines(output_path, result)
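# dec_alpha in test() and ensemble_test() looks like a GNMT-style length
# penalty exponent (an assumption; the repo's beam_search_generator is not
# shown here). Under that convention, finished beams are ranked by
# score(y) = log P(y | x) / lp(|y|), with lp as below:
def gnmt_length_penalty(length, alpha=0.6):
    # lp(n) = ((5 + n) / 6) ** alpha; alpha = 0 disables normalization,
    # larger alpha favors longer outputs.
    return ((5.0 + length) / 6.0) ** alpha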
def test(config_path,
         input_num,
         model_dir='./models/ori_rule/formality_infer/',
         input_path='../training_data/dif_models/eval.ori_rule',
         output_path='../evaluate/gyafc_model_outputs/fr_out/formal.gpt.cat_ori_rule.old',
         beam_size=4,
         max_dec_len=60,
         dec_alpha=0.6):
    # Variant of test() for the multi-input NMT_GPT model: each input
    # line carries input_num source texts (presumably tab-separated, as
    # produced by cat_files), decoded with multi_pls=True.
    gpt2 = NMT_GPT(config_path=config_path, input_num=input_num)
    generator = beam_search_generator(gpt2,
                                      beam_size=beam_size,
                                      model_directory=model_dir,
                                      max_dec_len=max_dec_len,
                                      dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0])
    lines = read_file_lines(input_path)
    result = []
    for line in lines:
        result.append(generator.generate(sess, line, multi_pls=True))
        print(line + ' ||| ' + result[-1].strip())
    sess.close()
    write_file_lines(output_path, result)
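# Hypothetical driver (the config path is a placeholder, not from the
# repo). This listing evidently concatenates several scripts, since tok()
# and test() are each defined twice; in a single module the later
# definitions shadow the earlier ones, so the call below reaches the
# multi-input test() directly above. Its default input_path,
# eval.ori_rule, is the tab-joined file cat_files produces.
if __name__ == '__main__':
    test(config_path='config.json',  # placeholder GPT-2 config path
         input_num=2)                # two tab-separated inputs per line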