def tok():
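    """Word-tokenize the shared Twitter source/target splits and write them into each
    domain's dif_models_<domain>/ directory as source/target .train/.test/.val .tok files.
    Relies on the repo's own helpers: a module-level `domains` list plus read_file_lines,
    tokenizer and write_file_lines."""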
    ori_path = 'training_data/'
    in_files = []
    out_files = []
    for domain in domains:
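        # the Twitter input files are the same for every domain; only the
        # dif_models_<domain> output directory below differs per iteration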
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_file1_0.4.txt')
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_test1_5k.txt')
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_val1_1k.txt')
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_file2_0.4.txt')
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_test2_5k.txt')
        in_files.append(ori_path + 'ori/Twitter_thresholds' +
                        '/Twitter_110k_val2_1k.txt')

        out_files.append(ori_path + 'dif_models_' + domain +
                         '/source.train.tok')
        out_files.append(ori_path + 'dif_models_' + domain +
                         '/source.test.tok')
        out_files.append(ori_path + 'dif_models_' + domain + '/source.val.tok')
        out_files.append(ori_path + 'dif_models_' + domain +
                         '/target.train.tok')
        out_files.append(ori_path + 'dif_models_' + domain +
                         '/target.test.tok')
        out_files.append(ori_path + 'dif_models_' + domain + '/target.val.tok')
    for f_in, f_out in zip(in_files, out_files):
        sens = read_file_lines(f_in)
        s_tok = tokenizer(sens, type='word', join=True)
        write_file_lines(path=f_out, lines=s_tok)
def tok():
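    """Word-tokenize the GYAFC-style formal/informal train/tune/test files for every
    domain, writing a parallel .tok file next to each input. Note: this is a second
    definition of tok() and would shadow the Twitter variant above if both were kept
    in one module."""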
    ori_path = '../training_data/'
    in_files = []
    out_files = []
    for domain in domains:
        in_files.append(ori_path + 'ori/' + domain + '/train/formal')
        in_files.append(ori_path + 'ori/' + domain + '/train/informal')
        in_files.append(ori_path + 'ori/' + domain + '/tune/formal.ref0')
        in_files.append(ori_path + 'ori/' + domain + '/tune/informal')
        in_files.append(ori_path + 'ori/' + domain + '/test/formal.ref0')
        in_files.append(ori_path + 'ori/' + domain + '/test/informal')
        in_files.append(ori_path + 'ori/' + domain + '/tune/formal')
        in_files.append(ori_path + 'ori/' + domain + '/test/formal')
        out_files.append(ori_path + 'ori/' + domain + '/train/formal.tok')
        out_files.append(ori_path + 'ori/' + domain + '/train/informal.tok')
        out_files.append(ori_path + 'ori/' + domain + '/tune/formal.ref0.tok')
        out_files.append(ori_path + 'ori/' + domain + '/tune/informal.tok')
        out_files.append(ori_path + 'ori/' + domain + '/test/formal.ref0.tok')
        out_files.append(ori_path + 'ori/' + domain + '/test/informal.tok')
        out_files.append(ori_path + 'ori/' + domain + '/tune/formal.tok')
        out_files.append(ori_path + 'ori/' + domain + '/test/formal.tok')
    for f_in, f_out in zip(in_files, out_files):
        sens = read_file_lines(f_in)
        s_tok = tokenizer(sens, type='word', join=True)
        write_file_lines(path=f_out, lines=s_tok)
def ensemble_test(domain='fr',
                  model_type=['ori', 'rule'],
                  beam_size=4,
                  max_dec_len=60,
                  dec_alpha=0.6):
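    """Decode with an ensemble of fine-tuned GPT-2 models, one model (and one
    preprocessed eval file) per entry in model_type. The member input files must be
    line-aligned; the beam-searched outputs are written to a single .ens file."""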
    model_dir = [
        './models_' + domain + '/' + t + '/formality_infer/'
        for t in model_type
    ]
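    # one preprocessed evaluation file per ensemble member (eval.<model_type>)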
    input_path = [
        '../training_data/dif_models_' + domain + '/eval.' + t
        for t in model_type
    ]
    output_path = '../evaluate/gyafc_model_outputs/' + domain + '_out/formal.gpt.' + '_'.join(
        model_type) + '.ens'
    gpt2 = GPT2(config_path)
    generator = ensemble_beam_search_generator(gpt2,
                                               beam_size=beam_size,
                                               model_directorys=model_dir,
                                               max_dec_len=max_dec_len,
                                               dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0], model_num=len(model_type))
    lines = [read_file_lines(p) for p in input_path]
    result = []
    line_len = [len(l) for l in lines]
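    # the member inputs must be line-aligned: line i of every file is the same source sentence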
    max_l, min_l = max(line_len), min(line_len)
    assert max_l == min_l
    for i in range(0, max_l):
        result.append(
            generator.generate(
                sess, [lines[j][i] for j in range(0, len(model_type))]))
    sess.close()
    write_file_lines(output_path, result)
Example #4
def cat_files(f_list, out_path, tokenizer, max_len):
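    """Join the i-th lines of all files in f_list with tabs (the files must have the
    same number of lines) and write the tab-joined lines to out_path through the
    repo's write_file_lines, forwarding the tokenizer and max_len arguments."""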
    f_lines = [read_file_lines(f) for f in f_list]
    new_lines = []
    f_len = [len(x) for x in f_lines]
    print(f_len)
    assert max(f_len) == min(f_len)
    length = max(f_len)
    for i in range(0, length):
        texts = []
        for j in range(0, len(f_lines)):
            texts.append(f_lines[j][i])
        new_lines.append('\t'.join(texts))
    return write_file_lines(path=out_path,
                            lines=new_lines,
                            tokenizer=tokenizer,
                            max_len=max_len)
def test(model_dir,
         input_path,
         output_path,
         beam_size=4,
         max_dec_len=60,
         dec_alpha=0.6):
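    """Beam-search decode every line of input_path with a single fine-tuned GPT-2
    model restored from model_dir, stopping at the newline token, and write the
    generated lines to output_path."""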
    gpt2 = GPT2(config_path)
    generator = beam_search_generator(gpt2,
                                      beam_size=beam_size,
                                      model_directory=model_dir,
                                      max_dec_len=max_dec_len,
                                      dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0])
    lines = read_file_lines(input_path)
    result = []
    for line in lines:
        result.append(generator.generate(sess, line))
    sess.close()
    write_file_lines(output_path, result)
Example #6
def test(
        config_path,
        input_num,
        model_dir='./models/ori_rule/formality_infer/',
        input_path='../training_data/dif_models/eval.ori_rule',
        output_path='../evaluate/gyafc_model_outputs/fr_out/formal.gpt.cat_ori_rule.old',
        beam_size=4,
        max_dec_len=60,
        dec_alpha=0.6):
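    """Variant of test() for the multi-input NMT_GPT model: each input line carries
    input_num segments (presumably tab-separated, matching cat_files above) and is
    decoded with multi_pls=True; every source/output pair is printed as it is
    generated."""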
    gpt2 = NMT_GPT(config_path=config_path, input_num=input_num)
    generator = beam_search_generator(gpt2,
                                      beam_size=beam_size,
                                      model_directory=model_dir,
                                      max_dec_len=max_dec_len,
                                      dec_alpha=dec_alpha)
    sess = generator.build_graph_and_restore(
        eos_id=gpt2.text_enc.encode('\n')[0])
    lines = read_file_lines(input_path)
    result = []
    for line in lines:
        result.append(generator.generate(sess, line, multi_pls=True))
        print(line + ' ||| ' + result[-1].strip())
    sess.close()
    write_file_lines(output_path, result)
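A minimal invocation sketch for the tokenization and ensemble-decoding functions above (hedged: the domain and model_type values simply mirror the defaults shown earlier, and `config_path` plus the repo's helper modules are assumed to be importable):

if __name__ == '__main__':
    tok()                                       # word-tokenize the raw splits first
    ensemble_test(domain='fr',                  # same default domain as above
                  model_type=['ori', 'rule'],   # ensemble the 'ori' and 'rule' models
                  beam_size=4)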