def test_formality_score(files=None):
    """Score each system's output file(s) with the formality classifier.

    Args:
        files: mapping of system name -> either a list of already-BPE'd
            file paths, or a bare path string (``'.bpe'`` is appended).
            Defaults to the Family_Relationships system outputs.

    Returns:
        dict mapping system name -> mean formality probability over its
        sentences (the classifier's class-1 probability).
    """
    if files is None:
        files = {
            'rule_based': ['./data/Family_Relationships/bpe_outputs/formal.rule_based.bpe'],
            'pbmt': ['./data/Family_Relationships/bpe_outputs/formal.pbmt.bpe'],
            'nmt_baseline': ['./data/Family_Relationships/bpe_outputs/formal.nmt_baseline.bpe'],
            'nmt_copy': ['./data/Family_Relationships/bpe_outputs/formal.nmt_copy.bpe'],
            'nmt_combined': ['./data/Family_Relationships/bpe_outputs/formal.nmt_combined.bpe'],
        }
    embedding_path = './new_exp_fr/embedding/embedding.bpe.big.txt'
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='eval')
    nn.batch_size = 128
    nn.build_basic_rnn_model()
    eval_log = {}
    for key, value in files.items():
        # BUG FIX: the original did `files[key] + '.bpe'` on the list branch,
        # which raises TypeError (list + str) for the default dict above.
        # List entries are already full paths (cf. cal_formality_score_for_
        # each_sentence, which uses the list directly); only bare string
        # values still need the '.bpe' suffix.
        if isinstance(value, list):
            fm_files = value
        else:
            fm_files = [value + '.bpe']
        data = preprocess(informal_src_list=[], formal_src_list=fm_files,
                          embedding_path=embedding_path, shuffle=False)
        result = nn.predict_prob([t.x for t in data],
                                 model_path='./new_exp_fr/classifier/model/1700model.ckpt')
        score = sum(s[1] for s in result)  # class 1 = "formal" probability
        print(key, score / len(data))
        eval_log[key] = score / len(data)
    return eval_log
def test():
    """Evaluate the trained classifier on the pickled held-out test set."""
    # Renamed from `test` to avoid shadowing this function's own name,
    # and use a context manager so the pickle handle is closed promptly.
    with open('./new_exp_fr/classifier/test.pkl', 'rb') as fh:
        test_data = pickle.load(fh)
    embedding_path = './new_exp_fr/embedding/corpus.fine_tune_embedding.epoch.10'
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='eval')
    nn.build_basic_rnn_model()
    # NOTE(review): model_path='' looks like a placeholder — confirm the
    # intended checkpoint path before relying on this evaluation.
    nn.evaluate([t.x for t in test_data], [t.y for t in test_data], model_path='')
def predict(model_path,
            file_path='./new_exp_fr/classifier/val.pkl',
            embedding_path='./new_exp_fr/embedding/corpus.fine_tune_embedding.epoch.10'):
    """Run the classifier over a pickled dataset.

    Args:
        model_path: checkpoint to restore the classifier from.
        file_path: pickled list of Data samples to score.
        embedding_path: word-embedding file used to build the model.

    Returns:
        (samples, result) — the loaded samples and the per-sample
        probability rows from ``NNModel.predict_prob``.
    """
    # Context manager replaces the original's leaked `open(...)` handle.
    with open(file_path, 'rb') as fh:
        samples = pickle.load(fh)
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='predict')
    nn.batch_size = 10000
    nn.build_basic_rnn_model()
    result = nn.predict_prob([t.x for t in samples], model_path=model_path)
    return samples, result
def use_nn_model():
    """Train the formality classifier on the pickled train/val splits.

    Loads the BPE embedding, builds the RNN model in 'train' mode and
    runs training from scratch (continue_train=False).
    """
    # Context managers replace the original's two leaked file handles.
    with open('./new_exp_fr/classifier/train.pkl', 'rb') as fh:
        train = pickle.load(fh)
    with open('./new_exp_fr/classifier/val.pkl', 'rb') as fh:
        val = pickle.load(fh)
    embedding_path = './new_exp_fr/embedding/embedding.bpe.big.txt'
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='train')
    nn.build_basic_rnn_model()
    nn.train_model([t.x for t in train], [t.y for t in train],
                   [t.x for t in val], [t.y for t in val],
                   continue_train=False,
                   previous_model_path='./new_exp_fr/classifier/model/990model.ckpt')
def preprocess(informal_src_list, formal_src_list, embedding_path, output_path=None, shuffle=True):
    """Build labelled Data samples from tokenised sentence files.

    Args:
        informal_src_list: files whose lines are labelled 0 (informal).
        formal_src_list: files whose lines are labelled 1 (formal).
        embedding_path: embedding file providing the vocabulary hash.
        output_path: if given, pickle the resulting list here.
        shuffle: shuffle the combined samples in place.

    Returns:
        list of Data objects with word indices attached via ``str2index``.
    """
    vectors, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    all_data = []

    def _read(src_list, label):
        # One Data per line: whitespace tokens, binary label, raw sentence.
        for src in src_list:
            with open(src, 'r', encoding='utf-8') as f:
                for line in f:
                    d = Data(line.strip().split(), label, line.strip())
                    d.str2index(vocab_hash, with_unk=False)
                    all_data.append(d)

    _read(informal_src_list, 0)
    _read(formal_src_list, 1)
    if shuffle:
        random.shuffle(all_data)
    if output_path is not None:
        # FIX: the original passed protocol=True (bool == protocol 1) and
        # leaked the output handle. Protocol 1 is kept explicitly so
        # existing pickles remain byte-compatible.
        with open(output_path, 'wb') as fw:
            pickle.dump(all_data, fw, protocol=1)
    return all_data
def cal_formality_score_for_each_sentence(output_dir, files=None):
    """Write one per-sentence formality score file per system output.

    For every entry in *files*, scores each sentence with the trained
    classifier and writes the class-1 probability, one per line, to
    ``<output_dir>/<basename>.formality_score``.

    Args:
        output_dir: directory receiving the ``*.formality_score`` files.
        files: mapping of system name -> list of BPE'd file paths;
            defaults to the Family_Relationships system outputs.
    """
    if files is None:
        files = {
            'rule_based': ['./data/Family_Relationships/bpe_outputs/formal.rule_based.bpe'],
            'pbmt': ['./data/Family_Relationships/bpe_outputs/formal.pbmt.bpe'],
            'nmt_baseline': ['./data/Family_Relationships/bpe_outputs/formal.nmt_baseline.bpe'],
            'nmt_copy': ['./data/Family_Relationships/bpe_outputs/formal.nmt_copy.bpe'],
            'nmt_combined': ['./data/Family_Relationships/bpe_outputs/formal.nmt_combined.bpe'],
        }
    embedding_path = './new_exp_fr/embedding/embedding.bpe.big.txt'
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='eval')
    nn.batch_size = 128
    nn.build_basic_rnn_model()
    for key, src_files in files.items():
        data = preprocess(informal_src_list=[], formal_src_list=src_files,
                          embedding_path=embedding_path, shuffle=False)
        result = nn.predict_prob([t.x for t in data],
                                 model_path='./new_exp_fr/classifier/model/1700model.ckpt')
        # BUG FIX: files[key] is a list, and os.path.basename(list) raises
        # TypeError. Name the output after the first (typically only) file.
        base_name = os.path.basename(src_files[0])
        with open(os.path.join(output_dir, base_name + '.formality_score'),
                  'w', encoding='utf-8') as fw:
            for r in result:
                fw.write(str(r[1]) + '\n')
def evaluate_one_formality(input_file_path, is_inf):
    """Mean classifier probability that sentences match their register.

    Args:
        input_file_path: BPE'd sentence file to score.
        is_inf: True to treat the file as informal (score class 0),
            False to treat it as formal (score class 1).

    Returns:
        The average probability over all sentences (also printed).
    """
    embedding_path = './new_exp_fr/embedding/embedding.bpe.big.txt'
    embedding, vocab_hash = embedding_api.load_word_embedding(embedding_path)
    nn = NNModel(np.array(embedding), mode='eval')
    nn.batch_size = 128
    nn.build_basic_rnn_model()
    # Route the file to the matching side of preprocess.
    informal_files = [input_file_path] if is_inf else []
    formal_files = [] if is_inf else [input_file_path]
    data = preprocess(informal_src_list=informal_files,
                      formal_src_list=formal_files,
                      embedding_path=embedding_path, shuffle=False)
    result = nn.predict_prob([t.x for t in data],
                             model_path='./new_exp_fr/classifier/model/1700model.ckpt')
    # Column 0 is the informal probability, column 1 the formal one.
    column = 0 if is_inf else 1
    average = sum(row[column] for row in result) / len(data)
    print(average)
    return average