def word2vec_sim_en(f_tuple_list): print 'load word2vec model...' idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) model = KeyedVectors.load_word2vec_format( r'%s/%s' % (macro.DICT_DIR, 'GoogleNews-vectors-negative300.bin'), binary=True) # model = KeyedVectors.load_word2vec_format(r'%s/cn.skipgram.bin' % (macro.DICT_DIR), binary=True, unicode_errors='ignore') auto_sim_list = [] count = 0 for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): try: auto_sim = model.similarity(w1, w2) # 将余弦相似度放到0-10得分 # 将余弦相似度-1~1放到1~10得分 auto_sim = utils.convert_sim(auto_sim, mode=0) except: auto_sim = -1 count += 1 print "w2v:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim) auto_sim_list.append(auto_sim) print 'count=%s/%s' % (len(manu_sim_list) - count, len(manu_sim_list)) print 'spearman=%0.5f/%0.5f' % (eval.spearman( manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True)) print 'pearson=%0.5f/%0.5f' % (eval.pearson( manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True)) return auto_sim_list
def calculate_sim_without_tag(self, load_model, ofname, write_flag=True): # 加载指定w2v model w2v_model = Word2Vec.load_word2vec_format( r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True) # C format # 读入评测词对语料 id_list, word1_list, word2_list, headline = utils.read2wordlist( self.f_tuple_list, mode='no_tag') # 新的题头 new_headline = headline.strip() + '\tPrediction\n' # 计算相似度 auto_sim_list = [] for w1, w2 in zip(word1_list, word2_list): try: auto_sim = w2v_model.similarity(w1, w2) # 向量余弦相似度[-1,1] auto_sim = utils.convert_sim(auto_sim) # 将余弦相似度放到1-10得分 print '%-10s\t%-10s\t%-10s' % (w1, w2, auto_sim) except: auto_sim = 1 # 未登录词,为了区分1.0,赋值为1 print '%-10s\t%-10s\t%-10s' % (w1, w2, '______Not Found______') auto_sim_list.append(auto_sim) # 相似度计算的结果是否写入文件 if write_flag: print 'write result to file...' with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw: fw.write(new_headline) for w1, w2, auto_sim in zip(word1_list, word2_list, auto_sim_list): fw.write('%s\t%s\t%s\n' % (w1, w2, auto_sim)) return word1_list, word2_list, auto_sim_list, new_headline
def calculate_sim(self, load_model, ofname, write_flag=True): # 加载指定w2v model w2v_model = Word2Vec.load_word2vec_format( r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True) # C format # 读入评测词对语料 id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist( self.f_tuple_list, mode='tag') # 新的题头 new_headline = headline.strip() + '\tPrediction\n' # 计算相似度 auto_sim_list = [] for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list, manu_sim_list): try: auto_sim = w2v_model.similarity(w1, w2) # 向量余弦相似度[-1,1] print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % ( id, w1, w2, manu_sim, auto_sim) except: auto_sim = 0 # 未登录词,为了区分1.0,赋值为1 print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % ( id, w1, w2, manu_sim, '______Not Found______') auto_sim = utils.convert_sim(auto_sim, mode=1) # 将余弦相似度放到1-10得分 auto_sim_list.append(auto_sim) # 相似度计算的结果是否写入文件 if write_flag: print 'write result to file...' with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw: fw.write(new_headline) for id, w1, w2, manu_sim, auto_sim in zip( id_list, word1_list, word2_list, manu_sim_list, auto_sim_list): fw.write('%s\t%s\t%s\t%s\t%s\n' % (str(id), w1.encode('utf-8'), w2.encode('utf-8'), manu_sim, auto_sim)) # 评价结果 r = eval.spearman(manu_sim_list, auto_sim_list) p = eval.pearson(manu_sim_list, auto_sim_list) print '!!!spearman=%s; pearson=%s' % (r, p) # 可视化结果 data = { 'ID': id_list, 'Word1': word1_list, 'Word2': word2_list, 'Score': manu_sim_list, 'Prediction': auto_sim_list } frame = DataFrame(data) sns.jointplot("Score", "Prediction", frame, kind='reg', stat_func=eval.spearmanr) plt.savefig('%s/%s.jpg' % (macro.PICS_DIR, ofname)) return word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline
def cilin_sim(f_tuple_list): cs = loadCilin() idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) result1 = [] result2 = [] result3 = [] count = 0 for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): sim1 = cs.similarity(w1, w2) sim2 = cs.sim2013(w1, w2) sim3 = cs.sim2016(w1, w2) # 字典中查找到的词 if sim3 >= 0: count += 1 # 分制转成1-10 sim1 = utils.convert_sim(sim1, mode=1) sim2 = utils.convert_sim(sim2, mode=1) sim3 = utils.convert_sim(sim3, mode=1) else: pass # 未查找到的词认为相似度很低 sim1, sim2, sim3 = -1, -1, -1 # push result1.append(sim1) result2.append(sim2) result3.append(sim3) print "cilin:proc_id= %s [%s,%s] %s (%0.2f, %0.2f, %0.2f)" % ( id, w1, w2, manu_sim, sim1, sim2, sim3) # 统计与评价 print 'found_pair=%s/%s' % (count, len(manu_sim_list)) print 'sim1: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson( manu_sim_list, result1), eval.pearson( manu_sim_list, result1, True), eval.spearman( manu_sim_list, result1), eval.spearman(manu_sim_list, result1, True)) print 'sim2: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson( manu_sim_list, result2), eval.pearson( manu_sim_list, result2, True), eval.spearman( manu_sim_list, result2), eval.spearman(manu_sim_list, result2, True)) print 'sim3: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson( manu_sim_list, result3), eval.pearson( manu_sim_list, result3, True), eval.spearman( manu_sim_list, result3), eval.spearman(manu_sim_list, result3, True)) return (result1, result2, result3)
def train_ext_vocab_choose_best(self, save_model, result_fname, last_val): # 获取评价词对 id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(self.f_tuple_list, mode='tag') # 获取语料 sentences = [] for seg_docs_dir in self.seg_docs_dir_list: if type(seg_docs_dir) == tuple: sens = utils.atxt2sens(seg_docs_dir[0], seg_docs_dir[1]) else: sens = utils.txts2sens(seg_docs_dir) sentences.extend(sens) # 得到模型方式:load之前的模型 OR 训练词向量模型 if last_val == -2: print 'load previous model....' model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, save_model), binary=True) else: model = Word2Vec(sentences, sg=1, size=300, window=10, negative=0, hs=1, sample=1e-4, workers=8, min_count=5) # 评价相似度 auto_sim_list = [] for w1, w2, manu_sim in zip(word1_list, word2_list, manu_sim_list): try: auto_sim = model.similarity(w1, w2) # 将余弦相似度放到1-10得分 auto_sim = utils.convert_sim(auto_sim) # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim) except: auto_sim = 1 # 为了区分没有找到的情况,用1代替1.0 print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______') auto_sim_list.append(auto_sim) # 保留val大的模型 val = eval.spearman(manu_sim_list, auto_sim_list) if val > last_val: model.save_word2vec_format('%s/%s' % (macro.MODELS_DIR, save_model), binary=True) # 保存模型 print 'write result to file...' with open('%s/%s' % (macro.RESULTS_DIR, result_fname), 'w') as fw: fw.write(headline.strip() + '\tPrediction\n') for w1, w2, manu_sim, auto_sim in zip(word1_list, word2_list, manu_sim_list, auto_sim_list): fw.write('%s\t%s\t%s\t%s\n' % (w1.encode('utf-8'), w2.encode('utf-8'), manu_sim, auto_sim)) else: print ':::::::current val=', val return val
def hnet_sim(f_tuple_list): ''' bt_xiepeiyiVerb.dic:每一行是一个协陪义动词 #对于每一个协陪义动词,得到“当前协陪义动词--glossary.dat中动词 相似度” 相似度个数 = bt_xiepeiyiVerb.dic中有多少行乘以glossary.dat中动词数量,对所有相似度从大到小排序,结果存放在result.txt中 ''' generatePlabel = False SIMILARITY = True BETA = [0.5, 0.2, 0.17, 0.13] GAMA = 0.2 DELTA = 0.2 ALFA = 1.6 glossaryfile = '%s/%s' % (macro.DICT_DIR, macro.WN_GLOSS_DICT) xiepeiyidic = '%s/%s' % (macro.DICT_DIR, macro.WN_XPY_VERB_DICT) sememefile = '%s/%s' % (macro.DICT_DIR, macro.WN_WHOLE_DICT) if generatePlabel: lines = generateSourcefile(glossaryfile, xiepeiyidic) print('There are ' + str(len(lines)) + ' lines!!') if SIMILARITY: obj = WordSimilarity() if obj.init(sememefile, glossaryfile) == False: print("[ERROR] init failed!!") count = 0 auto_sim_list = [] idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): auto_sim = obj.calc(w1.encode('utf-8'), w2.encode('utf-8'), BETA, GAMA, DELTA, ALFA) if auto_sim >= 0: # 0-1放缩到1-10 auto_sim = utils.convert_sim(auto_sim, mode=1) count += 1 else: auto_sim = -1 auto_sim_list.append(auto_sim) print "hownet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim) print 'count=%s/%s' % (count, len(manu_sim_list)) print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True)) print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True)) return auto_sim_list
def cilin_webtest(w1l, w2l):
    """Score word pairs with Cilin sim2016 and format an HTML snippet.

    :param w1l: first words
    :param w2l: second words (paired element-wise with w1l)
    :return: '<br/>'-separated lines of 'w1 w2 score'; score is on the 1-10
        scale or the literal '__NOTFOUND__' for unseen pairs
    """
    cs = loadCilin()
    # build the output with join instead of quadratic string +=; the
    # original's unused `count`/`result3` accumulators are dropped
    chunks = []
    for w1, w2 in zip(w1l, w2l):
        sim3 = cs.sim2016(w1, w2)
        if sim3 >= 0:
            # pair found in the dictionary: rescale onto the 1-10 range
            sim3 = utils.convert_sim(sim3, mode=1)
        else:
            # unseen pair
            sim3 = '__NOTFOUND__'
        chunks.append('%s %s %s<br/>' % (w1, w2, sim3))
    return ''.join(chunks)
def combine_zh_en(): d = enchant.Dict('en_US') _, en_w1_list, en_w2_list, _, _ = utils.read2wordlist([(macro.CORPUS_DIR, 'en_'+macro.NLPCC_FML_FILE)], mode='tag') _, _, _, manu_sim_list, _ = utils.read2wordlist([(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)], mode='tag') # 这里换成想要提升的结果文件 # id_list, w1_list, w2_list, manu_sim_list, auto_sim_list, headline = \ # utils.read2wordlist([(macro.RESULTS_DIR, macro.FML_ORG_BDNEWS_XIESO_RESULT)], mode='auto_tag') id_list, w1_list, w2_list, auto_sim_list, headline = \ utils.read2wordlist([(macro.RESULTS_DIR, 'lstm.result')], mode='tag') w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, macro.GOOGLE_EN_W2V_MODEL), binary=True) # the English model fw2 = open(r'%s/%s' % (macro.RESULTS_DIR, macro.FML_ORG_GOOGLE_EN_W2V_RESULT), 'w') fw2.write(headline) new_auto_sim_list = [] count = 0 for id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim in \ zip(id_list, w1_list, en_w1_list, w2_list, en_w2_list, manu_sim_list, auto_sim_list): # print id, '====' if d.check(trans_w1) and d.check(trans_w2): if len(trans_w1.split()) <= 1 and len(trans_w2.split()) <= 1: try: auto_sim = w2v_model.similarity(trans_w1, trans_w2) auto_sim = utils.convert_sim(auto_sim, mode=0) # 将余弦相似度放到1-10得分 count += 1 except: pass print '%s\t%s[%s];%s[%s]\tmanu_sim=%s\tauto_sim=%s' % (id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim) new_auto_sim_list.append(float(auto_sim)) line2 = '%s\t%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim, auto_sim) fw2.write(line2.encode('utf-8')) fw2.close() # 评价结果 print 'count=', count r = eval.spearman(manu_sim_list, new_auto_sim_list) p = eval.pearson(manu_sim_list, new_auto_sim_list) print '!!!spearman=%s; pearson=%s' % (r, p)
def analysis(word_vectoers, read2wordlist): ''' :param word_vectoers: dict类型,词向量 :return: id列表,词的列表,以及以word_vectors计算的相似度list ''' idl, w1l, w2l, score, headline = read2wordlist auto_sims = [] print 'len of pairs:', len(idl) for w1, w2,s in zip(w1l, w2l, score): sim = 0 try: sim = (1 - distance(word_vectoers[w1.encode('utf-8')], word_vectoers[w2.encode('utf-8')])) except: sim = 0 # print w1,w2,sim sim = utils.convert_sim(sim, mode=0) auto_sims.append(sim) # print w1,w2,sim,s p = spearmanr(auto_sims, score) r = pearsonr(auto_sims, score) print 'spearmanr:', p, '\tpearson:', r return idl,w1l,w2l,auto_sims, p, r
def cwordnet_sim(f_tuple_list, cmn='cmn'): print 'load cwordnet_sim...' cwordnet_sim_list = [] idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) count = 0 for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): auto_sim = cwn_sim(w1, w2, cmn) # 字典中查找到的词 if auto_sim >= 0: count += 1 # 分制转成1-10 auto_sim = utils.convert_sim(auto_sim, mode=1) else: pass # 未查找到的词 auto_sim = -1 print "cwordnet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim) cwordnet_sim_list.append(auto_sim) print 'count=%s/%s' % (count, len(manu_sim_list)) print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, cwordnet_sim_list), eval.spearman(manu_sim_list, cwordnet_sim_list, True)) print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, cwordnet_sim_list), eval.pearson(manu_sim_list, cwordnet_sim_list, True)) return cwordnet_sim_list
def ir_sim(f_tuple_list, ofname='NLPCC_Formal500_single_sims_ir_nums0.pk'): print 'ir sim ...' idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) nums_pk_path = '%s/%s' % (macro.RESULTS_DIR, ofname) if os.path.exists(nums_pk_path): print 'load nums...' f = open(nums_pk_path, 'rb') n1l, n2l, n3l = pickle.load(f) f.close() else: print 'retrieval nums...' n1l, n2l, n3l = get_nums_list(w1l, w2l) f = open(nums_pk_path, 'wb') pickle.dump((n1l, n2l, n3l), f) f.close() with open(nums_pk_path.split('.')[0]+'_nums.csv', 'w') as fw: for id, w1, w2, n1, n2, n3 in zip(idl, w1l, w2l, n1l, n2l, n3l): new_line = '%s,%s,%s,%s,%s,%s' % (id, w1, w2, n1, n2, n3) fw.write(new_line.encode('gbk')+'\n') N = pow(10, 16) jcd_list, ovl_list, dice_list, pmi_list, ngd_list = [], [], [], [], [] for num1, num2, num3, id, w1, w2, manu_sim in zip(n1l, n2l, n3l, idl, w1l, w2l, manu_sim_list): jcd = utils.convert_sim(web_jaccard(num1, num2, num3), mode=1) ovl = utils.convert_sim(web_overlap(num1, num2, num3), mode=1) dice = utils.convert_sim(web_dice(num1, num2, num3), mode=1) pmi = utils.convert_sim(web_pmi(num1, num2, num3, N), mode=1) ngd = utils.convert_sim(web_ngd(num1, num2, num3, N), mode=1) jcd_list.append(jcd) ovl_list.append(ovl) dice_list.append(dice) pmi_list.append(pmi) ngd_list.append(ngd) # print "ir:proc_id= %s [%s,%s] %s (%.5f, %.5f, %.5f, %.5f, %.5f) " % (id, w1, w2, manu_sim, jcd, ovl, dice, pmi, ngd) from prettytable import PrettyTable x = PrettyTable(["Eval", "jaccard", "overlap", "dice", "pmi", "ngd"]) x.align["Eval"] = "l" x.padding_width = 1 x.add_row(['Spearman', '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, jcd_list), eval.spearman(manu_sim_list, jcd_list, True)), '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ovl_list), eval.spearman(manu_sim_list, ovl_list, True)), '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, dice_list), eval.spearman(manu_sim_list, dice_list, True)), '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, pmi_list), 
eval.spearman(manu_sim_list, pmi_list, True)), '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ngd_list), eval.spearman(manu_sim_list, ngd_list, True))]) x.add_row(['Pearson', '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, jcd_list), eval.pearson(manu_sim_list, jcd_list, True)), '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ovl_list), eval.pearson(manu_sim_list, ovl_list, True)), '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, dice_list), eval.pearson(manu_sim_list, dice_list, True)), '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, pmi_list), eval.pearson(manu_sim_list, pmi_list, True)), '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ngd_list), eval.pearson(manu_sim_list, ngd_list, True)), ]) x.add_row(['Count', '%s/%s' % (len(manu_sim_list) - jcd_list.count(-1), len(manu_sim_list)), '%s/%s' % (len(manu_sim_list) - ovl_list.count(-1), len(manu_sim_list)), '%s/%s' % (len(manu_sim_list) - dice_list.count(-1), len(manu_sim_list)), '%s/%s' % (len(manu_sim_list) - pmi_list.count(-1), len(manu_sim_list)), '%s/%s' % (len(manu_sim_list) - ngd_list.count(-1), len(manu_sim_list)), ]) print x return jcd_list, ovl_list, dice_list, pmi_list, ngd_list