Ejemplo n.º 1
0
def word2vec_sim_en(f_tuple_list):
    print 'load word2vec model...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    model = KeyedVectors.load_word2vec_format(
        r'%s/%s' % (macro.DICT_DIR, 'GoogleNews-vectors-negative300.bin'),
        binary=True)
    # model = KeyedVectors.load_word2vec_format(r'%s/cn.skipgram.bin' % (macro.DICT_DIR), binary=True, unicode_errors='ignore')
    auto_sim_list = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        try:
            auto_sim = model.similarity(w1, w2)  # 将余弦相似度放到0-10得分
            # 将余弦相似度-1~1放到1~10得分
            auto_sim = utils.convert_sim(auto_sim, mode=0)
        except:
            auto_sim = -1
            count += 1
        print "w2v:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim,
                                                   auto_sim)
        auto_sim_list.append(auto_sim)

    print 'count=%s/%s' % (len(manu_sim_list) - count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(
        manu_sim_list,
        auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list,
        auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))

    return auto_sim_list
Ejemplo n.º 2
0
    def calculate_sim_without_tag(self, load_model, ofname, write_flag=True):
        """Predict similarity scores for untagged word pairs with a w2v model.

        Loads the word2vec model `load_model`, scores every pair read from
        self.f_tuple_list (mode='no_tag'), optionally writes the results to
        `ofname` under RESULTS_DIR, and returns
        (word1_list, word2_list, auto_sim_list, new_headline).
        """
        # load the specified word2vec model (binary C format)
        w2v_model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # read the evaluation word pairs (no gold scores: mode='no_tag')
        id_list, word1_list, word2_list, headline = utils.read2wordlist(
            self.f_tuple_list, mode='no_tag')
        # new header line with a Prediction column appended
        new_headline = headline.strip() + '\tPrediction\n'
        # compute similarities
        auto_sim_list = []
        for w1, w2 in zip(word1_list, word2_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                auto_sim = utils.convert_sim(auto_sim)  # rescale cosine to a 1-10 score
                print '%-10s\t%-10s\t%-10s' % (w1, w2, auto_sim)
            except:
                auto_sim = 1  # OOV pair: int 1 as sentinel (distinguishable from float 1.0)
                print '%-10s\t%-10s\t%-10s' % (w1, w2, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # optionally write the predictions to a result file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for w1, w2, auto_sim in zip(word1_list, word2_list,
                                            auto_sim_list):
                    fw.write('%s\t%s\t%s\n' % (w1, w2, auto_sim))

        return word1_list, word2_list, auto_sim_list, new_headline
Ejemplo n.º 3
0
    def calculate_sim(self, load_model, ofname, write_flag=True):
        """Predict similarities for tagged word pairs and evaluate vs gold scores.

        Loads the word2vec model `load_model`, scores every pair read from
        self.f_tuple_list (mode='tag'), optionally writes the results to
        `ofname` under RESULTS_DIR, prints Spearman/Pearson correlations,
        saves a regression joint plot under PICS_DIR, and returns
        (word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline).
        """
        # load the specified word2vec model (binary C format)
        w2v_model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # read the evaluation word pairs with their manual (gold) scores
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
            self.f_tuple_list, mode='tag')
        # new header line with a Prediction column appended
        new_headline = headline.strip() + '\tPrediction\n'
        # compute similarities
        auto_sim_list = []
        for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list,
                                        manu_sim_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, auto_sim)
            except:
                # OOV pair scored as 0 here. NOTE(review): the original
                # comment said 1, and the sibling calculate_sim_without_tag
                # uses 1 for this case — confirm which sentinel is intended.
                auto_sim = 0
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, '______Not Found______')
            # NOTE(review): convert_sim is applied to the OOV sentinel too,
            # unlike the sibling methods — confirm this is deliberate.
            auto_sim = utils.convert_sim(auto_sim, mode=1)  # rescale cosine to a 1-10 score
            auto_sim_list.append(auto_sim)

        # optionally write the predictions to a result file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for id, w1, w2, manu_sim, auto_sim in zip(
                        id_list, word1_list, word2_list, manu_sim_list,
                        auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\t%s\n' %
                             (str(id), w1.encode('utf-8'), w2.encode('utf-8'),
                              manu_sim, auto_sim))

        # evaluate predictions against the manual scores
        r = eval.spearman(manu_sim_list, auto_sim_list)
        p = eval.pearson(manu_sim_list, auto_sim_list)
        print '!!!spearman=%s; pearson=%s' % (r, p)

        # visualize: regression joint plot of gold Score vs Prediction
        data = {
            'ID': id_list,
            'Word1': word1_list,
            'Word2': word2_list,
            'Score': manu_sim_list,
            'Prediction': auto_sim_list
        }

        frame = DataFrame(data)
        sns.jointplot("Score",
                      "Prediction",
                      frame,
                      kind='reg',
                      stat_func=eval.spearmanr)
        plt.savefig('%s/%s.jpg' % (macro.PICS_DIR, ofname))

        return word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline
Ejemplo n.º 4
0
def cilin_sim(f_tuple_list):
    cs = loadCilin()
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    result1 = []
    result2 = []
    result3 = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        # 字典中查找到的词
        if sim3 >= 0:
            count += 1
            # 分制转成1-10
            sim1 = utils.convert_sim(sim1, mode=1)
            sim2 = utils.convert_sim(sim2, mode=1)
            sim3 = utils.convert_sim(sim3, mode=1)
        else:
            pass
            # 未查找到的词认为相似度很低
            sim1, sim2, sim3 = -1, -1, -1
        # push
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
        print "cilin:proc_id= %s [%s,%s] %s (%0.2f, %0.2f, %0.2f)" % (
            id, w1, w2, manu_sim, sim1, sim2, sim3)
    # 统计与评价
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'sim1: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result1), eval.pearson(
            manu_sim_list, result1, True), eval.spearman(
                manu_sim_list,
                result1), eval.spearman(manu_sim_list, result1, True))
    print 'sim2: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result2), eval.pearson(
            manu_sim_list, result2, True), eval.spearman(
                manu_sim_list,
                result2), eval.spearman(manu_sim_list, result2, True))
    print 'sim3: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result3), eval.pearson(
            manu_sim_list, result3, True), eval.spearman(
                manu_sim_list,
                result3), eval.spearman(manu_sim_list, result3, True))
    return (result1, result2, result3)
    def train_ext_vocab_choose_best(self, save_model, result_fname, last_val):
        """Train (or reload) a word2vec model and keep it only if it improves.

        Trains skip-gram word2vec on the configured corpora (or reloads the
        previously saved model when last_val == -2), scores the evaluation
        word pairs, and — when the new Spearman value beats last_val —
        saves the model and writes the predictions to result_fname.
        Returns the Spearman correlation of this run.
        """
        # read the evaluation word pairs with their manual (gold) scores
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(self.f_tuple_list, mode='tag')

        # collect training sentences from every configured corpus entry;
        # a tuple entry is passed to atxt2sens, anything else to txts2sens
        sentences = []
        for seg_docs_dir in self.seg_docs_dir_list:
            if type(seg_docs_dir) == tuple:
                sens = utils.atxt2sens(seg_docs_dir[0], seg_docs_dir[1])
            else:
                sens = utils.txts2sens(seg_docs_dir)
            sentences.extend(sens)

        # obtain the model: load the previous one OR train a new one
        if last_val == -2:
            print 'load previous model....'
            model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, save_model), binary=True)
        else:
            # skip-gram, 300 dims, hierarchical softmax (negative sampling off)
            model = Word2Vec(sentences, sg=1, size=300, window=10, negative=0, hs=1, sample=1e-4, workers=8,
                             min_count=5)

        # score the evaluation pairs
        auto_sim_list = []
        for w1, w2, manu_sim in zip(word1_list, word2_list, manu_sim_list):
            try:
                auto_sim = model.similarity(w1, w2)  # cosine similarity
                auto_sim = utils.convert_sim(auto_sim)  # rescale cosine to a 1-10 score
                # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
            except:
                auto_sim = 1  # OOV pair: int 1 as sentinel (distinguishable from float 1.0)
                print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # keep the model only when its Spearman value beats last_val
        val = eval.spearman(manu_sim_list, auto_sim_list)
        if val > last_val:
            model.save_word2vec_format('%s/%s' % (macro.MODELS_DIR, save_model), binary=True)  # persist the model
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, result_fname), 'w') as fw:
                fw.write(headline.strip() + '\tPrediction\n')
                for w1, w2, manu_sim, auto_sim in zip(word1_list, word2_list, manu_sim_list, auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\n' % (w1.encode('utf-8'), w2.encode('utf-8'), manu_sim, auto_sim))
        else:
            print ':::::::current val=', val
        return val
Ejemplo n.º 6
0
def hnet_sim(f_tuple_list):
    '''Score word pairs with a HowNet-based word similarity measure.

    Original note (translated): bt_xiepeiyiVerb.dic holds one "xiepeiyi"
    (collocation-sense) verb per line; for each such verb a
    "verb -- glossary.dat verb  similarity" entry is generated, so the
    number of similarities equals (#lines in bt_xiepeiyiVerb.dic) x
    (#verbs in glossary.dat), sorted descending into result.txt.

    With SIMILARITY enabled, each evaluation pair is scored by
    WordSimilarity.calc; scores >= 0 are rescaled to 1-10, misses become
    -1.  Prints coverage and Spearman/Pearson correlations, and returns
    the automatic score list.
    '''
    generatePlabel = False
    SIMILARITY = True
    # tuning parameters handed to WordSimilarity.calc
    BETA = [0.5, 0.2, 0.17, 0.13]
    GAMA = 0.2
    DELTA = 0.2
    ALFA = 1.6
    glossaryfile = '%s/%s' % (macro.DICT_DIR, macro.WN_GLOSS_DICT)
    xiepeiyidic = '%s/%s' % (macro.DICT_DIR, macro.WN_XPY_VERB_DICT)
    sememefile = '%s/%s' % (macro.DICT_DIR, macro.WN_WHOLE_DICT)

    if generatePlabel:
        lines = generateSourcefile(glossaryfile, xiepeiyidic)
        print('There are ' + str(len(lines)) + ' lines!!')

    if SIMILARITY:

        obj = WordSimilarity()

        if obj.init(sememefile, glossaryfile) == False:
            print("[ERROR] init failed!!")

        count = 0
        auto_sim_list = []
        idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
        for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
            auto_sim = obj.calc(w1.encode('utf-8'), w2.encode('utf-8'), BETA, GAMA, DELTA, ALFA)
            if auto_sim >= 0:
                # pair found: rescale the 0-1 similarity to a 1-10 score
                auto_sim = utils.convert_sim(auto_sim, mode=1)
                count += 1
            else:
                auto_sim = -1
            auto_sim_list.append(auto_sim)
            print "hownet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        print 'count=%s/%s' % (count, len(manu_sim_list))
        print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
        print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))
        return auto_sim_list
Ejemplo n.º 7
0
def cilin_webtest(w1l, w2l):
    """Score word pairs with Cilin sim2016 and build an HTML result string.

    Pairs found in the thesaurus are rescaled to a 1-10 score; missing
    pairs are rendered as '__NOTFOUND__'.  Returns the accumulated HTML
    fragment (one '<br/>'-terminated row per pair).
    """
    cs = loadCilin()
    scores = []
    html_rows = []
    found = 0
    for w1, w2 in zip(w1l, w2l):
        score = cs.sim2016(w1, w2)
        if score >= 0:
            # pair found in the thesaurus: rescale to 1-10
            found += 1
            score = utils.convert_sim(score, mode=1)
        else:
            # unknown pair: render a placeholder instead of a number
            score = '__NOTFOUND__'
        html_rows.append('%s&nbsp;&nbsp;&nbsp;%s&nbsp;&nbsp;&nbsp;%s<br/>' % (
            w1, w2, score))
        scores.append(score)
    return ''.join(html_rows)
Ejemplo n.º 8
0
def combine_zh_en():
    """Combine Chinese pair predictions with an English word2vec back-off.

    Reads the English translations of the NLPCC formal word pairs and the
    gold scores from the original Chinese file, then overrides an existing
    result file's predictions with GoogleNews word2vec similarity whenever
    both translations pass the en_US spell check and are single tokens.
    Writes the combined predictions to FML_ORG_GOOGLE_EN_W2V_RESULT and
    prints Spearman/Pearson against the gold scores.
    """
    d = enchant.Dict('en_US')
    # English translations ('en_'-prefixed file) and the gold scores from
    # the original Chinese file
    _, en_w1_list, en_w2_list, _, _ = utils.read2wordlist([(macro.CORPUS_DIR, 'en_'+macro.NLPCC_FML_FILE)], mode='tag')
    _, _, _, manu_sim_list, _ = utils.read2wordlist([(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)],
                                                          mode='tag')

    # swap in whichever result file should be boosted here
    # id_list, w1_list, w2_list, manu_sim_list, auto_sim_list, headline = \
    #     utils.read2wordlist([(macro.RESULTS_DIR, macro.FML_ORG_BDNEWS_XIESO_RESULT)], mode='auto_tag')
    id_list, w1_list, w2_list,  auto_sim_list, headline = \
        utils.read2wordlist([(macro.RESULTS_DIR, 'lstm.result')], mode='tag')

    w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, macro.GOOGLE_EN_W2V_MODEL), binary=True)   # the English model

    fw2 = open(r'%s/%s' % (macro.RESULTS_DIR, macro.FML_ORG_GOOGLE_EN_W2V_RESULT), 'w')
    fw2.write(headline)

    new_auto_sim_list = []
    count = 0
    for id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim in \
            zip(id_list, w1_list, en_w1_list, w2_list, en_w2_list, manu_sim_list, auto_sim_list):
        # print id, '===='
        # only trust the English model when both translations pass the
        # en_US spell check and are single tokens
        if d.check(trans_w1) and d.check(trans_w2):
            if len(trans_w1.split()) <= 1 and len(trans_w2.split()) <= 1:
                try:
                    auto_sim = w2v_model.similarity(trans_w1, trans_w2)
                    auto_sim = utils.convert_sim(auto_sim, mode=0)  # rescale cosine to a score
                    count += 1
                except:
                    # OOV in the English model: keep the original prediction
                    pass
                print '%s\t%s[%s];%s[%s]\tmanu_sim=%s\tauto_sim=%s' % (id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim)
        new_auto_sim_list.append(float(auto_sim))
        line2 = '%s\t%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim, auto_sim)
        fw2.write(line2.encode('utf-8'))
    fw2.close()
    # evaluate the combined predictions
    print 'count=', count
    r = eval.spearman(manu_sim_list, new_auto_sim_list)
    p = eval.pearson(manu_sim_list, new_auto_sim_list)
    print '!!!spearman=%s; pearson=%s' % (r, p)
Ejemplo n.º 9
0
def analysis(word_vectoers, read2wordlist):
    '''
    :param word_vectoers: dict类型,词向量
    :return: id列表,词的列表,以及以word_vectors计算的相似度list
    '''
    idl, w1l, w2l, score, headline = read2wordlist
    auto_sims = []
    print 'len of pairs:', len(idl)

    for w1, w2,s in zip(w1l, w2l, score):
        sim = 0
        try:
            sim = (1 - distance(word_vectoers[w1.encode('utf-8')], word_vectoers[w2.encode('utf-8')]))
        except:
            sim = 0
       # print w1,w2,sim
        sim = utils.convert_sim(sim, mode=0)
        auto_sims.append(sim)
        # print w1,w2,sim,s
    p = spearmanr(auto_sims, score)
    r = pearsonr(auto_sims, score)
    print 'spearmanr:', p, '\tpearson:', r
    return idl,w1l,w2l,auto_sims, p, r
Ejemplo n.º 10
0
def cwordnet_sim(f_tuple_list, cmn='cmn'):
    print 'load cwordnet_sim...'
    cwordnet_sim_list = []
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        auto_sim = cwn_sim(w1, w2, cmn)
        # 字典中查找到的词
        if auto_sim >= 0:
            count += 1
            # 分制转成1-10
            auto_sim = utils.convert_sim(auto_sim, mode=1)
        else:
            pass
            # 未查找到的词
            auto_sim = -1
        print "cwordnet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        cwordnet_sim_list.append(auto_sim)

    print 'count=%s/%s' % (count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, cwordnet_sim_list), eval.spearman(manu_sim_list, cwordnet_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, cwordnet_sim_list), eval.pearson(manu_sim_list, cwordnet_sim_list, True))
    return cwordnet_sim_list
Ejemplo n.º 11
0
def ir_sim(f_tuple_list, ofname='NLPCC_Formal500_single_sims_ir_nums0.pk'):
    """Score word pairs with five web-count (IR) similarity measures.

    Obtains three count values per pair from get_nums_list (presumably the
    per-word and joint search-engine hit counts — confirm in get_nums_list),
    caching them as a pickle under RESULTS_DIR and dumping them to a
    GBK-encoded csv.  Each pair is then scored with web_jaccard /
    web_overlap / web_dice / web_pmi / web_ngd, each rescaled via
    utils.convert_sim(mode=1), and a PrettyTable of Spearman/Pearson
    correlations (plain / tie-corrected) and coverage is printed.
    Returns (jcd_list, ovl_list, dice_list, pmi_list, ngd_list).
    """
    print 'ir sim ...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    nums_pk_path = '%s/%s' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(nums_pk_path):
        # cached counts available: load them instead of re-querying
        print 'load nums...'
        f = open(nums_pk_path, 'rb')
        n1l, n2l, n3l = pickle.load(f)
        f.close()
    else:
        # retrieve the counts and cache them for the next run
        print 'retrieval nums...'
        n1l, n2l, n3l = get_nums_list(w1l, w2l)
        f = open(nums_pk_path, 'wb')
        pickle.dump((n1l, n2l, n3l), f)
        f.close()
    # dump the raw counts to a csv next to the pickle (GBK-encoded)
    with open(nums_pk_path.split('.')[0]+'_nums.csv', 'w') as fw:
        for id, w1, w2, n1, n2, n3 in zip(idl, w1l, w2l, n1l, n2l, n3l):
            new_line = '%s,%s,%s,%s,%s,%s' % (id, w1, w2, n1, n2, n3)
            fw.write(new_line.encode('gbk')+'\n')

    # large constant passed to the PMI and NGD measures (presumably an
    # assumed total page count — confirm against their definitions)
    N = pow(10, 16)
    jcd_list, ovl_list, dice_list, pmi_list, ngd_list = [], [], [], [], []
    for num1, num2, num3, id, w1, w2, manu_sim in zip(n1l, n2l, n3l, idl, w1l, w2l, manu_sim_list):
        jcd = utils.convert_sim(web_jaccard(num1, num2, num3), mode=1)
        ovl = utils.convert_sim(web_overlap(num1, num2, num3), mode=1)
        dice = utils.convert_sim(web_dice(num1, num2, num3), mode=1)
        pmi = utils.convert_sim(web_pmi(num1, num2, num3, N), mode=1)
        ngd = utils.convert_sim(web_ngd(num1, num2, num3, N), mode=1)
        jcd_list.append(jcd)
        ovl_list.append(ovl)
        dice_list.append(dice)
        pmi_list.append(pmi)
        ngd_list.append(ngd)
        # print "ir:proc_id= %s [%s,%s] %s (%.5f, %.5f, %.5f, %.5f, %.5f) " % (id, w1, w2, manu_sim, jcd, ovl, dice, pmi, ngd)

    # tabulate correlations and coverage (-1 marks a not-found pair)
    from prettytable import PrettyTable
    x = PrettyTable(["Eval", "jaccard", "overlap", "dice", "pmi", "ngd"])
    x.align["Eval"] = "l"
    x.padding_width = 1
    x.add_row(['Spearman',
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, jcd_list), eval.spearman(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ovl_list), eval.spearman(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, dice_list), eval.spearman(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, pmi_list), eval.spearman(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ngd_list), eval.spearman(manu_sim_list, ngd_list, True))])
    x.add_row(['Pearson',
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, jcd_list), eval.pearson(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ovl_list), eval.pearson(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, dice_list), eval.pearson(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, pmi_list), eval.pearson(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ngd_list), eval.pearson(manu_sim_list, ngd_list, True)),
               ])
    x.add_row(['Count',
               '%s/%s' % (len(manu_sim_list) - jcd_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ovl_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - dice_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - pmi_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ngd_list.count(-1), len(manu_sim_list)),
               ])
    print x

    return jcd_list, ovl_list, dice_list, pmi_list, ngd_list