Example No. 1
def final_test_formal_run():
    # Read in the known word pairs; if an evaluation pair is already among them, reuse its score directly
    id_list1, gordern_word1_list, gordern_word2_list, manu_sim_list, headline1 = utils.read2wordlist(
        [(macro.CORPUS_DIR, macro.NLPCC_DRY_FILE)], 'tag')
    # 'tag' mode: the file carries gold answers
    id_list, word1_list, word2_list, manu_sim_list2, headline2 = utils.read2wordlist(
        [(macro.CORPUS_DIR, macro.NLPCC_FML_GD_FILE)], 'tag')
    model = Word2Vec.load_word2vec_format(
        r'%s/%s' % (macro.MODELS_DIR, macro.DRY_EXT_MIX_BST_W2V_MODEL),
        binary=True)
    auto_sim_list = []
    for id, w1, w2 in zip(id_list, word1_list, word2_list):
        if w1 in gordern_word1_list and w2 == gordern_word2_list[
                gordern_word1_list.index(w1)]:
            auto_sim = manu_sim_list[gordern_word1_list.index(w1)]
            print 'found it in dry run data:::(%s\t%s\t%s)' % (w1, w2,
                                                               auto_sim)
        elif w2 in gordern_word1_list and w1 == gordern_word2_list[
                gordern_word1_list.index(w2)]:
            auto_sim = manu_sim_list[gordern_word1_list.index(w2)]
            print 'found it in dry run data:::(%s\t%s\t%s)' % (w1, w2,
                                                               auto_sim)
        else:
            try:
                auto_sim = model.similarity(w1, w2)  # map the cosine similarity onto the 1-10 score range
                if auto_sim <= 0:
                    auto_sim = 1.0
                else:
                    auto_sim = auto_sim * 9 + 1
                # auto_sim = 0.5*(auto_sim+1)*10
                print '%-10s\t%-10s\t%-10s\t%-10s' % (id, w1, w2, auto_sim)
            except KeyError:  # OOV word
                auto_sim = 1
                print '%-10s\t%-10s\t%-10s\t%-10s' % (id, w1, w2,
                                                      '______Not Found______')
        auto_sim_list.append(auto_sim)

    print eval.spearman(manu_sim_list2, auto_sim_list)
    # write results to file
    fw = open('%s/%s' % (macro.RESULTS_DIR, macro.FNL_FML_EXT_MIX_BST_RESULT),
              'w')
    fw.write(headline2)
    for id, w1, w2, auto_sim in zip(id_list, word1_list, word2_list,
                                    auto_sim_list):
        fw.write('%s\t%s\t%s\t%s\n' % (id.encode('utf-8'), w1.encode('utf-8'),
                                       w2.encode('utf-8'), auto_sim))
    print 'test_formal_run:::finished!'
    return
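Note: several of these examples call utils.convert_sim to move a cosine similarity onto the 1-10 evaluation scale, but its body never appears in this listing. A minimal sketch consistent with the comments, assuming mode 0 rescales [-1, 1] and mode 1 rescales [0, 1] (the exact mode semantics are an assumption):

def convert_sim(cos_sim, mode=0):
    # Hypothetical reconstruction -- not the original implementation.
    if mode == 0:
        # linear map from [-1, 1] to [1, 10]
        return (cos_sim + 1.0) / 2.0 * 9.0 + 1.0
    # mode 1: linear map from [0, 1] to [1, 10]
    return cos_sim * 9.0 + 1.0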
Example No. 2
def word2vec_sim_en(f_tuple_list):
    print 'load word2vec model...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    model = KeyedVectors.load_word2vec_format(
        r'%s/%s' % (macro.DICT_DIR, 'GoogleNews-vectors-negative300.bin'),
        binary=True)
    # model = KeyedVectors.load_word2vec_format(r'%s/cn.skipgram.bin' % (macro.DICT_DIR), binary=True, unicode_errors='ignore')
    auto_sim_list = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        try:
            auto_sim = model.similarity(w1, w2)  # cosine similarity in [-1, 1]
            # map the cosine similarity from [-1, 1] onto the 1-10 score range
            auto_sim = utils.convert_sim(auto_sim, mode=0)
        except KeyError:  # OOV word
            auto_sim = -1
            count += 1
        print "w2v:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim,
                                                   auto_sim)
        auto_sim_list.append(auto_sim)

    print 'count=%s/%s' % (len(manu_sim_list) - count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(
        manu_sim_list,
        auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list,
        auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))

    return auto_sim_list
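Note: eval.spearman and eval.pearson are project helpers that are not shown. Judging from the call sites, they wrap SciPy and take an optional flag that appears to drop pairs scored -1 (not found); some call sites index [0] into the result while others print it directly, so the return type evidently changed between revisions. A sketch returning the bare coefficient, under those assumptions:

from scipy import stats

def spearman(gold, pred, filter_missing=False):
    # Assumed behaviour: the flag drops pairs whose prediction is -1 (not found).
    if filter_missing:
        kept = [(g, p) for g, p in zip(gold, pred) if float(p) != -1]
        gold, pred = zip(*kept)
    return stats.spearmanr([float(g) for g in gold], [float(p) for p in pred])[0]

def pearson(gold, pred, filter_missing=False):
    if filter_missing:
        kept = [(g, p) for g, p in zip(gold, pred) if float(p) != -1]
        gold, pred = zip(*kept)
    return stats.pearsonr([float(g) for g in gold], [float(p) for p in pred])[0]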
Example No. 3
    def calculate_sim_without_tag(self, load_model, ofname, write_flag=True):
        # load the specified w2v model
        w2v_model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # read in the evaluation word-pair corpus
        id_list, word1_list, word2_list, headline = utils.read2wordlist(
            self.f_tuple_list, mode='no_tag')
        # new header line
        new_headline = headline.strip() + '\tPrediction\n'
        # compute similarities
        auto_sim_list = []
        for w1, w2 in zip(word1_list, word2_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                auto_sim = utils.convert_sim(auto_sim)  # map onto the 1-10 score range
                print '%-10s\t%-10s\t%-10s' % (w1, w2, auto_sim)
            except KeyError:
                auto_sim = 1  # OOV word; the integer 1 distinguishes "not found" from the float 1.0
                print '%-10s\t%-10s\t%-10s' % (w1, w2, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # optionally write the similarity results to a file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for w1, w2, auto_sim in zip(word1_list, word2_list,
                                            auto_sim_list):
                    fw.write('%s\t%s\t%s\n' % (w1, w2, auto_sim))

        return word1_list, word2_list, auto_sim_list, new_headline
Example No. 4
def dry_extend_vocab_by_cilin(f_tuple_list, fdir, fdict):
    # words appearing in the corpus
    id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
        f_tuple_list, mode='tag')
    vocab = sorted(list(set(word1_list + word2_list)))
    with open(r'%s/%s' % (macro.DICT_DIR, macro.DRY_ORG_VOCAB_DICT),
              'w') as fw:
        new_vocab = '\n'.join(vocab) + '\n'
        fw.write(new_vocab.encode('utf-8'))

    cilin_list = utils.read_cilin2list()  # Cilin as a 2-D list; each row holds 1-4 synonyms
    # if a Cilin word appears in the original vocab, add its whole synonym row to the vocab
    for row in cilin_list:
        for col in row:
            if col in vocab:
                print '%s in [%s]' % (col, ','.join(vocab))
                vocab.extend(row)  # a word from this row is in the vocab, so add the entire row
                break  # move on to the next row

    vocab = sorted(list(set(vocab)))  # deduplicate
    with open(r'%s/%s' % (fdir, fdict), 'w') as fw:
        ext_new_vocab = '\n'.join(vocab) + '\n'
        fw.write(ext_new_vocab.encode('utf-8'))

    return
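Note: utils.read_cilin2list is not shown either. Assuming the extended Tongyici Cilin format, where each line is a category code followed by its synonyms ("Aa01A01= 人 士 人物 ..."), a loader that keeps rows of 1-4 synonyms, as the comment above describes, could look like this (the default path is hypothetical):

def read_cilin2list(path='%s/cilin.txt' % macro.DICT_DIR):
    # Hypothetical reconstruction of the Cilin loader.
    rows = []
    with open(path) as fr:
        for line in fr:
            words = line.decode('utf-8').strip().split()[1:]  # drop the category code
            if 1 <= len(words) <= 4:
                rows.append(words)
    return rows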
Example No. 5
    def calculate_sim(self, load_model, ofname, write_flag=True):
        # load the specified w2v model
        w2v_model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # read in the evaluation word-pair corpus
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
            self.f_tuple_list, mode='tag')
        # new header line
        new_headline = headline.strip() + '\tPrediction\n'
        # compute similarities
        auto_sim_list = []
        for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list,
                                        manu_sim_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, auto_sim)
            except KeyError:
                auto_sim = 0  # OOV word; scored 0 before conversion
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, '______Not Found______')
            auto_sim = utils.convert_sim(auto_sim, mode=1)  # map onto the 1-10 score range
            auto_sim_list.append(auto_sim)

        # optionally write the similarity results to a file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for id, w1, w2, manu_sim, auto_sim in zip(
                        id_list, word1_list, word2_list, manu_sim_list,
                        auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\t%s\n' %
                             (str(id), w1.encode('utf-8'), w2.encode('utf-8'),
                              manu_sim, auto_sim))

        # evaluate the results
        r = eval.spearman(manu_sim_list, auto_sim_list)
        p = eval.pearson(manu_sim_list, auto_sim_list)
        print '!!!spearman=%s; pearson=%s' % (r, p)

        # visualize the results
        data = {
            'ID': id_list,
            'Word1': word1_list,
            'Word2': word2_list,
            'Score': manu_sim_list,
            'Prediction': auto_sim_list
        }

        frame = DataFrame(data)
        sns.jointplot("Score",
                      "Prediction",
                      frame,
                      kind='reg',
                      stat_func=eval.spearmanr)
        plt.savefig('%s/%s.jpg' % (macro.PICS_DIR, ofname))

        return word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline
Example No. 6
    def __init__(self, f_tuple_list, mode='tag'):
        self.f_tuple_list = f_tuple_list
        if mode == 'tag':
            self.id_list, self.word1_list, self.word2_list, self.manu_sim_list, self.headline = utils.read2wordlist(
                f_tuple_list, mode)
        elif mode == 'no_tag':
            self.id_list, self.word1_list, self.word2_list, self.headline = utils.read2wordlist(
                f_tuple_list, mode)

        self.ofname = '_'.join(
            [f_tuple[1].split('.')[0]
             for f_tuple in self.f_tuple_list]) + '_hownet.txt'  # output file name
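Note: this constructor documents the read2wordlist contract used throughout these examples: mode='tag' yields (id_list, word1_list, word2_list, manu_sim_list, headline), while mode='no_tag' omits the gold scores and yields four values.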
Example No. 7
def combine_zh_en():
    d = enchant.Dict('en_US')
    _, en_w1_list, en_w2_list, _, _ = utils.read2wordlist([(macro.CORPUS_DIR, 'en_'+macro.NLPCC_FML_FILE)], mode='tag')
    _, _, _, manu_sim_list, _ = utils.read2wordlist([(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)],
                                                          mode='tag')

    # swap in the result file you want to improve here
    # id_list, w1_list, w2_list, manu_sim_list, auto_sim_list, headline = \
    #     utils.read2wordlist([(macro.RESULTS_DIR, macro.FML_ORG_BDNEWS_XIESO_RESULT)], mode='auto_tag')
    id_list, w1_list, w2_list,  auto_sim_list, headline = \
        utils.read2wordlist([(macro.RESULTS_DIR, 'lstm.result')], mode='tag')

    w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, macro.GOOGLE_EN_W2V_MODEL), binary=True)   # the English model

    fw2 = open(r'%s/%s' % (macro.RESULTS_DIR, macro.FML_ORG_GOOGLE_EN_W2V_RESULT), 'w')
    fw2.write(headline)

    new_auto_sim_list = []
    count = 0
    for id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim in \
            zip(id_list, w1_list, en_w1_list, w2_list, en_w2_list, manu_sim_list, auto_sim_list):
        # print id, '===='
        if d.check(trans_w1) and d.check(trans_w2):
            if len(trans_w1.split()) <= 1 and len(trans_w2.split()) <= 1:
                try:
                    auto_sim = w2v_model.similarity(trans_w1, trans_w2)
                    auto_sim = utils.convert_sim(auto_sim, mode=0)  # map the cosine similarity onto the 1-10 score range
                    count += 1
                except KeyError:  # OOV word in the English model
                    pass
                print '%s\t%s[%s];%s[%s]\tmanu_sim=%s\tauto_sim=%s' % (id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim)
        new_auto_sim_list.append(float(auto_sim))
        line2 = '%s\t%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim, auto_sim)
        fw2.write(line2.encode('utf-8'))
    fw2.close()
    # evaluate the results
    print 'count=', count
    r = eval.spearman(manu_sim_list, new_auto_sim_list)
    p = eval.pearson(manu_sim_list, new_auto_sim_list)
    print '!!!spearman=%s; pearson=%s' % (r, p)
Example No. 8
def run_experiment(hypers):
    """
    This method runs the counterfitting experiment, printing the SimLex-999 score of the initial
    vectors, then counter-fitting them using the supplied linguistic constraints.
    We then print the SimLex-999 score of the final vectors, and save them to a .txt file in the
    results directory.
    """

    current_experiment = ExperimentRun(hypers)
    if not current_experiment.pretrained_word_vectors:
        return

    print "Spearman's rho coefficient of initial vectors is:"
    # simlex_analysis(current_experiment.pretrained_word_vectors), "\n"
    _, _, _, _, pp, rr = analysis(current_experiment.pretrained_word_vectors,
                                  current_experiment.read2wordlist,
                                  mode='old')
    transformed_word_vectors = counter_fit(current_experiment)

    print "\nSpearman's rho coefficient of the counter-fitted vectors is:"
    idl, w1l, w2l, sims, p, r = analysis(transformed_word_vectors,
                                         current_experiment.read2wordlist,
                                         mode='new')

    # hyperparameter/performance string
    hypers_str = ','.join([str(h) for h in hypers]) + ',' + str(
        pp[0]) + ',' + str(p[0])

    # write to file
    outfile = codecs.open(
        '%s/hypers/counter-fitting-result(%s).txt' %
        (macro.RESULTS_DIR, hypers_str), 'w', 'utf-8')
    i = 0
    _, __, ___, score, ____ = utils.read2wordlist([
        (macro.CORPUS_DIR, current_experiment.eval_fname)
    ])
    outfile.write('ID\tWord1\tWord2\tScore\tSimilarity\r\n')
    for id, w1, w2, s, ss in zip(idl, w1l, w2l, score, sims):
        outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(s) + '\t' +
                      str(ss) + '\r\n')

    outfile.close()

    print_word_vectors(
        current_experiment.pretrained_word_vectors, macro.RESULTS_DIR +
        r'/%s_org_vectors.txt' % current_experiment.eval_fname.split('.')[0])
    print_word_vectors(
        transformed_word_vectors,
        macro.RESULTS_DIR + r"/%s_counter_fitted_vectors.txt" %
        current_experiment.eval_fname.split('.')[0])
    return hypers_str
Example No. 9
def compare():
    formal_pred_all_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_golden_new.txt', [1, 1, 1, 1, 1, 1, 1])
    formal_pred_selected_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_golden_new.txt', [0, 0, 0, 1, 0, 1, 0])
    dry_pred_all_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_test.txt', [1, 1, 1, 1, 1, 1, 1])
    dry_pred_selected_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_test.txt', [0, 0, 0, 1, 0, 1, 0])
    for result in dry_results:
        idl, w1l, w2l, scores, headline = utils.read2wordlist([
            (macro.RESULTS_DIR, result)
        ])
        print str(result) + ' vs dry_pred_all_features spearman: ', \
            eval.spearman(dry_pred_all_features, scores)[0], \
            'pearson: ', eval.pearson(dry_pred_all_features, scores)[0]

        print str(result) + ' vs dry_pred_selected_features spearman: ', \
            eval.spearman(dry_pred_selected_features, scores)[0], \
            'pearson: ', eval.pearson(dry_pred_selected_features, scores)[0]

    for result in formal_results:
        idl, w1l, w2l, scores, headline = utils.read2wordlist([
            (macro.RESULTS_DIR, result)
        ])
        print str(result) + ' vs formal_pred_all_features spearman: ', \
            eval.spearman(formal_pred_all_features, scores)[0], \
            'pearson: ', eval.pearson(formal_pred_all_features, scores)[0]

        print str(result) + ' vs formal_pred_selected_features spearman: ', \
            eval.spearman(formal_pred_selected_features, scores)[0], \
            'pearson: ', eval.pearson(formal_pred_selected_features, scores)[0]
Example No. 10
def cilin_sim(f_tuple_list):
    cs = loadCilin()
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    result1 = []
    result2 = []
    result3 = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        # pair found in the dictionary
        if sim3 >= 0:
            count += 1
            # convert to the 1-10 scale
            sim1 = utils.convert_sim(sim1, mode=1)
            sim2 = utils.convert_sim(sim2, mode=1)
            sim3 = utils.convert_sim(sim3, mode=1)
        else:
            # pairs not found are assumed to have very low similarity
            sim1, sim2, sim3 = -1, -1, -1
        # collect results
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
        print "cilin:proc_id= %s [%s,%s] %s (%0.2f, %0.2f, %0.2f)" % (
            id, w1, w2, manu_sim, sim1, sim2, sim3)
    # statistics and evaluation
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'sim1: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result1), eval.pearson(
            manu_sim_list, result1, True), eval.spearman(
                manu_sim_list,
                result1), eval.spearman(manu_sim_list, result1, True))
    print 'sim2: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result2), eval.pearson(
            manu_sim_list, result2, True), eval.spearman(
                manu_sim_list,
                result2), eval.spearman(manu_sim_list, result2, True))
    print 'sim3: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result3), eval.pearson(
            manu_sim_list, result3, True), eval.spearman(
                manu_sim_list,
                result3), eval.spearman(manu_sim_list, result3, True))
    return (result1, result2, result3)
Example No. 11
    def train_ext_vocab_choose_best(self, save_model, result_fname, last_val):
        # get the evaluation word pairs
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(self.f_tuple_list, mode='tag')

        # get the corpus
        sentences = []
        for seg_docs_dir in self.seg_docs_dir_list:
            if type(seg_docs_dir) == tuple:
                sens = utils.atxt2sens(seg_docs_dir[0], seg_docs_dir[1])
            else:
                sens = utils.txts2sens(seg_docs_dir)
            sentences.extend(sens)

        # obtain the model: load the previous model OR train new word vectors
        if last_val == -2:
            print 'load previous model....'
            model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, save_model), binary=True)
        else:
            model = Word2Vec(sentences, sg=1, size=300, window=10, negative=0, hs=1, sample=1e-4, workers=8,
                             min_count=5)

        # evaluate similarity
        auto_sim_list = []
        for w1, w2, manu_sim in zip(word1_list, word2_list, manu_sim_list):
            try:
                auto_sim = model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                auto_sim = utils.convert_sim(auto_sim)  # map onto the 1-10 score range
                # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
            except KeyError:
                auto_sim = 1  # OOV word; the integer 1 marks "not found", unlike the float 1.0
                print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # keep the model with the larger val
        val = eval.spearman(manu_sim_list, auto_sim_list)
        if val > last_val:
            model.save_word2vec_format('%s/%s' % (macro.MODELS_DIR, save_model), binary=True)  # save the model
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, result_fname), 'w') as fw:
                fw.write(headline.strip() + '\tPrediction\n')
                for w1, w2, manu_sim, auto_sim in zip(word1_list, word2_list, manu_sim_list, auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\n' % (w1.encode('utf-8'), w2.encode('utf-8'), manu_sim, auto_sim))
        else:
            print ':::::::current val=', val
        return val
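A note on the gensim call above: sg=1 selects the skip-gram architecture, and hs=1 with negative=0 trains with hierarchical softmax instead of negative sampling; size, window, sample, and min_count are the usual embedding dimensionality, context window, subsampling threshold, and frequency cutoff.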
Example No. 12
def hnet_sim(f_tuple_list):
    '''
    bt_xiepeiyiVerb.dic: one collocative-sense (xiepeiyi) verb per line.
    For each such verb, compute "current verb -- verb from glossary.dat    similarity".
    The number of similarities = (lines in bt_xiepeiyiVerb.dic) x (number of verbs in
    glossary.dat); all similarities are sorted in descending order into result.txt.
    '''
    generatePlabel = False
    SIMILARITY = True
    BETA = [0.5, 0.2, 0.17, 0.13]
    GAMA = 0.2
    DELTA = 0.2
    ALFA = 1.6
    glossaryfile = '%s/%s' % (macro.DICT_DIR, macro.WN_GLOSS_DICT)
    xiepeiyidic = '%s/%s' % (macro.DICT_DIR, macro.WN_XPY_VERB_DICT)
    sememefile = '%s/%s' % (macro.DICT_DIR, macro.WN_WHOLE_DICT)

    if generatePlabel:
        lines = generateSourcefile(glossaryfile, xiepeiyidic)
        print('There are ' + str(len(lines)) + ' lines!!')

    if SIMILARITY:

        obj = WordSimilarity()

        if not obj.init(sememefile, glossaryfile):
            print("[ERROR] init failed!!")

        count = 0
        auto_sim_list = []
        idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
        for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
            auto_sim = obj.calc(w1.encode('utf-8'), w2.encode('utf-8'), BETA, GAMA, DELTA, ALFA)
            if auto_sim >= 0:
                # scale from 0-1 to 1-10
                auto_sim = utils.convert_sim(auto_sim, mode=1)
                count += 1
            else:
                auto_sim = -1
            auto_sim_list.append(auto_sim)
            print "hownet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        print 'count=%s/%s' % (count, len(manu_sim_list))
        print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
        print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))
        return auto_sim_list
Example No. 13
def trans_zh_to_ch():  # despite the name, this translates zh -> en
    translator = Translator(from_lang='zh', to_lang='en')
    f_tuple_list = [(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)]
    id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(f_tuple_list, mode='tag')

    fw1 = open(r'%s/en_%s' % (macro.CORPUS_DIR, macro.NLPCC_FML_FILE), 'w')
    fw1.write(headline)

    for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list, manu_sim_list):
        print id, '===='
        trans_w1 = translator.translate(w1.encode('utf-8')).lower()
        trans_w2 = translator.translate(w2.encode('utf-8')).lower()
        line1 = '%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim)
        try:
            fw1.write(line1.encode('utf-8'))
            fw1.flush()
        except:
            pass  # skip pairs whose translation fails to encode or write

    fw1.close()
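Note: Translator here is presumably the class from the `translate` package on PyPI, whose constructor does accept from_lang/to_lang keywords. A minimal usage sketch (output depends on the backing provider):

# pip install translate
from translate import Translator
translator = Translator(from_lang='zh', to_lang='en')
print translator.translate(u'猫'.encode('utf-8'))  # e.g. 'cat'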
Example No. 14
def get_text(w1=None, w2=None):
    idl, w1l, w2l, score, headline = utils.read2wordlist([(macro.CORPUS_DIR,
                                                           '500_2.csv')])
    ids = []
    scores = []
    text = []
    dis_vecs = []
    for idw, word1, word2, s in zip(idl, w1l, w2l, score):

        try:
            infile = codecs.open(
                macro.DICT_DIR + '/filter/' + word1 + '_' + word2 + '.txt',
                'r', 'utf-8')
        except IOError:
            print word1, word2
            continue  # no context file for this pair; skip it
        lines = infile.readlines()
        lines = filter_lines(lines)
        if len(lines) < 3:
            continue
        if w1 and w2:
            if word1 == w1 and word2 == w2:
                text.extend(split_lines(lines))
                scores.append(s)
                ids.append(idw)
                dis_vecs.extend(get_distance_vector(word1, word2, lines))
            else:
                continue
        else:
            text.extend(split_lines(lines))

            temp = [s] * len(lines)
            temp2 = [idw] * len(lines)
            scores.extend(temp)
            ids.extend(temp2)

            dis_vecs.extend(get_distance_vector(word1, word2, lines))
        infile.close()
    return text, scores, ids, dis_vecs
Example No. 15
def cwordnet_sim(f_tuple_list, cmn='cmn'):
    print 'load cwordnet_sim...'
    cwordnet_sim_list = []
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        auto_sim = cwn_sim(w1, w2, cmn)
        # pair found in the dictionary
        if auto_sim >= 0:
            count += 1
            # convert to the 1-10 scale
            auto_sim = utils.convert_sim(auto_sim, mode=1)
        else:
            # pair not found
            auto_sim = -1
        print "cwordnet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        cwordnet_sim_list.append(auto_sim)

    print 'count=%s/%s' % (count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, cwordnet_sim_list), eval.spearman(manu_sim_list, cwordnet_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, cwordnet_sim_list), eval.pearson(manu_sim_list, cwordnet_sim_list, True))
    return cwordnet_sim_list
Example No. 16
def test_lstm(
        dim_proj=600,  # word embedding dimension and LSTM number of hidden units
        n_words=100000,  # vocabulary size
        maxlen=100,  # sequences longer than this get ignored
        batch_size=16,  # the batch size during training
        valid_batch_size=64,  # the batch size used for the validation/test set
        noise_std=0.,  # parameter for the extra noise option
        test_size=-1,  # if >0, we keep only this number of test examples
        use_dropout=True,  # if False slightly faster, but worse test error (this frequently needs a bigger model)
        part=1):
    start_time = time.time()
    idl, w1l, w2l, score_v, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'fml_google_en_w2v.result')
    ])
    idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, '500_2.csv')
    ])

    # Model options
    model_options = locals().copy()
    print("model options", model_options)
    load_data = prepare_input.load_data
    prepare_data = imdb.prepare_data
    print('Loading data')
    train, valid, test, dis_vecs_train, ids_train, dis_vecs_valid, ids_valid, dis_vecs_test, ids_test = load_data(
        n_words=n_words, valid_portion=0.05, maxlen=maxlen, part=part)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
        dis_vecs_test = [dis_vecs_test[n] for n in idx]
        ids_test = [ids_test[n] for n in idx]
    ydim = numpy.max(train[1])

    model_options['ydim'] = ydim + 1
    print('Loading model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params_empty()

    load_params('lstm_model.npz' + str(part) + '.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor shared variable
    # params and tparams hold separate copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost, d1,
     d2) = build_model(tparams, model_options)
    it = get_minibatches_idx(len(test[0]), valid_batch_size)
    probs = pred_probs(f_pred_prob,
                       prepare_data,
                       test,
                       it,
                       dis_vecs=dis_vecs_test)
    scores, id_dis = probs_2_score(probs, ids_test)
    new_score = combine(idl, id_dis, score_v, scores)
    out_file = codecs.open(
        macro.RESULTS_DIR + '/lstm_w2v' + str(part) + '.txt', 'w', 'utf-8')
    out_file.write('ID\tWord1\tWord2\tScore\t\r\n')
    for id, word1, word2, score in zip(idl, w1l, w2l, new_score):
        line = id + '\t' + word1 + '\t' + word2 + '\t' + str(score) + '\r\n'
        out_file.write(line)
    out_file.close()

    idl, w1l, w2l, score_old, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'best_without_lstm.txt')
    ])
    f_c = macro.RESULTS_DIR + '/evatestdata3_goldern500_cilin.txt'
    f_v = macro.RESULTS_DIR + '/lstm_w2v' + str(part) + '.txt'
    last_score = merge.merge_2_list(f_v, f_c, macro.MAX)
    temp = eval.spearman(last_score, score_goldern)[0]
    print(eval.spearman(score_old, score_goldern)[0])
    print(temp)

    dataset = {'pred': last_score, 'goldern': score_goldern}
    frame = DataFrame(dataset)
    sns.jointplot('goldern',
                  'pred',
                  frame,
                  kind='reg',
                  stat_func=eval.spearmanr)

    plt.xlim([1, 10])
    plt.ylim([1, 10])
    plt.savefig('%s/%s.png' % (macro.PICS_DIR, ('lstm' + str(part))))
    end_time = time.time()
    print(('Testing took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return last_score
Example No. 17
    def __init__(self, f_tuple_list, w2v_model_file, result_fname):
        self.f_tuple_list = f_tuple_list  # f_tuple_list holds (dir, file) pairs
        self.word1_list, self.word2_list, self.manu_sim_list, self.headline = utils.read2wordlist(self.f_tuple_list)
        self.ofname = result_fname  # output file name
        self.w2v_model_file = w2v_model_file
Example No. 18
def single_sims(f_tuple_list, ofname='single_sims'):
    pk_path = '%s/%s.pk' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(pk_path):
        f = open(pk_path, 'rb')
        d = pk.load(f)
        f.close()
    else:
        idl, w1l, w2l, score, headline = utils.read2wordlist(f_tuple_list)
        cilin_sim_list1, cilin_sim_list2, cilin_sim_list3 = cilin_sim(
            f_tuple_list)
        hownet_sim_list = hnet_sim(f_tuple_list)
        cwordnet_sim_list = cwordnet_sim(f_tuple_list)
        w2v_sim_list = word2vec_sim(f_tuple_list)
        jcd_list, ovl_list, dice_list, pmi_list, ngd_list = ir_sim(
            f_tuple_list, '%s_ir_nums0.pk' % ofname)
        d = {
            'id': idl,
            'w1': w1l,
            'w2': w2l,
            'manu_sim': score,
            # 'cilin1': cilin_sim_list1,
            # 'cilin2': cilin_sim_list2,
            'cilin3': cilin_sim_list3,
            'hownet': hownet_sim_list,
            'wordnet': cwordnet_sim_list,
            'word2vec': w2v_sim_list,
            'jaccard': jcd_list,
            'overlap': ovl_list,
            'dice': dice_list,
            'pmi': pmi_list,
            # 'ngd': ngd_list
        }
        f = open(pk_path, 'wb')
        pk.dump(d, f)
        f.close()
    # names = ['id', 'w1', 'w2', 'manu_sim', 'cilin1', 'cilin2', 'cilin3',
    #          'hownet', 'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi']
    names = [
        'id', 'w1', 'w2', 'manu_sim', 'cilin3', 'hownet', 'wordnet',
        'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'
    ]
    df = pd.DataFrame(data=d, columns=names)
    # print df
    # evaluate the results
    from prettytable import PrettyTable
    # x = PrettyTable(["Eval", 'cilin1', 'cilin2', 'cilin3', 'hownet',
    #                  'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'])
    x = PrettyTable([
        "Eval", 'cilin3', 'hownet', 'wordnet', 'word2vec', 'jaccard',
        'overlap', 'dice', 'pmi'
    ])
    x.align["Eval"] = "l"
    x.padding_width = 1
    x.add_row([
        'Spearman',
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin1), eval.spearman(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin2), eval.spearman(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin3),
                         eval.spearman(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.hownet),
                         eval.spearman(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.wordnet),
                         eval.spearman(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.word2vec),
                         eval.spearman(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.jaccard),
                         eval.spearman(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.overlap),
                         eval.spearman(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.dice), eval.spearman(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.pmi), eval.spearman(df.manu_sim, df.pmi, True)),
    ])

    x.add_row([
        'Pearson',
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin1), eval.pearson(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin2), eval.pearson(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin3),
                         eval.pearson(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.hownet),
                         eval.pearson(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.wordnet),
                         eval.pearson(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.word2vec),
                         eval.pearson(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.jaccard),
                         eval.pearson(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.overlap),
                         eval.pearson(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.dice), eval.pearson(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.pmi), eval.pearson(df.manu_sim, df.pmi, True)),
    ])
    x.add_row([
        'Count',
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin1).count(-1), len(df.manu_sim)),
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin2).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.cilin3).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.hownet).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.wordnet).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.word2vec).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.jaccard).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.overlap).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.dice).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.pmi).count(-1), len(df.manu_sim)),
    ])
    print x
    df.to_csv('%s/%s.csv' % (macro.RESULTS_DIR, ofname), encoding='gbk')

    # linear combinations; -1 ("not found") is neutralized to 0 first
    df = df.replace(-1, 0)
    # max
    linear_mean_auto_sims = [row[4:].max() for row in df.values]
    print 'MAX: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))

    # min
    linear_mean_auto_sims = [row[4:].min() for row in df.values]
    print 'MIN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    # mean
    linear_mean_auto_sims = [row[4:].mean() for row in df.values]
    print 'MEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))

    # gmean; zeros are replaced by 1 so the geometric mean stays defined
    df = df.replace(0, 1)

    linear_mean_auto_sims = [geometric_mean(row[4:]) for row in df.values]
    print 'GMEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    return df
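Note: geometric_mean is not defined in this snippet. Given the df.replace(0, 1) guard just above, which keeps every factor positive, a simple sketch:

def geometric_mean(values):
    # n-th root of the product of n values; assumes all values are positive
    product = 1.0
    for v in values:
        product *= float(v)
    return product ** (1.0 / len(values))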
Example No. 19
    def __init__(self, hypers):
        """
        To initialise the class, we need to supply the config file, which contains the location of
        the pretrained word vectors, of the vocabulary to use, the location of (potentially many)
        collections of linguistic constraints (one pair per line), the location of the dialogue
        domain ontology to inject (optional, needs to respect DSTC format), as well as the six
        hyperparameters of the counterfitting procedure (as detailed in the NAACL paper).
        """
        pretrained_vectors_filepath = r"%s/fml_org_bdnews_xieso_w2v.bin" % macro.DICT_DIR
        # vocabulary_filepath = r"D:\MyData\NLPCC_DATA\corpus\vocab.txt"
        # eval_fname = 'test_4.csv'   # evaluation file name
        self.eval_fname = macro.NLPCC_FML_FILE  # evaluation file name
        # eval_fname = 'MC30.txt'
        # eval_fname = 'SemEval50.csv'
        self.read2wordlist = utils.read2wordlist([(macro.CORPUS_DIR, self.eval_fname)], 'tag')
        self.id_list, self.word1_list, self.word2_list, self.manu_sim_list, self.headline = self.read2wordlist
        # load lists of filenames for the synonym and antonym dictionaries (fixed)
        antonym_list = [r'%s/merged.txt' % macro.DICT_DIR]
        synonym_list = [r'%s/tongyis.txt' % macro.DICT_DIR]
        # word pairs derived from the single_sims scores (variable)
        antonym_value_list = [r'%s/fanyi_value.txt' % macro.DICT_DIR]
        synonym_value_list = [r'%s/tongyi_value.txt' % macro.DICT_DIR]
        df = single_sims([(macro.CORPUS_DIR, self.eval_fname)], 'PKU500_single_sims')
        linear_max_auto_sims = [row[4:].max() for row in df.values]
        fw1 = codecs.open(antonym_value_list[0], 'w')
        fw2 = codecs.open(synonym_value_list[0], 'w')
        th = 7.5
        for auto_sim, w1, w2 in zip(linear_max_auto_sims, self.word1_list, self.word2_list):
            if auto_sim < 1 - th:
                fw1.write(w1.encode('utf-8') + '\t' + w2.encode('utf-8') + '\n')
            elif auto_sim > th:
                fw2.write(w1.encode('utf-8') + '\t' + w2.encode('utf-8') + '\n')
        fw1.close()
        fw2.close()
        # vocabulary = []
        # with open(vocabulary_filepath, "r+") as f_in:
        #     for line in f_in:
        #         vocabulary.append(line.strip())
        #
        # vocabulary = set(vocabulary)
        word_list = []

        with open('%s/%s' % (macro.CORPUS_DIR, self.eval_fname), 'r') as fr:
            for line in fr.readlines()[1:]:
                id, w1, w2, score = line.strip().split('\t')
                word_list.append(w1)
                word_list.append(w2)
        vocabulary = sorted(set(word_list))


        # load pretrained word vectors and initialise their (restricted) vocabulary.
        self.pretrained_word_vectors = load_word_vectors(pretrained_vectors_filepath, vocabulary)

        # if no vectors were loaded, exit gracefully:
        if not self.pretrained_word_vectors:
            return

        self.vocabulary = set(self.pretrained_word_vectors.keys())

        self.synonyms = set()
        self.antonyms = set()
        self.synonyms_value = set()
        self.antonyms_value = set()

        # and we then have all the information to collect all the linguistic constraints:
        for syn_filepath in synonym_list:
            self.synonyms = self.synonyms | load_constraints(syn_filepath, self.vocabulary)

        for ant_filepath in antonym_list:
            self.antonyms = self.antonyms | load_constraints(ant_filepath, self.vocabulary)

        for syn_v_filepath in synonym_value_list:
            self.synonyms_value = self.synonyms_value | load_constraints(syn_v_filepath, self.vocabulary)

        for ant_v_filepath in antonym_value_list:
            self.antonyms_value = self.antonyms_value | load_constraints(ant_v_filepath, self.vocabulary)

        # finally, load the experiment hyperparameters:
        self.hyper_k1, self.hyper_k2, self.hyper_k3, self.hyper_k4, self.hyper_k5,\
        self.delta, self.gamma, self.rho, self.theta, self.eta = hypers
Example No. 20
def cilin_run1():
    '''
    There are three computation methods:
    cs = CilinSimilarity()
    sim1 = cs.similarity(w1, w2)
    sim2 = cs.sim2013(w1, w2)
    sim3 = cs.sim2016(w1, w2)
    '''

    cs = loadCilin()
    # w1 = u'抄袭'
    # w2 = u'克隆'
    # code1 = cs.get_code(w1)
    # print w1, 'has codes:', code1
    # code2 = cs.get_code(w2)
    # print w2, 'has codes:', code2
    # sim = cs.similarity(w1, w2)
    # print w1, w2, 'final similarity:', sim
    idl, w1l, w2l, score, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, macro.NLPCC_FML_FILE)
    ])
    result1 = []
    result2 = []
    result3 = []
    flags = []
    outfile = codecs.open(macro.RESULTS_DIR + '/fml_cilin.txt', 'w', 'utf-8')
    outfile.write('\r\n')
    for id, w1, w2 in zip(idl, w1l, w2l):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(sim3) + '\r\n')
        if sim1 == -1:
            flags.append(0)
        else:
            flags.append(1)
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
    outfile.close()
    print eval.spearman(score, result1)
    print eval.spearman(score, result2)
    print eval.spearman(score, result3)
    '''
    Scores computed over all pairs:
    0.347925120242
    0.352377437382
    0.421492611614
    '''
    score_f = []
    result1_f = []
    result2_f = []
    result3_f = []
    for s, r1, r2, r3, flag in zip(score, result1, result2, result3, flags):
        if flag == 1:
            score_f.append(s)
            result1_f.append(r1)
            result2_f.append(r2)
            result3_f.append(r3)
    print '-------------------------'
    print len(score_f)
    print eval.spearman(score_f, result1_f)
    print eval.spearman(score_f, result2_f)
    print eval.spearman(score_f, result3_f)
    print eval.pearson(score_f, result3_f)
Example No. 21
    cs = CilinSimilarity()
    sim1 = cs.similarity(w1, w2)
    sim2 = cs.sim2013(w1, w2)
    sim3 = cs.sim2016(w1, w2)
    '''

    cs = loadCilin()
    # w1 = u'抄袭'
    # w2 = u'克隆'
    # code1 = cs.get_code(w1)
    # print w1, 'has codes:', code1
    # code2 = cs.get_code(w2)
    # print w2, 'has codes:', code2
    # sim = cs.similarity(w1, w2)
    # print w1, w2, 'final similarity:', sim
    idl, w1l, w2l, score, headline = utils.read2wordlist([(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)])
    result1 = []
    result2 = []
    result3 = []
    flags = []
    outfile = codecs.open(macro.RESULTS_DIR + '/fml_cilin.txt', 'w', 'utf-8')
    outfile.write('\r\n')
    for id, w1, w2 in zip(idl, w1l, w2l):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(sim3) + '\r\n')
        if sim1 == -1:
            flags.append(0)
        else:
            flags.append(1)
Example No. 22
def ir_sim(f_tuple_list, ofname='NLPCC_Formal500_single_sims_ir_nums0.pk'):
    print 'ir sim ...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    nums_pk_path = '%s/%s' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(nums_pk_path):
        print 'load nums...'
        f = open(nums_pk_path, 'rb')
        n1l, n2l, n3l = pickle.load(f)
        f.close()
    else:
        print 'retrieve nums...'
        n1l, n2l, n3l = get_nums_list(w1l, w2l)
        f = open(nums_pk_path, 'wb')
        pickle.dump((n1l, n2l, n3l), f)
        f.close()
    with open(nums_pk_path.split('.')[0] + '_nums.csv', 'w') as fw:
        for id, w1, w2, n1, n2, n3 in zip(idl, w1l, w2l, n1l, n2l, n3l):
            new_line = '%s,%s,%s,%s,%s,%s' % (id, w1, w2, n1, n2, n3)
            fw.write(new_line.encode('gbk') + '\n')

    N = pow(10, 16)
    jcd_list, ovl_list, dice_list, pmi_list, ngd_list = [], [], [], [], []
    for num1, num2, num3, id, w1, w2, manu_sim in zip(n1l, n2l, n3l, idl, w1l, w2l, manu_sim_list):
        jcd = utils.convert_sim(web_jaccard(num1, num2, num3), mode=1)
        ovl = utils.convert_sim(web_overlap(num1, num2, num3), mode=1)
        dice = utils.convert_sim(web_dice(num1, num2, num3), mode=1)
        pmi = utils.convert_sim(web_pmi(num1, num2, num3, N), mode=1)
        ngd = utils.convert_sim(web_ngd(num1, num2, num3, N), mode=1)
        jcd_list.append(jcd)
        ovl_list.append(ovl)
        dice_list.append(dice)
        pmi_list.append(pmi)
        ngd_list.append(ngd)
        # print "ir:proc_id= %s [%s,%s] %s (%.5f, %.5f, %.5f, %.5f, %.5f) " % (id, w1, w2, manu_sim, jcd, ovl, dice, pmi, ngd)

    from prettytable import PrettyTable
    x = PrettyTable(["Eval", "jaccard", "overlap", "dice", "pmi", "ngd"])
    x.align["Eval"] = "l"
    x.padding_width = 1
    x.add_row(['Spearman',
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, jcd_list), eval.spearman(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ovl_list), eval.spearman(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, dice_list), eval.spearman(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, pmi_list), eval.spearman(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ngd_list), eval.spearman(manu_sim_list, ngd_list, True))])
    x.add_row(['Pearson',
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, jcd_list), eval.pearson(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ovl_list), eval.pearson(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, dice_list), eval.pearson(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, pmi_list), eval.pearson(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ngd_list), eval.pearson(manu_sim_list, ngd_list, True)),
               ])
    x.add_row(['Count',
               '%s/%s' % (len(manu_sim_list) - jcd_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ovl_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - dice_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - pmi_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ngd_list.count(-1), len(manu_sim_list)),
               ])
    print x

    return jcd_list, ovl_list, dice_list, pmi_list, ngd_list
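Note: the web_* helpers are not shown; they match the standard page-count association measures (WebJaccard, WebOverlap, WebDice, WebPMI, NGD), where n1 and n2 are hit counts for each word alone, n12 is the count for the pair together, and N is the assumed index size. Two of them, sketched under that assumption:

import math

def web_jaccard(n1, n2, n12):
    # Hypothetical reconstruction; returns -1 when undefined.
    denom = n1 + n2 - n12
    return n12 / float(denom) if denom > 0 else -1

def web_pmi(n1, n2, n12, N):
    if n1 <= 0 or n2 <= 0 or n12 <= 0:
        return -1
    return math.log(float(N) * n12 / (float(n1) * n2), 2)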
Example No. 23

from Com import macro
from Eval import eval
from Com import utils
import post
import merge

lst = [1] * 7

data = post.get_value_list(macro.CORPUS_DIR + '/features_golden_new.txt', lst)
max = 0
final_list = []
idl, w1l, w2l, score, headline = utils.read2wordlist([(macro.CORPUS_DIR,
                                                       '500_2.csv')])
f_c = macro.RESULTS_DIR + '/evatestdata3_goldern500_cilin.txt'
f_v = macro.RESULTS_DIR + '/fml_org_bdnews_xieso.result'

for mode in range(1, 13):
    score_m = merge.merge_2_list(f_v, f_c, mode)
    sp = eval.spearman(data, score_m)[0]
    pe = eval.pearson(data, score_m)[0]
    temp = score_m
    print macro.MODES[mode - 1], '\t', eval.spearman(
        score, score_m)[0], '\t', eval.pearson(score,
                                               score_m)[0], '\t', sp, '\t', pe
    # idl_p, w1l_p, w2l_p, score_p, headline_p = utils.read2wordlist([(macro.RESULTS_DIR,'best_without_lstm.txt')])

    # pred = merge.merge_2_list(macro.RESULTS_DIR+'/fml_google_en_w2v.result',f_c,mode=macro.MAX)
    # print eval.spearman(pred,score),eval.pearson(pred,score)
Example No. 24
            tongyis = words[1:]
            all_lists.append(tongyis)
    for l in all_lists:
        length = len(l)
        for i in range(0, length):
            for j in range(i + 1, length):
                w1 = l[i]
                w2 = l[j]
                outfile.write(w1 + '\t' + w2 + '\r\n')
    outfile.close()


if __name__ == '__main__':
    # get_tongyis()
    idl, w1l, w2l, score, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, macro.NLPCC_FML_FILE)
    ])
    values = post.get_value_list(macro.CORPUS_DIR + '/features_golden_new.txt',
                                 [1, 1, 1, 1, 1, 1, 1])

    idl, w1l, w2l, sim_cwn, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'fml_cwordnet.result')
    ])
    idl, w1l, w2l, sim_hw, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'fml_hownet.result')
    ])
    idl, w1l, w2l, sim_cl, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'fml_cilin.result')
    ])

    outfile = codecs.open(macro.DICT_DIR + '/tongyi_value.txt', 'w', 'utf-8')