Code example #1
def main(dataset, window):
    """Compute the wiki embedding similarity for every co-occurrence edge and write the features per document."""
    model = gensim.models.KeyedVectors.load_word2vec_format(
        './data/embedding/vec/externel_vec/wiki.en.vec', binary=False)
    # Note: gensim's API has changed across versions; if this load_word2vec_format call fails,
    # use the newer interface instead: model = gensim.models.word2vec.Word2Vec.load(MODEL_PATH)
    # model = gensim.models.Word2Vec.load_word2vec_format("wiki.en.text.vector", binary=False)
    # model.similarity("woman", "girl")
    # compute and generate the classic features
    data_dir = os.path.join('./data/embedding/', dataset)
    file_names = read_file(os.path.join(data_dir, 'abstract_list')).split(',')
    out_dir = os.path.join(data_dir, 'wiki_sim')
    for file_name in file_names:
        print(file_name)
        filtered_text = filter_text(
            read_file(os.path.join(data_dir, 'abstracts', file_name)))
        edges = get_edges(filtered_text, window=window)
        edge_sim = {}
        for edge in edges:
            word1 = edge[0]
            word2 = edge[1]
            try:
                sim = model.similarity(word1, word2)
            except KeyError:
                # one of the words is missing from the embedding vocabulary
                sim = 0
            e = tuple(
                sorted([normalized_token(word1),
                        normalized_token(word2)]))
            edge_sim[e] = [sim]
        edgefeatures2file(os.path.join(out_dir, file_name), edge_sim)

    print('.......wiki_sim_DONE........')
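
A minimal standalone sketch of the similarity lookup used above (the helper name safe_similarity is illustrative, and the wiki.en.vec path is assumed to exist locally); out-of-vocabulary words fall back to 0 exactly as in the loop above.

import gensim

# pre-trained vectors in word2vec text format, same file as in main()
wiki_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    './data/embedding/vec/externel_vec/wiki.en.vec', binary=False)

def safe_similarity(word1, word2, kv=wiki_vectors):
    # cosine similarity with 0 as the out-of-vocabulary fallback
    try:
        return kv.similarity(word1, word2)
    except KeyError:
        return 0.0

print(safe_similarity('woman', 'girl'))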
Code example #2
def position_sum(text, nodes):
    """
    Return a dict mapping each node to the sum of the reciprocals
    of all of its (1-based) positions in the text.

    :param text: text with no tags
    :param nodes: list of nodes in word graph, stemmed
    """
    def all_pos(obj, alist):
        # Collect every 1-based position of obj in alist; found slots are
        # blanked with None so indices stay stable for the remaining nodes.
        pos = []
        while True:
            try:
                p = alist.index(obj)
                pos.append(p + 1)
                alist[p] = None
            except ValueError:
                return pos

    text = text.lower()
    words = [normalized_token(w) for w in word_tokenize(text) if is_word(w)]
    pos_sum = {}
    for n in nodes:
        pos = all_pos(n, words)
        weight = sum([1 / p for p in pos])
        pos_sum[n] = weight
    return pos_sum
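
# Toy illustration of the reciprocal-position weighting computed by
# position_sum above; plain lowercased tokens stand in here for the
# project's normalized_token / is_word helpers.
_words = "graph based keyword extraction builds a word graph".split()
_positions = [i + 1 for i, w in enumerate(_words) if w == "graph"]  # 1-based: [1, 8]
_weight = sum(1 / p for p in _positions)                            # 1/1 + 1/8 = 1.125
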
def create_B(node_list, gold):
    """
    Build the preference matrix B: for every gold keyphrase token found in
    node_list, add one row per non-gold node with +1 at the gold token's
    index and -1 at the non-gold node's index.
    """
    n = len(node_list)
    # Keep only gold tokens that actually occur in the word graph; filtering
    # with a comprehension avoids popping from the list while iterating over
    # it, which silently skips elements.
    keyphrases = [g for g in (normalized_token(word) for word in gold.split())
                  if g in node_list]

    B = []
    for keyphrase in keyphrases:
        prefer = node_list.index(keyphrase)
        b = [0] * n
        b[prefer] = 1
        for node in node_list:
            if node not in keyphrases:
                neg = node_list.index(node)
                b[neg] = -1
                B.append(b[:])
                b[neg] = 0
    if not B:
        B = [0] * n
    return np.matrix(B)
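
A toy check of the rows create_B builds, assuming normalized_token leaves these made-up tokens unchanged:

node_list = ['pagerank', 'graph', 'word', 'extract']
gold = 'pagerank extract'
# Both gold tokens are in the graph, so each gets one +1 row against every
# non-gold node ('graph' and 'word'):
# [[ 1 -1  0  0]
#  [ 1  0 -1  0]
#  [ 0 -1  0  1]
#  [ 0  0 -1  1]]
print(create_B(node_list, gold))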
Code example #4
def read_pr(fpath, vocabulary, damping):
    """Read the score column for the given damping factor into a {stemmed token: score} dict."""
    pr = {}
    with open(fpath, encoding='utf8') as f:
        f_csv = csv.DictReader(f)
        for row in f_csv:
            token = normalized_token(vocabulary[row['node_id']])
            score = float(row[str(damping)])
            pr[token] = score
    return pr
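
A hypothetical illustration of the CSV layout read_pr expects (a node_id column plus one score column per damping factor) together with the id-to-word vocabulary dict; the file contents and the string keys are made up for the example.

import csv
import io

sample_csv = "node_id,0.85\n0,0.031\n1,0.027\n"
vocabulary = {'0': 'graph', '1': 'keyword'}  # as id2word might return it (assumed str keys)
for row in csv.DictReader(io.StringIO(sample_csv)):
    print(vocabulary[row['node_id']], float(row['0.85']))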
Code example #5
def get_term_freq(text):
    """
    Return a dict mapping each stemmed word to its frequency in the text.

    :param text: text with no tags
    """
    text = text.lower()
    words = [normalized_token(w) for w in word_tokenize(text) if is_word(w)]
    tf = {}
    for w in words:
        tf[w] = words.count(w)
    return tf
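
An equivalent single-pass count using collections.Counter, which avoids calling words.count once per token; plain lowercased tokens stand in for the normalized_token / is_word helpers here.

from collections import Counter

words = "graph based keyword extraction on a word graph".lower().split()
tf = dict(Counter(words))
print(tf['graph'])  # 2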
Code example #6
def evaluate(dataset):
    """
    Evaluate ranking result.

    :param dataset: name of dataset
    :param pr: dict, key is stemmed word, value is score
    """

    method_name = 'pagerank_zf'
    dataset = dataset.upper()
    abstract_dir = os.path.join('./data', dataset, 'abstracts')
    gold_dir = os.path.join('./data', dataset, 'gold')

    extracted = os.path.join('./result', dataset, 'extracted_zf')
    pr_type = 'a1b1'  # alpha=1, beta=1
    pr_dir = os.path.join('./data', dataset, 'rank_zf', pr_type)
    vocabulary_path = os.path.join('./data', dataset, 'rank_zf',
                                   'vocabulary')  # the corresponding id-to-word vocabulary
    damping = 0.85  # tried values: 0.2, 0.5, 0.8, 0.85

    with_tag = True
    topn = 4
    window = 2
    ngrams = 2
    weight2 = 0.6
    weight3 = 0.3

    names = [
        name for name in os.listdir(pr_dir)
        if os.path.isfile(os.path.join(pr_dir, name))
    ]
    vocabulary = id2word(vocabulary_path)

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:
        pr = read_pr(os.path.join(pr_dir, name), vocabulary, damping)
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        edges = dict2list(edge_freq)
        graph = build_graph(edges)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        top_phrases = []
        for phrase in keyphrases:
            # crude dedup: skip phrases already contained in the list's string form
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        if not os.path.exists(extracted):
            os.makedirs(extracted)
        with open(os.path.join(extracted, name), encoding='utf-8',
                  mode='w') as file:
            file.write('\n'.join(top_phrases))

        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # depends on whether the extracted phrases are stemmed
        standard = list(' '.join(list(normalized_token(w) for w in g.split()))
                        for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        if len(top_phrases) != 0:
            prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(dataset, method_name, count, prcs, recall, f1, mrr)

    eval_result = method_name + pr_type + str(damping) + '@' + str(topn) + ',' + dataset + ',' + str(prcs) + ',' \
                  + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro) \
                  + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(os.path.join('./result', method_name + '.csv'),
              mode='a',
              encoding='utf8') as file:
        file.write(eval_result)
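
A toy check of the two precision/recall bookkeepings used above, following the variable naming of evaluate (pooled counts for prcs/recall, per-document averages for the *_micro values); the extracted phrases and gold lists are made up.

docs = [
    (['word graph', 'pagerank', 'keyphras', 'rank'],
     ['pagerank', 'keyphras extract', 'summar']),
    (['embed', 'similar', 'wikipedia', 'featur'],
     ['embed', 'similar']),
]
count = gold_count = extract_count = 0
prcs_micro = recall_micro = 0.0
for extracted, gold in docs:
    hits = sum(1 for p in extracted if p in gold)
    count += hits
    gold_count += len(gold)
    extract_count += len(extracted)
    prcs_micro += hits / len(extracted)
    recall_micro += hits / len(gold)
print(count / extract_count, count / gold_count)          # pooled: 0.375 0.6
print(prcs_micro / len(docs), recall_micro / len(docs))   # averaged: 0.375 ~0.667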
Code example #7
def evaluate_extraction(dataset,
                        method_name,
                        ngrams=2,
                        damping=0.85,
                        omega=None,
                        phi=None,
                        alter_topn=None,
                        alter_edge=None,
                        alter_node=None):
    """
    评价实验结果

    omega,phi, [0]代表不适用任何特征,权重设置为1。None为所有特征的简单加和。[-1]只用最后一个特征。
    """
    if dataset == 'KDD':
        abstr_dir = './data/embedding/KDD/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/KDD/gold/'
        edge_dir = './data/embedding/KDD/edge_features/'
        node_dir = './data/embedding/KDD/node_features/'
        file_names = read_file('./data/embedding/KDD/abstract_list').split(',')
        topn = 4
    elif dataset == 'WWW':
        abstr_dir = './data/embedding/WWW/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/WWW/gold/'
        edge_dir = './data/embedding/WWW/edge_features/'
        node_dir = './data/embedding/WWW/node_features/'
        file_names = read_file('./data/embedding/WWW/abstract_list').split(',')
        topn = 5
    else:
        raise ValueError('wrong dataset name: ' + dataset)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if alter_edge:
        edge_dir = alter_edge
    if alter_node:
        node_dir = alter_node
    if alter_topn:
        topn = alter_topn

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for file_name in file_names:
        # print(file_name)
        pr, graph = wpr(edge_dir + file_name,
                        node_dir + file_name,
                        omega=omega,
                        phi=phi,
                        d=damping)

        gold = read_file(gold_dir + file_name)
        pl2 = 0.62
        pl3 = 0.3
        if dataset == "WWW":
            pl2 = 0.55
        file = os.path.join(abstr_dir, file_name)
        keyphrases = get_phrases(pr, graph, file, ng=ngrams, pl2=pl2, pl3=pl3)
        top_phrases = []
        for phrase in keyphrases:
            # crude dedup: skip phrases already contained in the list's string form
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        golds = gold.split('\n')
        if golds[-1] == '':
            golds = golds[:-1]
        golds = list(' '.join(list(normalized_token(w) for w in g.split()))
                     for g in golds)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in golds:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(golds)
        extract_count += len(top_phrases)
        if len(top_phrases) != 0:
            prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(golds)
        # record the detailed extraction result for each document
        # prcs_single = count_micro / len(top_phrases)
        # recall_single = count_micro / len(golds)
        # output_single = str(file_name) + ',' + str(prcs_single) + ',' + str(recall_single) + ','\
        #               + ','.join(phrase for phrase in top_phrases) + '\n'
        # with open(out_dir + dataset + 'DETAILS.csv', mode='a', encoding='utf8') as f:
        #     f.write(output_single)
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(file_names)
    prcs_micro /= len(file_names)
    recall_micro /= len(file_names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(prcs, recall, f1, mrr)

    tofile_result = method_name + ',' + str(prcs) + ',' + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' \
                    + str(prcs_micro) + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(out_dir + dataset + '_RESULTS.csv', mode='a',
              encoding='utf8') as f:
        f.write(tofile_result)
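
A hypothetical invocation following the omega/phi convention from the docstring, reusing the wiki_sim edge features written by the first example as an alternative edge-feature directory (the method name and paths are illustrative only).

evaluate_extraction('KDD', 'wpr_wiki_sim', ngrams=2, damping=0.85,
                    omega=[-1], phi=None,
                    alter_edge='./data/embedding/KDD/wiki_sim/')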