Example No. 1
def main(dataset, window):
    # compute and generate the classical features
    data_dir = os.path.join('./data/embedding/', dataset)
    file_names = read_file(os.path.join(data_dir, 'abstract_list')).split(',')
    for file_name in file_names:
        print(file_name)
        filtered_text = filter_text(
            read_file(os.path.join(data_dir, 'abstracts', file_name)))

        # compute and save the edge features: co-occurrence count, co-occurrence count
        # in cited-paper contexts, and co-occurrence count in citing-paper contexts
        edge_freq = get_edge_freq(filtered_text, window=window)
        cited_edge_freq = sum_cite_edge_freq(file_name,
                                             data_dir,
                                             'cited',
                                             window=window)
        citing_edge_freq = sum_cite_edge_freq(file_name,
                                              data_dir,
                                              'citing',
                                              window=window)
        save_edge_features(file_name, data_dir, edge_freq, cited_edge_freq,
                           citing_edge_freq)

        # read the node features and save them in the required format
        node_list = filtered_text.split()
        raw_node_features = read_file(
            os.path.join(data_dir, 'raw_node_features'))
        node_features = read_node_features(node_list,
                                           raw_node_features,
                                           file_name,
                                           nfselect='07')  # node feature selection code: '07' (alternative tried: '023789')
        save_node_features(file_name, data_dir, node_features)
    print('.......old_features_DONE........')
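Example No. 1 relies on repository helpers that are not shown on this page (read_file, filter_text, get_edge_freq, save_edge_features, ...). As a rough illustration only, and not the project's actual code, a window-based co-occurrence counter in the spirit of get_edge_freq might look like this:

def get_edge_freq_sketch(filtered_text, window=2):
    """Hypothetical sketch: count how often two words appear within
    `window` positions of each other in the filtered text."""
    words = filtered_text.split()
    freq = {}
    for i, w1 in enumerate(words):
        for w2 in words[i + 1:i + window]:
            if w1 != w2:
                edge = tuple(sorted([w1, w2]))
                freq[edge] = freq.get(edge, 0) + 1
    return freq

The real helper presumably also normalizes tokens before building edges; the sketch skips that step.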
Example No. 2
def get_tfidf(name, dataset):
    """
    Return a dict, key is word, value is tfidf of node,
    words not filtered.

    :param name: file name of the target doc
    :param dataset: dataset name
    """
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset.lower() + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    filelist = cfg.get('dataset', 'filelist')

    names = read_file(filelist).split()
    docs = [stem_doc(read_file(os.path.join(abstract_dir, n))) for n in names]
    words = stem_doc(read_file(os.path.join(abstract_dir, name))).split()

    tfidf = {}
    for w in set(words):
        df = 0
        for d in docs:
            if w in d:
                df += 1
        idf = log(len(names) / df)  # the log base is adjustable
        tf = words.count(w)
        tfidf[w] = tf * idf
    return tfidf
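A minimal usage sketch for get_tfidf; the file name 'doc_001' and the dataset name 'kdd' are placeholders for whatever the configured filelist actually contains:

tfidf = get_tfidf('doc_001', 'kdd')   # placeholder arguments
top_words = sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top_words)

Because the target document is itself one of docs, df is always at least 1, so the idf term is well defined; smoothing such as log(N / (1 + df)) is a common variant if that assumption ever breaks.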
def main(dataset, window):

    model = gensim.models.KeyedVectors.load_word2vec_format(
        './data/embedding/vec/externel_vec/wiki.en.vec', binary=False)
    # Note: because of gensim version changes, if this load call has problems, the newer interface can be used instead: model = gensim.models.word2vec.Word2Vec.load(MODEL_PATH)
    # model = gensim.models.Word2Vec.load_word2vec_format("wiki.en.text.vector", binary=False)
    # model.similarity("woman", "girl")
    # compute and generate the classical features
    data_dir = os.path.join('./data/embedding/', dataset)
    file_names = read_file(os.path.join(data_dir, 'abstract_list')).split(',')
    out_dir = os.path.join(data_dir, 'wiki_sim')
    for file_name in file_names:
        print(file_name)
        filtered_text = filter_text(
            read_file(os.path.join(data_dir, 'abstracts', file_name)))
        edges = get_edges(filtered_text, window=window)
        edge_sim = {}
        for edge in edges:
            word1 = edge[0]
            word2 = edge[1]
            try:
                sim = model.similarity(word1, word2)
            except KeyError:
                # word not in the embedding vocabulary
                sim = 0
            e = tuple(
                sorted([normalized_token(word1),
                        normalized_token(word2)]))
            edge_sim[e] = [sim]
        edgefeatures2file(os.path.join(out_dir, file_name), edge_sim)

    print('.......wiki_sim_DONE........')
def citetextrank(name, dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    abstract_dir = cfg.get('dataset', 'abstract')
    doc_weight = int(cfg.get('ctr', 'doc_weight'))
    citing_weight = int(cfg.get('ctr', 'citing_weight'))
    cited_weight = int(cfg.get('ctr', 'cited_weight'))
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))

    text = filter_text(read_file(os.path.join(abstract_dir, name)),
                       with_tag=with_tag)
    edge_f = get_edge_freq(text, window=window)
    citing_edge_freq = cite_edge_freq(name, dataset, 'citing')
    cited_edge_freq = cite_edge_freq(name, dataset, 'cited')

    edge_weight = dict()
    for edge in edge_f:
        edge_weight[edge] = doc_weight * edge_f.get(edge, 0) \
                          + citing_weight * citing_edge_freq.get(edge, 0) \
                          + cited_weight * cited_edge_freq.get(edge, 0)
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
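citetextrank combines the in-document, citing-context and cited-context co-occurrence counts into one edge weight and then runs PageRank. dict2list and build_graph are repository helpers not shown here; a plausible sketch, assuming edges are passed around as (word1, word2, weight) triples:

import networkx as nx

def dict2list_sketch(edge_weight):
    # {(w1, w2): weight} -> [(w1, w2, weight)]
    return [(w1, w2, weight) for (w1, w2), weight in edge_weight.items()]

def build_graph_sketch(edges):
    # undirected weighted graph; nx.pagerank then reads the 'weight' attribute
    graph = nx.Graph()
    graph.add_weighted_edges_from(edges)
    return graph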
Example No. 5
def mike(dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    filelist = cfg.get('dataset', 'filelist')
    gold_dir = cfg.get('dataset', 'gold')
    topn = int(cfg.get('dataset', 'topn'))
    extracted = cfg.get('dataset', 'extracted')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    edge_dir = cfg.get('ssp', 'edge_dir')
    node_dir = cfg.get('ssp', 'node_dir')
    supervised_dir = cfg.get('ssp', 'supervised_dir')
    alpha = float(cfg.get('ssp', 'alpha'))
    step_size = float(cfg.get('ssp', 'step_size'))
    epsilon = float(cfg.get('ssp', 'epsilon'))
    max_iter = int(cfg.get('ssp', 'max_iter'))

    ngrams = int(cfg.get('phrase', 'ngrams'))
    weight2 = float(cfg.get('phrase', 'weight2'))
    weight3 = float(cfg.get('phrase', 'weight3'))

    names = read_file(filelist).split()[:3]

    for name in names:
        print(name)
        edge_features = read_edges(os.path.join(edge_dir, name))
        node_features = read_vec(os.path.join(node_dir, name))
        supervised_info = read_file(os.path.join(supervised_dir, name))

        (pi, omega, phi, node_list, iter_times,
         graph) = ssp(edge_features,
                      node_features,
                      supervised_info,
                      d=damping,
                      alpha=alpha,
                      step_size=step_size,
                      max_iter=max_iter,
                      epsilon=epsilon)
        print(iter_times)
Example No. 6
def sum_cite_edge_freq(file_name, data_dir, cite_type, window=2):
    """
    读取文件,计算引用特征
    data_dir为数据集根目录,如KDD数据集为'./data/embedding/KDD/'
    """
    def get_cite_list(target_name, cite_list_all):
        # cite_list_all is the full list of citation-context file names
        cite_list = []
        count = 0
        count_old = 0
        for name in cite_list_all.split():
            count_old = count
            if target_name in name:
                cite_list.append(name)
                count += 1
            if count > 0 and count_old == count:
                # matching names are contiguous in the list, so stop once the block ends
                break
        return cite_list

    if cite_type == 'cited':
        cite_dir = os.path.join(data_dir, 'citedcontexts')
        cite_list_all = read_file(os.path.join(data_dir, 'cited_list'))
    elif cite_type == 'citing':
        cite_dir = os.path.join(data_dir, 'citingcontexts')
        cite_list_all = read_file(os.path.join(data_dir, 'citing_list'))
    else:
        raise ValueError('wrong cite type: ' + str(cite_type))
    cite_list = get_cite_list(file_name, cite_list_all)
    # target document
    target = filter_text(
        read_file(os.path.join(data_dir, 'abstracts', file_name)))
    cite_edge_freqs = {}
    for cite_name in cite_list:
        cite_text = filter_text(read_file(os.path.join(cite_dir, cite_name)),
                                with_tag=False)
        cite_edge_freq = single_cite_edge_freq(target,
                                               cite_text,
                                               window=window)
        for key in cite_edge_freq:
            cite_edge_freqs[key] = cite_edge_freqs.get(key,
                                                       0) + cite_edge_freq[key]

    return cite_edge_freqs
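single_cite_edge_freq is not shown on this page. In a CiteTextRank-style feature it would count, inside one citation context, window co-occurrences of word pairs that also occur in the target document; a hedged sketch of that reading (not the repository's code):

def single_cite_edge_freq_sketch(target, cite_text, window=2):
    """Hypothetical sketch: co-occurrence counts inside one citation
    context, restricted to words that also appear in the target document."""
    target_words = set(target.split())
    words = cite_text.split()
    freq = {}
    for i, w1 in enumerate(words):
        for w2 in words[i + 1:i + window]:
            if w1 != w2 and w1 in target_words and w2 in target_words:
                edge = tuple(sorted([w1, w2]))
                freq[edge] = freq.get(edge, 0) + 1
    return freq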
Example No. 7
def extract_cossim(dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset+'.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    filelist = cfg.get('dataset', 'filelist')

    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    names = read_file(filelist).split()
    wvmodel = gensim.models.Word2Vec.load(vec_path)
    for name in names:
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        # NOTE: wvmodel is loaded above but the cosine-similarity computation over
        # the edges is not included in this excerpt; only the raw co-occurrence
        # counts are passed to save_feature here.
        save_feature(edge_freq)
Example No. 8
def read_lda(lda_path):
    """
    Return a dict, key is node, value is topic prob

    :param lda_path: path to lda prob file
    """
    lda_raw = read_file(lda_path).split('\n')
    if lda_raw[-1] == '':
        lda_raw = lda_raw[:-1]
    lda = {}
    for line in lda_raw:
        key, value = line.split()
        lda[key] = float(value)
    return lda
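read_lda expects a plain text file with one "word probability" pair per line. A small usage sketch with a hypothetical path:

# contents of a hypothetical LDA file:
#   network 0.31
#   keyphrase 0.22
#   graph 0.12
lda = read_lda('./data/lda/doc_001')   # placeholder path
# -> {'network': 0.31, 'keyphrase': 0.22, 'graph': 0.12}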
def cite_edge_freq(name, dataset, cite_type):
    """
    Return a dict, key is edge tuple,
    value is the sum of citation frequency in all citation contexts

    :param name: file name of the target doc
    :param dataset: dataset name
    :param cite_type: citation type, citing or cited
    """
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset.lower() + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    if cite_type == 'citing':
        cite_dir = cfg.get('dataset', 'citing')
        cite_names = [n for n in os.listdir(cite_dir) if name in n]
    elif cite_type == 'cited':
        cite_dir = cfg.get('dataset', 'cited')
        cite_names = [n for n in os.listdir(cite_dir) if name in n]
    else:
        raise ValueError('wrong cite type: ' + str(cite_type))

    target = filter_text(read_file(os.path.join(abstract_dir, name)),
                         with_tag=with_tag)
    cite_edge_freqs = {}
    for cite_name in cite_names:
        cite_text = filter_text(read_file(os.path.join(cite_dir, cite_name)),
                                with_tag=False)
        cite_edge_freq = single_cite_edge_freq(target,
                                               cite_text,
                                               window=window)
        for key in cite_edge_freq:
            cite_edge_freqs[key] = cite_edge_freqs.get(key,
                                                       0) + cite_edge_freq[key]

    return cite_edge_freqs
def singletpr(name, dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset+'.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    lda_dir = cfg.get('dataset', 'lda')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    lda = read_lda(os.path.join(lda_dir, name))
    pr = nx.pagerank(graph, alpha=damping, personalization=lda)
    return pr, graph
def wordattractionrank(name, dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    stemdict = stem2word(text)
    text_candidate = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidate, window=window)
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path,
                                                              binary=False)
    edge_weight = {}
    for edge in edge_freq:
        word1 = edge[0]
        word2 = edge[1]
        try:
            distance = 1 - wvmodel.similarity(stemdict[word1], stemdict[word2])
        except KeyError:
            # word missing from stemdict or from the embedding vocabulary
            distance = 1
        words = text_candidate.split()
        tf1 = words.count(word1)
        tf2 = words.count(word2)
        cf = edge_freq[edge]
        force = calc_force(tf1, tf2, distance)
        dice = calc_dice(tf1, tf2, cf)
        edge_weight[edge] = force * dice
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
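calc_force and calc_dice are not shown here. In the usual Word Attraction Rank formulation they are the gravity-like attraction force between two words and the dice coefficient of their co-occurrence; a hedged sketch under that assumption:

def calc_force_sketch(tf1, tf2, distance):
    # attraction force: product of term frequencies over squared embedding distance
    if distance == 0:
        distance = 1e-6   # guard against identical vectors
    return (tf1 * tf2) / (distance ** 2)

def calc_dice_sketch(tf1, tf2, cf):
    # dice coefficient: 2 * co-occurrence count / (tf1 + tf2)
    return (2 * cf) / (tf1 + tf2)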
def positionrank(name, dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    nodes = graph.nodes()
    if with_tag:
        text = rm_tags(text)
    pos_sum = position_sum(text, nodes)
    pr = nx.pagerank(graph, alpha=damping, personalization=pos_sum)
    return pr, graph
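position_sum is not shown on this page. PositionRank builds its personalization vector from word positions, typically as the sum of 1/position over every occurrence, normalized over the graph nodes; a hedged sketch of that scheme (normalized_token is the repository's token normalizer):

def position_sum_sketch(text, nodes):
    """Hypothetical sketch: PositionRank-style position weights."""
    weights = {}
    for pos, word in enumerate(text.split(), start=1):
        w = normalized_token(word)        # assumed to match the graph's node labels
        if w in nodes:
            weights[w] = weights.get(w, 0.0) + 1.0 / pos
    total = sum(weights.values()) or 1.0
    return {w: v / total for w, v in weights.items()}

Note that some older networkx releases required the personalization dict to contain an entry for every node in the graph.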
Example No. 13
def kee(name, dataset):
    
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset+'.ini'))

    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))

    cfg.read('./config/kee.ini')
    feature_select = cfg.get('kee', 'features')

    text = read_file(os.path.join(abstract_dir, name))
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    tf = get_term_freq(text)
    # NOTE: the original snippet references edge_weight without defining it; as a
    # minimal fix, fall back to the raw co-occurrence counts (the feature-based
    # weighting suggested by feature_select and tf is not included in this excerpt).
    edge_weight = edge_freq
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
def textrank(name, dataset):

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    cfg.read('./config/global.ini')
    use_edge_weight = cfg.getboolean('textrank', 'use_edge_weight')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    if not use_edge_weight:
        edge_freq = {e: 1 for e in edge_freq}
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    pr = nx.pagerank_numpy(graph, alpha=damping)
    return pr, graph
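nx.pagerank_numpy was deprecated in networkx 2.6 and removed in 3.0. If textrank is run on a recent networkx, the call can be swapped for the standard implementation, which takes the same graph and damping factor:

# drop-in replacement for the nx.pagerank_numpy call above on networkx >= 3.0
pr = nx.pagerank(graph, alpha=damping)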
    citing_dir = cfg.get('dataset', 'citing')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    gold_dir = cfg.get('dataset', 'gold')

    out_dir = os.path.join('./data', d, 'abs_filtered')

    # names = read_file(filelist).split()
    names = [
        name for name in os.listdir(gold_dir)
        if os.path.isfile(os.path.join(gold_dir, name))
    ]

    outpath1 = os.path.join('./data/jy/', d + '_1.txt')  # abstract
    outpath2 = os.path.join('./data/jy/', d + '_2.txt')  # gold
    out1 = []
    out2 = []
    for name in names:
        text = read_file(os.path.join(abstract_dir, name))
        words = set(text.split())
        for w in words:
            out1.append(','.join([name, w]))
        gold = read_file(os.path.join(gold_dir, name)).split('\n')
        if gold[-1] == '':
            gold = gold[:-1]
        for g in gold:
            out2.append(','.join([name, g]))

    with open(outpath1, 'w', encoding='utf-8') as file:
        file.write('\n'.join(out1))
    with open(outpath2, 'w', encoding='utf-8') as file:
        file.write('\n'.join(out2))
def evaluate(dataset):
    """
    Evaluate ranking result.

    :param dataset: name of dataset
    :param pr: dict, key is stemmed word, value is score
    """

    method_name = 'pagerank_zf'
    dataset = dataset.upper()
    abstract_dir = os.path.join('./data', dataset, 'abstracts')
    gold_dir = os.path.join('./data', dataset, 'gold')

    extracted = os.path.join('./result', dataset, 'extracted_zf')
    pr_type = 'a1b1'  # alpha=1, beta=1
    pr_dir = os.path.join('./data', dataset, 'rank_zf', pr_type)
    vocabulary_path = os.path.join('./data', dataset, 'rank_zf',
                                   'vocabulary')  # the corresponding vocabulary file
    damping = 0.85  # candidate values: 0.2, 0.5, 0.8, 0.85

    with_tag = True
    topn = 4
    window = 2
    ngrams = 2
    weight2 = 0.6
    weight3 = 0.3

    names = [
        name for name in os.listdir(pr_dir)
        if os.path.isfile(os.path.join(pr_dir, name))
    ]
    vocabulary = id2word(vocabulary_path)

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:
        pr = read_pr(os.path.join(pr_dir, name), vocabulary, damping)
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        edges = dict2list(edge_freq)
        graph = build_graph(edges)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        if not os.path.exists(extracted):
            os.makedirs(extracted)
        with open(os.path.join(extracted, name), encoding='utf-8',
                  mode='w') as file:
            file.write('\n'.join(top_phrases))

        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # whether to stem the gold standard depends on whether the extracted phrases are stemmed
        standard = list(' '.join(list(normalized_token(w) for w in g.split()))
                        for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(dataset, method_name, count, prcs, recall, f1, mrr)

    eval_result = method_name + pr_type + str(damping) + '@' + str(topn) + ',' + dataset + ',' + str(prcs) + ',' \
                  + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro) \
                  + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(os.path.join('./result', method_name + '.csv'),
              mode='a',
              encoding='utf8') as file:
        file.write(eval_result)
    filelist = cfg.get('dataset', 'filelist')
    abstract_dir = cfg.get('dataset', 'abstract')
    cited_dir = cfg.get('dataset', 'cited')
    citing_dir = cfg.get('dataset', 'citing')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    gold_dir = cfg.get('dataset', 'gold')

    # names = read_file(filelist).split()
    names = [
        name for name in os.listdir(gold_dir)
        if os.path.isfile(os.path.join(gold_dir, name))
    ]
    phrases = []
    words = []
    for name in names:
        gold = read_file(os.path.join(gold_dir, name)).split('\n')
        if gold[-1] == '':
            gold = gold[:-1]
        gold = [g.lower() for g in gold]
        phrases += gold
        for g in gold:
            words += g.split()
    phrase_set = set(phrases)
    word_set = set(words)
    phrase_count = {}
    word_count = {}
    for p in phrase_set:
        phrase_count[p] = phrases.count(p)
    for w in word_set:
        word_count[w] = words.count(w)
    with open(os.path.join('./data', d + '_phrase.csv'),
Example No. 18
def evaluate_pagerank(dataset, extract_method):

    # setup logger
    logger = logging.getLogger('evaluate')
    formatter = logging.Formatter('%(message)s')
    logfilename = '_'.join(time.asctime().replace(':', '_').split()) + '.log'
    file_handler = logging.FileHandler('./log/' + logfilename)
    file_handler.setFormatter(formatter)
    # console_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(file_handler)

    logger.setLevel(logging.DEBUG)

    # read config
    method_name = extract_method.__name__
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    filelist = cfg.get('dataset', 'filelist')
    abstract_dir = cfg.get('dataset', 'abstract')
    gold_dir = cfg.get('dataset', 'gold')
    topn = int(cfg.get('dataset', 'topn'))
    extracted = cfg.get('dataset', 'extracted')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    ngrams = int(cfg.get('phrase', 'ngrams'))
    weight2 = float(cfg.get('phrase', 'weight2'))
    weight3 = float(cfg.get('phrase', 'weight3'))

    # names = [name for name in os.listdir(gold_dir)
    #          if os.path.isfile(os.path.join(gold_dir, name))]
    names = read_file(filelist).split()

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:

        pr, graph = extract_method(name, dataset)
        # logger.debug(str(pr))  # dicts keep insertion order since Python 3.6, so no extra handling is done here
        doc_path = os.path.join(abstract_dir, name)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        logger.debug(str(keyphrases))
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        detailedresult_dir = os.path.join(extracted, method_name)
        if not os.path.exists(detailedresult_dir):
            os.makedirs(detailedresult_dir)
        with open(os.path.join(detailedresult_dir, name),
                  encoding='utf-8',
                  mode='w') as file:
            file.write('\n'.join(top_phrases))

        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # standard = list(' '.join(list(normalized_token(w) for w in g.split())) for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    result_print = (dataset, method_name, count, prcs, recall, f1, mrr)
    print(str(result_print))
    logger.info(str(result_print))

    eval_result = method_name + '@' + str(topn) + ',' + dataset + ',' + str(prcs) + ',' \
                  + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro) \
                  + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(os.path.join('./result', dataset + '.csv'),
              mode='a',
              encoding='utf8') as file:
        file.write(eval_result)
    with open(os.path.join('./result', 'all.csv'), mode='a',
              encoding='utf8') as file:
        file.write(eval_result)
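evaluate_pagerank receives the ranking function itself as extract_method, so any snippet above with the (name, dataset) -> (pr, graph) signature plugs in directly. A usage sketch; the dataset name 'kdd' is a placeholder for whichever .ini config exists:

# evaluate several of the rankers defined above on the same dataset
for method in (textrank, citetextrank, positionrank):
    evaluate_pagerank('kdd', method)   # 'kdd' is a placeholder dataset name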
Example No. 19
def evaluate_extraction(dataset,
                        method_name,
                        ngrams=2,
                        damping=0.85,
                        omega=None,
                        phi=None,
                        alter_topn=None,
                        alter_edge=None,
                        alter_node=None):
    """
    评价实验结果

    omega,phi, [0]代表不适用任何特征,权重设置为1。None为所有特征的简单加和。[-1]只用最后一个特征。
    """
    if dataset == 'KDD':
        abstr_dir = './data/embedding/KDD/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/KDD/gold/'
        edge_dir = './data/embedding/KDD/edge_features/'
        node_dir = './data/embedding/KDD/node_features/'
        file_names = read_file('./data/embedding/KDD/abstract_list').split(',')
        topn = 4
    elif dataset == 'WWW':
        abstr_dir = './data/embedding/WWW/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/WWW/gold/'
        edge_dir = './data/embedding/WWW/edge_features/'
        node_dir = './data/embedding/WWW/node_features/'
        file_names = read_file('./data/embedding/WWW/abstract_list').split(',')
        topn = 5
    else:
        raise ValueError('wrong dataset name: ' + str(dataset))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if alter_edge:
        edge_dir = alter_edge
    if alter_node:
        node_dir = alter_node
    if alter_topn:
        topn = alter_topn

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for file_name in file_names:
        # print(file_name)
        pr, graph = wpr(edge_dir + file_name,
                        node_dir + file_name,
                        omega=omega,
                        phi=phi,
                        d=damping)

        gold = read_file(gold_dir + file_name)
        pl2 = 0.62
        pl3 = 0.3
        if dataset == "WWW":
            pl2 = 0.55
        file = os.path.join(abstr_dir, file_name)
        keyphrases = get_phrases(pr, graph, file, ng=ngrams, pl2=pl2, pl3=pl3)
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        golds = gold.split('\n')
        if golds[-1] == '':
            golds = golds[:-1]
        golds = list(' '.join(list(normalized_token(w) for w in g.split()))
                     for g in golds)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in golds:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(golds)
        extract_count += len(top_phrases)
        if len(top_phrases) != 0:
            prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(golds)
        # record the detailed per-document extraction results
        # prcs_single = count_micro / len(top_phrases)
        # recall_single = count_micro / len(golds)
        # output_single = str(file_name) + ',' + str(prcs_single) + ',' + str(recall_single) + ','\
        #               + ','.join(phrase for phrase in top_phrases) + '\n'
        # with open(out_dir + dataset + 'DETAILS.csv', mode='a', encoding='utf8') as f:
        #     f.write(output_single)
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(file_names)
    prcs_micro /= len(file_names)
    recall_micro /= len(file_names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(prcs, recall, f1, mrr)

    tofile_result = method_name + ',' + str(prcs) + ',' + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' \
                    + str(prcs_micro) + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(out_dir + dataset + '_RESULTS.csv', mode='a',
              encoding='utf8') as f:
        f.write(tofile_result)
    output = []
    for node in nodefeatures:
        row = [node] + nodefeatures[node]
        output.append(row)
    with open(path, mode='w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(output)

if __name__ == "__main__":
    dataset = 'KDD'
    vec_type = 'total'

    dataset_dir = path.join('./data/embedding/', dataset)
    # edgefeature_dir = path.join(dataset_dir, 'edge_features')
    nodefeature_dir = path.join(dataset_dir, 'node_features')
    filenames = read_file(path.join(dataset_dir, 'abstract_list')).split(',')
    vecdir = path.join('./data/embedding/vec/liu/data_8_11/Word', dataset)

    if vec_type == 'total' and dataset == 'KDD':
        vec_dict = read_vec('./data/embedding/vec/kdd.words.emb0.119')
    elif vec_type == 'total' and dataset == 'WWW':
        vec_dict = read_vec('./data/embedding/vec/WWW0.128')

    # # topic probabilities as node features
    # topic_num = topic
    # ldadir = path.join('./data/embedding/data_lda/', text_type, dataset+'_'+topic_num)

    for filename in filenames:
        print(filename)

        filtered_text = filter_text(read_file(path.join(dataset_dir, 'abstracts', filename)))