def main(dataset, window):
    # Compute and save the classic features.
    data_dir = os.path.join('./data/embedding/', dataset)
    file_names = read_file(os.path.join(data_dir, 'abstract_list')).split(',')
    for file_name in file_names:
        print(file_name)
        filtered_text = filter_text(
            read_file(os.path.join(data_dir, 'abstracts', file_name)))
        # Compute and save the edge features: co-occurrence counts in the document,
        # in the cited contexts, and in the citing contexts.
        edge_freq = get_edge_freq(filtered_text, window=window)
        cited_edge_freq = sum_cite_edge_freq(file_name, data_dir, 'cited', window=window)
        citing_edge_freq = sum_cite_edge_freq(file_name, data_dir, 'citing', window=window)
        save_edge_features(file_name, data_dir, edge_freq, cited_edge_freq, citing_edge_freq)
        # Read the node features and save them in the required format.
        node_list = filtered_text.split()
        raw_node_features = read_file(
            os.path.join(data_dir, 'raw_node_features'))
        node_features = read_node_features(node_list, raw_node_features,
                                           file_name, nfselect='07')  # 023789 07
        save_node_features(file_name, data_dir, node_features)
    print('.......old_features_DONE........')
def get_tfidf(name, dataset):
    """
    Return a dict, key is word, value is tfidf of node, words not filtered.

    :param name: file name of the target doc
    :param dataset: dataset name
    """
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset.lower() + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    filelist = cfg.get('dataset', 'filelist')
    names = read_file(filelist).split()
    docs = [stem_doc(read_file(os.path.join(abstract_dir, n))) for n in names]
    words = stem_doc(read_file(os.path.join(abstract_dir, name))).split()
    tfidf = {}
    for w in set(words):
        df = 0
        for d in docs:
            if w in d:  # substring containment on the stemmed doc text
                df += 1
        idf = log(len(names) / df)  # the log base can be adjusted
        tf = words.count(w)
        tfidf[w] = tf * idf
    return tfidf
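# A minimal usage sketch for get_tfidf (the file name 'A_01.txt' and dataset key
# 'kdd' are hypothetical; real names come from the filelist in ./config/<dataset>.ini):
#   tfidf = get_tfidf('A_01.txt', 'kdd')
#   top10 = sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)[:10]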
def main(dataset, window):
    model = gensim.models.KeyedVectors.load_word2vec_format(
        './data/embedding/vec/externel_vec/wiki.en.vec', binary=False)
    # Note: because of gensim version changes, if the load above fails, the newer
    # interface can be used instead: model = gensim.models.word2vec.Word2Vec.load(MODEL_PATH)
    # model = gensim.models.Word2Vec.load_word2vec_format("wiki.en.text.vector", binary=False)
    # model.similarity("woman", "girl")
    # Compute wiki-embedding similarity features for each document's word graph.
    data_dir = os.path.join('./data/embedding/', dataset)
    file_names = read_file(os.path.join(data_dir, 'abstract_list')).split(',')
    out_dir = os.path.join(data_dir, 'wiki_sim')
    for file_name in file_names:
        print(file_name)
        filtered_text = filter_text(
            read_file(os.path.join(data_dir, 'abstracts', file_name)))
        edges = get_edges(filtered_text, window=window)
        edge_sim = {}
        for edge in edges:
            word1 = edge[0]
            word2 = edge[1]
            try:
                sim = model.similarity(word1, word2)
            except KeyError:  # out-of-vocabulary word
                sim = 0
            e = tuple(
                sorted([normalized_token(word1), normalized_token(word2)]))
            edge_sim[e] = [sim]
        edgefeatures2file(os.path.join(out_dir, file_name), edge_sim)
    print('.......wiki_sim_DONE........')
def citetextrank(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    doc_weight = int(cfg.get('ctr', 'doc_weight'))
    citing_weight = int(cfg.get('ctr', 'citing_weight'))
    cited_weight = int(cfg.get('ctr', 'cited_weight'))
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))
    text = filter_text(read_file(os.path.join(abstract_dir, name)),
                       with_tag=with_tag)
    edge_f = get_edge_freq(text, window=window)
    citing_edge_freq = cite_edge_freq(name, dataset, 'citing')
    cited_edge_freq = cite_edge_freq(name, dataset, 'cited')
    edge_weight = dict()
    for edge in edge_f:
        edge_weight[edge] = doc_weight * edge_f.get(edge, 0) \
            + citing_weight * citing_edge_freq.get(edge, 0) \
            + cited_weight * cited_edge_freq.get(edge, 0)
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
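# A minimal usage sketch for citetextrank (the file name and dataset key are
# hypothetical; pr is the dict returned by nx.pagerank, so the highest-scored
# nodes can be read off directly):
#   pr, graph = citetextrank('A_01.txt', 'kdd')
#   top_words = sorted(pr, key=pr.get, reverse=True)[:10]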
def mike(dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    filelist = cfg.get('dataset', 'filelist')
    gold_dir = cfg.get('dataset', 'gold')
    topn = int(cfg.get('dataset', 'topn'))
    extracted = cfg.get('dataset', 'extracted')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    edge_dir = cfg.get('ssp', 'edge_dir')
    node_dir = cfg.get('ssp', 'node_dir')
    supervised_dir = cfg.get('ssp', 'supervised_dir')
    alpha = float(cfg.get('ssp', 'alpha'))
    step_size = float(cfg.get('ssp', 'step_size'))
    epsilon = float(cfg.get('ssp', 'epsilon'))
    max_iter = int(cfg.get('ssp', 'max_iter'))
    ngrams = int(cfg.get('phrase', 'ngrams'))
    weight2 = float(cfg.get('phrase', 'weight2'))
    weight3 = float(cfg.get('phrase', 'weight3'))
    names = read_file(filelist).split()[:3]  # note: only the first three files are processed here
    for name in names:
        print(name)
        edge_features = read_edges(os.path.join(edge_dir, name))
        node_features = read_vec(os.path.join(node_dir, name))
        supervised_info = read_file(os.path.join(supervised_dir, name))
        (pi, omega, phi, node_list, iter_times,
         graph) = ssp(edge_features,
                      node_features,
                      supervised_info,
                      d=damping,
                      alpha=alpha,
                      step_size=step_size,
                      max_iter=max_iter,
                      epsilon=epsilon)
        print(iter_times)
def sum_cite_edge_freq(file_name, data_dir, cite_type, window=2):
    """
    Read the citation contexts and compute the citation-based edge features.
    data_dir is the dataset root directory, e.g. './data/embedding/KDD/' for the KDD dataset.
    """

    def get_cite_list(target_name, cite_list_all):
        # cite_list_all is the list of citation-context file names
        cite_list = []
        count = 0
        count_old = 0
        for name in cite_list_all.split():
            count_old = count
            if target_name in name:
                cite_list.append(name)
                count += 1
            if count > 0 and count_old == count:
                break
        return cite_list

    if cite_type == 'cited':
        cite_dir = os.path.join(data_dir, 'citedcontexts')
        cite_list_all = read_file(os.path.join(data_dir, 'cited_list'))
    elif cite_type == 'citing':
        cite_dir = os.path.join(data_dir, 'citingcontexts')
        cite_list_all = read_file(os.path.join(data_dir, 'citing_list'))
    else:
        raise ValueError('wrong cite type: ' + cite_type)
    cite_list = get_cite_list(file_name, cite_list_all)
    # The target document.
    target = filter_text(
        read_file(os.path.join(data_dir, 'abstracts', file_name)))
    cite_edge_freqs = {}
    for cite_name in cite_list:
        cite_text = filter_text(read_file(os.path.join(cite_dir, cite_name)),
                                with_tag=False)
        cite_edge_freq = single_cite_edge_freq(target, cite_text, window=window)
        for key in cite_edge_freq:
            cite_edge_freqs[key] = cite_edge_freqs.get(key, 0) + cite_edge_freq[key]
    return cite_edge_freqs
def extract_cossim(dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    filelist = cfg.get('dataset', 'filelist')
    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')
    names = read_file(filelist).split()
    # Load the vectors in word2vec text format, consistent with how
    # wordattractionrank loads the same wiki_vec file.
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=False)
    for name in names:
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        save_feature(edge_freq)
def read_lda(lda_path):
    """
    Return a dict, key is node, value is topic prob.

    :param lda_path: path to lda prob file
    """
    lda_raw = read_file(lda_path).split('\n')
    if lda_raw[-1] == '':
        lda_raw = lda_raw[:-1]
    lda = {}
    for line in lda_raw:
        key, value = line.split()
        lda[key] = float(value)
    return lda
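# Expected layout of the file read by read_lda: one "<word> <probability>" pair per
# whitespace-separated line, for example (values are illustrative only):
#   network 0.0312
#   graph 0.0198
# singletpr passes the resulting dict straight to nx.pagerank as the personalization vector.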
def cite_edge_freq(name, dataset, cite_type):
    """
    Return a dict, key is edge tuple, value is the sum of citation frequency
    in all citation contexts.

    :param name: file name of the target doc
    :param dataset: dataset name
    :param cite_type: citation type, citing or cited
    """
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset.lower() + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    if cite_type == 'citing':
        cite_dir = cfg.get('dataset', 'citing')
        cite_names = [n for n in os.listdir(cite_dir) if name in n]
    elif cite_type == 'cited':
        cite_dir = cfg.get('dataset', 'cited')
        cite_names = [n for n in os.listdir(cite_dir) if name in n]
    else:
        raise ValueError('wrong cite type: ' + cite_type)
    target = filter_text(read_file(os.path.join(abstract_dir, name)),
                         with_tag=with_tag)
    cite_edge_freqs = {}
    for cite_name in cite_names:
        cite_text = filter_text(read_file(os.path.join(cite_dir, cite_name)),
                                with_tag=False)
        cite_edge_freq = single_cite_edge_freq(target, cite_text, window=window)
        for key in cite_edge_freq:
            cite_edge_freqs[key] = cite_edge_freqs.get(key, 0) + cite_edge_freq[key]
    return cite_edge_freqs
def singletpr(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    lda_dir = cfg.get('dataset', 'lda')
    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    lda = read_lda(os.path.join(lda_dir, name))
    pr = nx.pagerank(graph, alpha=damping, personalization=lda)
    return pr, graph
def wordattractionrank(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')
    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    stemdict = stem2word(text)
    text_candidate = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidate, window=window)
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=False)
    edge_weight = {}
    for edge in edge_freq:
        word1 = edge[0]
        word2 = edge[1]
        try:
            distance = 1 - wvmodel.similarity(stemdict[word1], stemdict[word2])
        except KeyError:  # word missing from stemdict or the embedding vocabulary
            distance = 1
        words = text_candidate.split()
        tf1 = words.count(word1)
        tf2 = words.count(word2)
        cf = edge_freq[edge]
        force = calc_force(tf1, tf2, distance)
        dice = calc_dice(tf1, tf2, cf)
        edge_weight[edge] = force * dice
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
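# calc_force and calc_dice are defined elsewhere in this repository. As a reference
# sketch only (an assumption based on the published Word Attraction Rank formulation,
# not necessarily the repo's exact definitions):
#   def calc_force(tf1, tf2, distance):
#       return (tf1 * tf2) / (distance ** 2)
#   def calc_dice(tf1, tf2, cf):
#       return 2 * cf / (tf1 + tf2)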
def positionrank(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    nodes = graph.nodes()
    if with_tag:
        text = rm_tags(text)
    pos_sum = position_sum(text, nodes)
    pr = nx.pagerank(graph, alpha=damping, personalization=pos_sum)
    return pr, graph
def kee(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))
    cfg.read('./config/kee.ini')
    feature_select = cfg.get('kee', 'features')
    text = read_file(os.path.join(abstract_dir, name))
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    tf = get_term_freq(text)
    # NOTE: the original code referenced an undefined edge_weight here; the
    # feature-based weighting (using feature_select and tf) appears unfinished,
    # so co-occurrence frequency is used as the edge weight as a fallback.
    edge_weight = edge_freq
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
def textrank(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    cfg.read('./config/global.ini')
    use_edge_weight = cfg.getboolean('textrank', 'use_edge_weight')
    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    if not use_edge_weight:
        edge_freq = {e: 1 for e in edge_freq}
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    # pagerank_numpy is the dense solver; it was removed in networkx 3.0, so this
    # requires an older networkx (or switch to nx.pagerank).
    pr = nx.pagerank_numpy(graph, alpha=damping)
    return pr, graph
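# The ranking functions above (textrank, positionrank, singletpr, citetextrank,
# wordattractionrank, kee) all share the signature (name, dataset) -> (pr, graph),
# so any of them can be passed to evaluate_pagerank defined further below, e.g.
# (the dataset key 'kdd' is hypothetical):
#   evaluate_pagerank('kdd', textrank)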
citing_dir = cfg.get('dataset', 'citing')
with_tag = cfg.getboolean('dataset', 'with_tag')
gold_dir = cfg.get('dataset', 'gold')
out_dir = os.path.join('./data', d, 'abs_filtered')
# names = read_file(filelist).split()
names = [
    name for name in os.listdir(gold_dir)
    if os.path.isfile(os.path.join(gold_dir, name))
]
outpath1 = os.path.join('./data/jy/', d + '_1.txt')  # abstract
outpath2 = os.path.join('./data/jy/', d + '_2.txt')  # gold
out1 = []
out2 = []
for name in names:
    text = read_file(os.path.join(abstract_dir, name))
    words = set(text.split())
    for w in words:
        out1.append(','.join([name, w]))
    gold = read_file(os.path.join(gold_dir, name)).split('\n')
    if gold[-1] == '':
        gold = gold[:-1]
    for g in gold:
        out2.append(','.join([name, g]))
with open(outpath1, 'w', encoding='utf-8') as file:
    file.write('\n'.join(out1))
with open(outpath2, 'w', encoding='utf-8') as file:
    file.write('\n'.join(out2))
def evaluate(dataset):
    """
    Evaluate ranking result.

    :param dataset: name of dataset
    :param pr: dict, key is stemmed word, value is score
    """
    method_name = 'pagerank_zf'
    dataset = dataset.upper()
    abstract_dir = os.path.join('./data', dataset, 'abstracts')
    gold_dir = os.path.join('./data', dataset, 'gold')
    extracted = os.path.join('./result', dataset, 'extracted_zf')
    pr_type = 'a1b1'  # alpha=1, beta=1
    pr_dir = os.path.join('./data', dataset, 'rank_zf', pr_type)
    vocabulary_path = os.path.join('./data', dataset, 'rank_zf', 'vocabulary')  # the matching vocabulary file
    damping = 0.85  # candidate values: 0.2, 0.5, 0.8, 0.85
    with_tag = True
    topn = 4
    window = 2
    ngrams = 2
    weight2 = 0.6
    weight3 = 0.3
    names = [
        name for name in os.listdir(pr_dir)
        if os.path.isfile(os.path.join(pr_dir, name))
    ]
    vocabulary = id2word(vocabulary_path)
    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:
        pr = read_pr(os.path.join(pr_dir, name), vocabulary, damping)
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        edges = dict2list(edge_freq)
        graph = build_graph(edges)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        if not os.path.exists(extracted):
            os.makedirs(extracted)
        with open(os.path.join(extracted, name), encoding='utf-8', mode='w') as file:
            file.write('\n'.join(top_phrases))
        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # Stem the gold phrases or not, depending on whether the extracted phrases are stemmed.
        standard = list(' '.join(list(normalized_token(w) for w in g.split())) for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(dataset, method_name, count, prcs, recall, f1, mrr)
    eval_result = method_name + pr_type + str(damping) + '@' + str(topn) + ',' + dataset + ',' + str(prcs) + ',' \
        + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro) \
        + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(os.path.join('./result', method_name + '.csv'), mode='a', encoding='utf8') as file:
        file.write(eval_result)
filelist = cfg.get('dataset', 'filelist')
abstract_dir = cfg.get('dataset', 'abstract')
cited_dir = cfg.get('dataset', 'cited')
citing_dir = cfg.get('dataset', 'citing')
with_tag = cfg.getboolean('dataset', 'with_tag')
gold_dir = cfg.get('dataset', 'gold')
# names = read_file(filelist).split()
names = [
    name for name in os.listdir(gold_dir)
    if os.path.isfile(os.path.join(gold_dir, name))
]
phrases = []
words = []
for name in names:
    gold = read_file(os.path.join(gold_dir, name)).split('\n')
    if gold[-1] == '':
        gold = gold[:-1]
    gold = [g.lower() for g in gold]
    phrases += gold
    for g in gold:
        words += g.split()
phrase_set = set(phrases)
word_set = set(words)
phrase_count = {}
word_count = {}
for p in phrase_set:
    phrase_count[p] = phrases.count(p)
for w in word_set:
    word_count[w] = words.count(w)
with open(os.path.join('./data', d + '_phrase.csv'),
def evaluate_pagerank(dataset, extract_method):
    # Set up the logger.
    logger = logging.getLogger('evaluate')
    formatter = logging.Formatter('%(message)s')
    logfilename = '_'.join(time.asctime().replace(':', '_').split()) + '.log'
    file_handler = logging.FileHandler('./log/' + logfilename)
    file_handler.setFormatter(formatter)
    # console_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(file_handler)
    logger.setLevel(logging.DEBUG)
    # Read the config.
    method_name = extract_method.__name__
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))
    filelist = cfg.get('dataset', 'filelist')
    abstract_dir = cfg.get('dataset', 'abstract')
    gold_dir = cfg.get('dataset', 'gold')
    topn = int(cfg.get('dataset', 'topn'))
    extracted = cfg.get('dataset', 'extracted')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    ngrams = int(cfg.get('phrase', 'ngrams'))
    weight2 = float(cfg.get('phrase', 'weight2'))
    weight3 = float(cfg.get('phrase', 'weight3'))
    # names = [name for name in os.listdir(gold_dir)
    #          if os.path.isfile(os.path.join(gold_dir, name))]
    names = read_file(filelist).split()
    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:
        pr, graph = extract_method(name, dataset)
        # logger.debug(str(pr))  # dicts keep insertion order since Python 3.6; no extra handling here
        doc_path = os.path.join(abstract_dir, name)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        logger.debug(str(keyphrases))
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        detailedresult_dir = os.path.join(extracted, method_name)
        if not os.path.exists(detailedresult_dir):
            os.makedirs(detailedresult_dir)
        with open(os.path.join(detailedresult_dir, name), encoding='utf-8', mode='w') as file:
            file.write('\n'.join(top_phrases))
        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # standard = list(' '.join(list(normalized_token(w) for w in g.split())) for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)
    # Note: despite the names, prcs/recall below are pooled over all documents,
    # while prcs_micro/recall_micro are per-document averages.
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    result_print = (dataset, method_name, count, prcs, recall, f1, mrr)
    print(str(result_print))
    logger.info(str(result_print))
    eval_result = method_name + '@' + str(topn) + ',' + dataset + ',' + str(prcs) + ',' \
        + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro) \
        + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(os.path.join('./result', dataset + '.csv'), mode='a', encoding='utf8') as file:
        file.write(eval_result)
    with open(os.path.join('./result', 'all.csv'), mode='a', encoding='utf8') as file:
        file.write(eval_result)
def evaluate_extraction(dataset,
                        method_name,
                        ngrams=2,
                        damping=0.85,
                        omega=None,
                        phi=None,
                        alter_topn=None,
                        alter_edge=None,
                        alter_node=None):
    """
    Evaluate the extraction results.

    For omega and phi: [0] means no feature is used and the weight is set to 1;
    None means a simple sum over all features; [-1] uses only the last feature.
    """
    if dataset == 'KDD':
        abstr_dir = './data/embedding/KDD/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/KDD/gold/'
        edge_dir = './data/embedding/KDD/edge_features/'
        node_dir = './data/embedding/KDD/node_features/'
        file_names = read_file('./data/embedding/KDD/abstract_list').split(',')
        topn = 4
    elif dataset == 'WWW':
        abstr_dir = './data/embedding/WWW/abstracts/'
        out_dir = './result/embedding/'
        gold_dir = './data/embedding/WWW/gold/'
        edge_dir = './data/embedding/WWW/edge_features/'
        node_dir = './data/embedding/WWW/node_features/'
        file_names = read_file('./data/embedding/WWW/abstract_list').split(',')
        topn = 5
    else:
        raise ValueError('wrong dataset name: ' + dataset)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if alter_edge:
        edge_dir = alter_edge
    if alter_node:
        node_dir = alter_node
    if alter_topn:
        topn = alter_topn
    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for file_name in file_names:
        # print(file_name)
        pr, graph = wpr(edge_dir + file_name,
                        node_dir + file_name,
                        omega=omega,
                        phi=phi,
                        d=damping)
        gold = read_file(gold_dir + file_name)
        pl2 = 0.62
        pl3 = 0.3
        if dataset == "WWW":
            pl2 = 0.55
        file = os.path.join(abstr_dir, file_name)
        keyphrases = get_phrases(pr, graph, file, ng=ngrams, pl2=pl2, pl3=pl3)
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        golds = gold.split('\n')
        if golds[-1] == '':
            golds = golds[:-1]
        golds = list(' '.join(list(normalized_token(w) for w in g.split())) for g in golds)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in golds:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(golds)
        extract_count += len(top_phrases)
        if len(top_phrases) != 0:
            prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(golds)
        # Record the detailed per-document extraction results:
        # prcs_single = count_micro / len(top_phrases)
        # recall_single = count_micro / len(golds)
        # output_single = str(file_name) + ',' + str(prcs_single) + ',' + str(recall_single) + ','\
        #     + ','.join(phrase for phrase in top_phrases) + '\n'
        # with open(out_dir + dataset + 'DETAILS.csv', mode='a', encoding='utf8') as f:
        #     f.write(output_single)
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(file_names)
    prcs_micro /= len(file_names)
    recall_micro /= len(file_names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(prcs, recall, f1, mrr)
    tofile_result = method_name + ',' + str(prcs) + ',' + str(recall) + ',' + str(f1) + ',' + str(mrr) + ',' \
        + str(prcs_micro) + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(out_dir + dataset + '_RESULTS.csv', mode='a', encoding='utf8') as f:
        f.write(tofile_result)
output = []
for node in nodefeatures:
    row = [node] + nodefeatures[node]
    output.append(row)
with open(path, mode='w', encoding='utf-8', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerows(output)


if __name__ == "__main__":
    dataset = 'KDD'
    vec_type = 'total'
    dataset_dir = path.join('./data/embedding/', dataset)
    # edgefeature_dir = path.join(dataset_dir, 'edge_features')
    nodefeature_dir = path.join(dataset_dir, 'node_features')
    filenames = read_file(path.join(dataset_dir, 'abstract_list')).split(',')
    vecdir = path.join('./data/embedding/vec/liu/data_8_11/Word', dataset)
    if vec_type == 'total' and dataset == 'KDD':
        vec_dict = read_vec('./data/embedding/vec/kdd.words.emb0.119')
    elif vec_type == 'total' and dataset == 'WWW':
        vec_dict = read_vec('./data/embedding/vec/WWW0.128')
    # # Topic probabilities as node features:
    # topic_num = topic
    # ldadir = path.join('./data/embedding/data_lda/', text_type, dataset + '_' + topic_num)
    for filename in filenames:
        print(filename)
        filtered_text = filter_text(read_file(path.join(dataset_dir, 'abstracts', filename)))