import os
from configparser import ConfigParser

import gensim
import networkx as nx

# Local helpers (read_file, filter_text, get_edge_freq, dict2list,
# build_graph, cite_edge_freq, stem2word, rm_tags, normalized_token,
# get_phrases, id2word, read_pr, save_feature, ...) are assumed to be
# imported from the repository's utility modules.


def citetextrank(name, dataset):
    """CiteTextRank: weight co-occurrence edges with evidence from the
    document itself and from its citing/cited contexts."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    doc_weight = int(cfg.get('ctr', 'doc_weight'))
    citing_weight = int(cfg.get('ctr', 'citing_weight'))
    cited_weight = int(cfg.get('ctr', 'cited_weight'))
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))

    text = filter_text(read_file(os.path.join(abstract_dir, name)), with_tag=with_tag)
    edge_f = get_edge_freq(text, window=window)
    citing_edge_freq = cite_edge_freq(name, dataset, 'citing')
    cited_edge_freq = cite_edge_freq(name, dataset, 'cited')

    # Combine the three evidence sources into a single edge weight.
    edge_weight = dict()
    for edge in edge_f:
        edge_weight[edge] = doc_weight * edge_f.get(edge, 0) \
                            + citing_weight * citing_edge_freq.get(edge, 0) \
                            + cited_weight * cited_edge_freq.get(edge, 0)
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph

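# The rankers in this module share a few helpers defined elsewhere in
# the repository. For readers without the full source, minimal sketches
# of plausible implementations follow, assuming candidates arrive as a
# whitespace-joined string of filtered tokens (sketches only, not the
# repository's actual code):
def get_edge_freq(text_candidates, window=2):
    # Count co-occurrences of token pairs within a sliding window.
    words = text_candidates.split()
    edges = {}
    for i, w1 in enumerate(words):
        for w2 in words[i + 1:i + window]:
            if w1 != w2:
                edge = tuple(sorted((w1, w2)))
                edges[edge] = edges.get(edge, 0) + 1
    return edges


def dict2list(edge_weight):
    # Flatten {(u, v): w} into (u, v, w) triples for graph building.
    return [(u, v, w) for (u, v), w in edge_weight.items()]


def build_graph(edges):
    # Build an undirected weighted graph from (u, v, w) triples.
    graph = nx.Graph()
    graph.add_weighted_edges_from(edges)
    return graph
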
def singletpr(name, dataset):
    """Single Topical PageRank: personalize the random walk with the
    document's LDA topic distribution."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    lda_dir = cfg.get('dataset', 'lda')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    lda = read_lda(os.path.join(lda_dir, name))
    pr = nx.pagerank(graph, alpha=damping, personalization=lda)
    return pr, graph

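# The on-disk format of the per-document LDA files is repository-specific.
# A sketch of a read_lda that parses one "word score" pair per line and
# normalizes the scores into a personalization vector, assuming that
# format (adapt the parsing to the actual LDA output):
def read_lda(path):
    scores = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                scores[parts[0]] = float(parts[1])
    total = sum(scores.values()) or 1.0
    # Normalize so the dict can serve directly as a personalization vector.
    return {word: score / total for word, score in scores.items()}
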
def extract_cossim(dataset):
    """Precompute and save co-occurrence edge features for every
    document listed in the dataset's file list."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    filelist = cfg.get('dataset', 'filelist')
    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    names = read_file(filelist).split()
    # NOTE: the word2vec model is loaded but not yet applied when the
    # features are saved below.
    wvmodel = gensim.models.Word2Vec.load(vec_path)
    for name in names:
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        save_feature(edge_freq)

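# extract_cossim loads a word2vec model but, as written, saves only raw
# co-occurrence counts. If the intent is cosine-similarity edge features,
# one possible derivation from the loaded model is sketched below;
# sim_weight is a hypothetical helper, not part of the repository:
def sim_weight(edge_freq, wvmodel, stemdict):
    # Scale each co-occurrence count by the cosine similarity of the two
    # words' embeddings; out-of-vocabulary pairs keep the raw count.
    weighted = {}
    for (w1, w2), freq in edge_freq.items():
        try:
            sim = wvmodel.wv.similarity(stemdict[w1], stemdict[w2])
        except KeyError:
            sim = 1.0
        weighted[(w1, w2)] = freq * sim
    return weighted
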
def wordattractionrank(name, dataset):
    """Word Attraction Rank: edge weights combine an embedding-based
    attraction force with a dice coefficient of co-occurrence."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    stemdict = stem2word(text)
    text_candidate = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidate, window=window)
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=False)

    words = text_candidate.split()
    edge_weight = {}
    for edge in edge_freq:
        word1, word2 = edge
        try:
            distance = 1 - wvmodel.similarity(stemdict[word1], stemdict[word2])
        except KeyError:
            # Out-of-vocabulary words fall back to the maximum distance.
            distance = 1
        tf1 = words.count(word1)
        tf2 = words.count(word2)
        cf = edge_freq[edge]
        force = calc_force(tf1, tf2, distance)
        dice = calc_dice(tf1, tf2, cf)
        edge_weight[edge] = force * dice
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph

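# calc_force and calc_dice are assumed to follow the Word Attraction Rank
# formulation (Wang et al., 2015): an attraction force by analogy with
# gravity, and a dice coefficient of co-occurrence. A sketch under that
# assumption:
def calc_force(tf1, tf2, distance):
    # force = tf1 * tf2 / d^2; guard against zero distance.
    return tf1 * tf2 / (distance ** 2) if distance else float(tf1 * tf2)


def calc_dice(tf1, tf2, cf):
    # dice = 2 * cooccurrence / (tf1 + tf2); guard against empty counts.
    return 2 * cf / (tf1 + tf2) if tf1 + tf2 else 0.0
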
def kee(name, dataset):
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))
    cfg.read('./config/kee.ini')
    feature_select = cfg.get('kee', 'features')

    text = read_file(os.path.join(abstract_dir, name))
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    tf = get_term_freq(text)
    # NOTE: feature_select and tf are read but not yet applied here;
    # edges currently carry raw co-occurrence frequencies.
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph

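# get_term_freq is another repository helper; a minimal sketch, assuming
# plain whitespace tokens and the same normalized_token helper used in
# evaluate() below:
from collections import Counter


def get_term_freq(text):
    # Frequency of normalized tokens in the document.
    return Counter(normalized_token(w) for w in text.split())
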
def positionrank(name, dataset):
    """PositionRank: bias PageRank toward words that occur early in the
    document via a position-based personalization vector."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    nodes = graph.nodes()
    if with_tag:
        # Strip POS tags before computing token positions.
        text = rm_tags(text)
    pos_sum = position_sum(text, nodes)
    pr = nx.pagerank(graph, alpha=damping, personalization=pos_sum)
    return pr, graph

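# A sketch of position_sum following the PositionRank personalization
# (Florescu & Caragea, 2017): each candidate is weighted by the sum of
# inverse positions of its occurrences, then the vector is normalized.
# Assumes networkx >= 2, where the personalization dict may cover only a
# subset of nodes:
def position_sum(text, nodes):
    weights = {}
    for pos, word in enumerate(text.split(), start=1):
        token = normalized_token(word)
        if token in nodes:
            weights[token] = weights.get(token, 0.0) + 1.0 / pos
    total = sum(weights.values()) or 1.0
    return {w: s / total for w, s in weights.items()}
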
def textrank(name, dataset):
    """Plain TextRank, optionally using co-occurrence edge weights."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    cfg.read('./config/global.ini')
    use_edge_weight = cfg.getboolean('textrank', 'use_edge_weight')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    if not use_edge_weight:
        # Collapse all weights to 1 to recover the unweighted original TextRank.
        edge_freq = {e: 1 for e in edge_freq}
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    # pagerank_numpy was deprecated and removed in networkx 3.0.
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph

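# Illustrative usage of any of the rankers above; '1001' and 'KDD' stand
# in for a real document name and dataset config (./config/kdd.ini):
#
#     pr, graph = textrank('1001', 'KDD')
#     top10 = sorted(pr.items(), key=lambda kv: kv[1], reverse=True)[:10]
#     for word, score in top10:
#         print(word, round(score, 4))
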
def evaluate(dataset):
    """
    Evaluate ranking results for a dataset: extract the top-n keyphrases
    from precomputed PageRank scores and score them against the gold
    standard (macro/micro precision, recall, F1, and MRR).
    :param dataset: name of dataset
    """
    method_name = 'pagerank_zf'
    dataset = dataset.upper()
    abstract_dir = os.path.join('./data', dataset, 'abstracts')
    gold_dir = os.path.join('./data', dataset, 'gold')
    extracted = os.path.join('./result', dataset, 'extracted_zf')
    pr_type = 'a1b1'  # alpha = 1, beta = 1
    pr_dir = os.path.join('./data', dataset, 'rank_zf', pr_type)
    # Vocabulary mapping word ids in the saved ranks back to words.
    vocabulary_path = os.path.join('./data', dataset, 'rank_zf', 'vocabulary')
    damping = 0.85  # other values tried: 0.2, 0.5, 0.8
    with_tag = True
    topn = 4
    window = 2
    ngrams = 2
    weight2 = 0.6
    weight3 = 0.3

    names = [name for name in os.listdir(pr_dir)
             if os.path.isfile(os.path.join(pr_dir, name))]
    vocabulary = id2word(vocabulary_path)
    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    if not os.path.exists(extracted):
        os.makedirs(extracted)
    for name in names:
        pr = read_pr(os.path.join(pr_dir, name), vocabulary, damping)
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        edges = dict2list(edge_freq)
        graph = build_graph(edges)
        keyphrases = get_phrases(pr, graph, doc_path, ng=ngrams,
                                 pl2=weight2, pl3=weight3, with_tag=with_tag)
        # Keep the top-n phrases, skipping any phrase already contained
        # in a previously selected one.
        top_phrases = []
        for phrase in keyphrases:
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        with open(os.path.join(extracted, name), encoding='utf-8', mode='w') as file:
            file.write('\n'.join(top_phrases))
        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # Stem the gold phrases so they match the (stemmed) extracted phrases.
        standard = list(' '.join(list(normalized_token(w) for w in g.split()))
                        for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)
    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(dataset, method_name, count, prcs, recall, f1, mrr)
    eval_result = (method_name + pr_type + str(damping) + '@' + str(topn)
                   + ',' + dataset + ',' + str(prcs) + ',' + str(recall)
                   + ',' + str(f1) + ',' + str(mrr) + ',' + str(prcs_micro)
                   + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n')
    with open(os.path.join('./result', method_name + '.csv'),
              mode='a', encoding='utf-8') as file:
        file.write(eval_result)

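# A hedged driver sketch, assuming ./data/<DATASET>/rank_zf has been
# populated beforehand; 'KDD' and 'WWW' are placeholder dataset names.
# Each run appends one CSV row of metrics per dataset to
# ./result/pagerank_zf.csv:
if __name__ == '__main__':
    for ds in ['KDD', 'WWW']:
        evaluate(ds)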