""" import os from collections import OrderedDict import numpy import sys sys.path.append('..') from evaluation import mean_avg_precision, recall dataset_dir = '../wikipedia' tfidf_dir = 'wiki_tfidf' eval_map = mean_avg_precision.MAP('wiki_tfidf_map.csv') k = 5 eval_recall = recall.Recall('wiki_tfidf_recall@'+str(k)+'.csv') def getTopKEntitiesByTFIDFperDoc(topic, doc, k): doc_dir = os.path.join(tfidf_dir, topic, doc) tfidf_map = {} with open(doc_dir, 'r', encoding='utf-8-sig') as f: content = f.readlines() f.close() for l in content: (entity, score) = l.split('|') if entity not in tfidf_map: tfidf_map[entity] = [] tfidf_map[entity].append(float(score.strip('\n'))) tfidf_map = sorted({k.replace(' ', '_').lower():numpy.sum(v) for k,v in tfidf_map.items()}.items(), key=lambda x:x[1], reverse=True)
""" @author: Yi-Ru Cheng """ import os import numpy import sys sys.path.append('..') from evaluation import mean_avg_precision, recall dataset_dir = '../../chinese-dataset' tfidf_dir = 'tfidf_zh_ckip' eval_map = mean_avg_precision.MAP('zh_CKIP_doc_tfidf_map.csv') k = 5 eval_recall = recall.Recall('zh_CKIP_doc_tfidf_recall@' + str(k) + '.csv') def getTopKEntitiesByTFIDFperDoc(topic, doc, k): doc_dir = os.path.join(tfidf_dir, topic, doc) tfidf_map = {} with open(doc_dir, 'r', encoding='utf-8-sig') as f: content = f.readlines() f.close() for l in content: (entity, score) = l.split('|') if entity not in tfidf_map: tfidf_map[entity] = [] tfidf_map[entity].append(float(score.strip('\n')))
""" @author: Yi-Ru Cheng """ import os from collections import OrderedDict import sys sys.path.append('..') from evaluation import mean_avg_precision, recall dataset_dir = '../../chinese-dataset' tfidf_dir = '/zh_tfidf' eval_map = mean_avg_precision.MAP('zh_CKIP_doc_freq200_map.csv') k = 5 eval_recall = recall.Recall('zh_CKIP_doc_freq200_recall@' + str(k) + '.csv') for topic in os.listdir(dataset_dir): print('***********************' + topic) topic_dir = os.path.join(dataset_dir, topic) doc_recall = {} for doc in os.listdir(topic_dir): entities = {} with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f: content = f.readlines() content = [x.strip() for x in content] f.close() for e in content: if e in entities.keys():
"""
Created on Sat Mar 17 18:56:00 2018

@author: Yiru
"""
import os
import random
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../sample-en-dataset'
eval_map = mean_avg_precision.MAP('doc_baseline_map.csv')
k = 5
eval_recall = recall.Recall('doc_baseline_recall@' + str(k) + '.csv')

for topic in os.listdir(dataset_dir):
    print('***********************' + topic)
    topic_dir = os.path.join(dataset_dir, topic, 'tagMe')
    doc_recall = {}
    for doc in os.listdir(topic_dir):
        entities = {}
        with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f:
            content = [l.strip().lower() for l in f.read().split('|') if l.strip().lower()]
        # count entity frequency
        for e in content:
            if e in entities:
                entities[e] += 1
            else:
                entities[e] = 1
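        # Truncated in the original. Given the random import, this baseline
        # presumably picks k entities at random from the document; that and
        # the top_k name are assumptions. eval_map / eval_recall calls are
        # omitted because their method names are not recoverable.
        top_k = random.sample(list(entities.keys()), min(k, len(entities)))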
""" @author: Yi-Ru Cheng """ import os from collections import OrderedDict import sys sys.path.append('..') from evaluation import mean_avg_precision, recall dataset_dir = '../wikipedia' eval_map = mean_avg_precision.MAP('wiki_freq200_map.csv') k = 5 eval_recall = recall.Recall('wiki_freq200_recall@'+str(k)+'.csv') for topic in os.listdir(dataset_dir): print('***********************'+topic) topic_dir = os.path.join(dataset_dir, topic) doc_recall = {} for doc in os.listdir(topic_dir): entities = {} """ counting the frequency of each entity """ with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f: content = f.readlines() f.close()
"""
@author: Yi-Ru Cheng
"""
import os
import numpy
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../sample-en-dataset'
tfidf_dir = 'tfidf'
eval_map = mean_avg_precision.MAP('doc_tfidf_map.csv')
k = 5
eval_recall = recall.Recall('doc_tfidf_recall@' + str(k) + '.csv')


def getTopKEntitiesByTFIDFperDoc(topic, doc, k):
    doc_dir = os.path.join(tfidf_dir, topic, doc)
    tfidf_map = {}
    with open(doc_dir, 'r', encoding='utf-8-sig') as f:
        content = f.readlines()
    # each line is "entity|score"; collect every score seen for an entity
    for l in content:
        (entity, score) = l.split('|')
        if entity not in tfidf_map:
            tfidf_map[entity] = []
        tfidf_map[entity].append(float(score.strip('\n')))
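    # Truncated in the original. The wiki variant of this function continues
    # by summing each entity's scores and ranking them, so the same step is
    # sketched here; the normalization and return shape are assumptions.
    ranked = sorted(
        {e.replace(' ', '_').lower(): numpy.sum(v) for e, v in tfidf_map.items()}.items(),
        key=lambda x: x[1], reverse=True)
    return [entity for entity, score in ranked[:k]]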
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""
import os
from collections import OrderedDict
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../sample-en-dataset'
eval_map = mean_avg_precision.MAP('doc_freq200_map.csv')
k = 5
eval_recall = recall.Recall('doc_freq200_recall@' + str(k) + '.csv')

for topic in os.listdir(dataset_dir):
    print('***********************' + topic)
    topic_dir = os.path.join(dataset_dir, topic, 'tagMe')
    doc_recall = {}
    for doc in os.listdir(topic_dir):
        entities = {}
        # counting the frequency of each entity
        with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f:
            content = [
                l.strip().lower()
                for l in f.read().split('|')
                if l.strip().lower()
            ]
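        # Truncated in the original. The presumed continuation, mirroring the
        # baseline and the other frequency scripts: count each entity and
        # keep the k most frequent. The ranking step and top_k name are
        # assumptions; eval_map / eval_recall calls are omitted because their
        # method names are not recoverable from this file.
        for e in content:
            entities[e] = entities.get(e, 0) + 1
        ranked = OrderedDict(sorted(entities.items(), key=lambda x: x[1], reverse=True))
        top_k = list(ranked.keys())[:k]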
import os
from collections import OrderedDict
import ast
import numpy
import csv
from package import pagerank
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

edges_dir = '../doc_edges'
eval_map = mean_avg_precision.MAP('doc_pagerank_map.csv')
k = 5
eval_recall = recall.Recall('doc_pagerank_recall@' + str(k) + '.csv')

for topic in os.listdir(edges_dir):
    print('***********************' + topic)
    topic_dir = os.path.join(edges_dir, topic)
    doc_recall = {}
    for doc in os.listdir(topic_dir):
        entities = {}
        with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f:
            # each CSV row holds an entity and a Python-literal edge list
            reader = csv.reader(f)
            entities = {row[0]: ast.literal_eval(row[1]) for row in reader}
        index_list = list(entities.keys())
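        # Truncated in the original. index_list fixes an entity order, which
        # suggests the edge lists are turned into a square adjacency matrix
        # before ranking. The sketch below assumes each value in entities is
        # a list of neighbor names; the actual entry point of
        # package.pagerank is unknown, so that call stays a placeholder.
        n = len(index_list)
        adj = numpy.zeros((n, n))
        for i, src in enumerate(index_list):
            for dst in entities[src]:
                if dst in entities:
                    adj[i][index_list.index(dst)] = 1
        # scores = pagerank.<unknown entry point>(adj)
        # top_k = [index_list[i] for i in numpy.argsort(scores)[::-1][:k]]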