Example #1
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""

import os
from collections import OrderedDict
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../chinese-dataset'
tfidf_dir = '/zh_tfidf'

eval_map = mean_avg_precision.MAP('zh_CKIP_doc_freq200_map.csv')
k = 5
eval_recall = recall.Recall('zh_CKIP_doc_freq200_recall@' + str(k) + '.csv')
for topic in os.listdir(dataset_dir):
    print('***********************' + topic)
    topic_dir = os.path.join(dataset_dir, topic)

    doc_recall = {}
    for doc in os.listdir(topic_dir):
        entities = {}
        # read the document's entity lines and strip surrounding whitespace;
        # the with statement closes the file, so no explicit close() is needed
        with open(os.path.join(topic_dir, doc), 'r',
                  encoding='utf-8-sig') as f:
            content = [x.strip() for x in f.readlines()]

        for e in content:
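
The example is cut off inside the per-document loop. A minimal sketch of the likely continuation, assuming each stripped line of a document file is one entity mention and that the doc_freq200 run ranks entities by raw frequency (the helper name below is illustrative, not part of the original code):

from collections import Counter


def top_k_entities_by_frequency(lines, k=5):
    """Count how often each entity line occurs and return the k most frequent."""
    counts = Counter(line for line in lines if line)         # ignore empty lines
    return [entity for entity, _ in counts.most_common(k)]   # ordered by frequency

The resulting top-k list per document would then be scored against the topic's reference entities via eval_map and eval_recall.
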
Example #2
"""
@author: Yi-Ru Cheng
"""

import os
from collections import OrderedDict
import numpy
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall 


dataset_dir = '../wikipedia'
tfidf_dir = 'wiki_tfidf'

eval_map = mean_avg_precision.MAP('wiki_tfidf_map.csv')
k = 5
eval_recall = recall.Recall('wiki_tfidf_recall@'+str(k)+'.csv')

def getTopKEntitiesByTFIDFperDoc(topic, doc, k):
    doc_dir = os.path.join(tfidf_dir, topic, doc)
    
    tfidf_map = {}
    # each line has the form "entity|score"; keep the first score seen per entity
    with open(doc_dir, 'r', encoding='utf-8-sig') as f:
        content = f.readlines()
    for l in content:
        (entity, score) = l.split('|')
        if entity not in tfidf_map:
            tfidf_map[entity] = [float(score.strip('\n'))]
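
The function body is truncated before anything is returned. Assuming it is meant to hand back the k entities with the highest TF-IDF score, a plausible completion is sketched below as a standalone helper; since each value in tfidf_map is a one-element score list, kv[1][0] is the score (names here are illustrative, not the author's):

def top_k_by_score(tfidf_map, k=5):
    """Return the k entities with the highest stored TF-IDF score."""
    ranked = sorted(tfidf_map.items(), key=lambda kv: kv[1][0], reverse=True)
    return [entity for entity, _ in ranked[:k]]
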
Example #3
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""

import os
import numpy
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../chinese-dataset'
tfidf_dir = 'tfidf_zh_ckip'

eval_map = mean_avg_precision.MAP('zh_CKIP_doc_tfidf_map.csv')
k = 5
eval_recall = recall.Recall('zh_CKIP_doc_tfidf_recall@' + str(k) + '.csv')


def getTopKEntitiesByTFIDFperDoc(topic, doc, k):
    doc_dir = os.path.join(tfidf_dir, topic, doc)

    tfidf_map = {}
    # each line has the form "entity|score"; keep the first score seen per entity
    with open(doc_dir, 'r', encoding='utf-8-sig') as f:
        content = f.readlines()
    for l in content:
        (entity, score) = l.split('|')
        if entity not in tfidf_map:
            tfidf_map[entity] = [float(score.strip('\n'))]
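
The per-topic driver that consumes this helper is not included in the excerpt. A rough sketch of how it would typically be wired up, assuming getTopKEntitiesByTFIDFperDoc is completed to return its ranked entity list and that the TF-IDF files live under tfidf_dir/<topic>/<doc> (the MAP and Recall call sites are omitted because their interfaces are not visible here):

import os


def rank_all_documents(tfidf_dir, k=5):
    """Collect the top-k entities for every document, grouped by topic."""
    results = {}
    for topic in os.listdir(tfidf_dir):
        topic_dir = os.path.join(tfidf_dir, topic)
        results[topic] = {doc: getTopKEntitiesByTFIDFperDoc(topic, doc, k)
                          for doc in os.listdir(topic_dir)}
    return results
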
Example #4
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 17 18:56:00 2018

@author: Yiru
"""

import os
import random
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../sample-en-dataset'

eval_map = mean_avg_precision.MAP('doc_baseline_map.csv')
k = 5
eval_recall = recall.Recall('doc_baseline_recall@'+str(k)+'.csv')


for topic in os.listdir(dataset_dir):
    print('***********************'+topic)
    topic_dir = os.path.join(dataset_dir, topic, 'tagMe')
    
    doc_recall = {}
    for doc in os.listdir(topic_dir):
        entities = {}
        # the tagMe files store entities as one '|'-separated string;
        # lowercase each entry and drop empty pieces (the with block closes the file)
        with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f:
            content = [l.strip().lower() for l in f.read().split('|') if l.strip().lower()]
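
The random import suggests this baseline simply samples entities from each document's tagMe annotations. A small sketch of such a baseline, with the function name and the fixed-seed option added purely for illustration:

import random


def random_k_entities(entities, k=5, seed=None):
    """Baseline: pick k distinct entities uniformly at random from one document."""
    rng = random.Random(seed)
    unique = list(dict.fromkeys(entities))           # de-duplicate, keep first-seen order
    return rng.sample(unique, min(k, len(unique)))
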
        
Example #5
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""

import os
from collections import OrderedDict
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall 


dataset_dir = '../wikipedia'

eval_map = mean_avg_precision.MAP('wiki_freq200_map.csv')
k = 5
eval_recall = recall.Recall('wiki_freq200_recall@'+str(k)+'.csv')
for topic in os.listdir(dataset_dir):
    print('***********************'+topic)
    topic_dir = os.path.join(dataset_dir, topic)
    
    doc_recall = {}
    for doc in os.listdir(topic_dir):
        
        entities = {}
        
        """
        counting the frequency of each entity
        """
        with open(os.path.join(topic_dir, doc), 'r', encoding='utf-8-sig') as f:
            content = f.readlines()
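
The excerpt stops right after reading the file; the frequency counting mirrors Example #1. The Recall class itself is not shown, but given the recall@5 naming in the output file, each per-document score is presumably the standard recall at k, roughly as follows (a generic sketch, not the evaluation.recall implementation):

def recall_at_k(predicted, relevant, k=5):
    """Fraction of the relevant entities found among the top-k predictions."""
    if not relevant:
        return 0.0
    hits = len(set(predicted[:k]) & set(relevant))
    return hits / len(relevant)
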
Example #6
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""

import os
import numpy
import sys

sys.path.append('..')
from evaluation import mean_avg_precision, recall

dataset_dir = '../../sample-en-dataset'
tfidf_dir = '/tfidf'

eval_map = mean_avg_precision.MAP('doc_tfidf_map.csv')
k = 5
eval_recall = recall.Recall('doc_tfidf_recall@' + str(k) + '.csv')


def getTopKEntitiesByTFIDFperDoc(topic, doc, k):
    doc_dir = os.path.join(tfidf_dir, topic, doc)

    tfidf_map = {}
    # each line has the form "entity|score"; keep the first score seen per entity
    with open(doc_dir, 'r', encoding='utf-8-sig') as f:
        content = f.readlines()
    for l in content:
        (entity, score) = l.split('|')
        if entity not in tfidf_map:
            tfidf_map[entity] = [float(score.strip('\n'))]
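
Likewise, mean_avg_precision.MAP is only referenced, never defined in the excerpt. Mean average precision over ranked entity lists is conventionally computed as below; this is a generic reference sketch, not the repository's own implementation:

def average_precision(predicted, relevant):
    """Average of precision@i over the ranks i where a relevant entity appears."""
    relevant = set(relevant)
    hits, precisions = 0, []
    for i, entity in enumerate(predicted, start=1):
        if entity in relevant:
            hits += 1
            precisions.append(hits / i)
    return sum(precisions) / len(relevant) if relevant else 0.0


def mean_average_precision(runs):
    """Mean of average precision over (predicted, relevant) pairs."""
    return sum(average_precision(p, r) for p, r in runs) / len(runs) if runs else 0.0
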
Example #7
# -*- coding: utf-8 -*-
"""
@author: Yi-Ru Cheng
"""

import os
from collections import OrderedDict
import ast
import numpy
import csv
from package import pagerank
import sys
sys.path.append('..')
from evaluation import mean_avg_precision, recall

edges_dir = '../doc_edges'

eval_map = mean_avg_precision.MAP('doc_pagerank_map.csv')
k = 5
eval_recall = recall.Recall('doc_pagerank_recall@' + str(k) + '.csv')
for topic in os.listdir(edges_dir):
    print('***********************' + topic)
    topic_dir = os.path.join(edges_dir, topic)

    doc_recall = {}
    for doc in os.listdir(topic_dir):

        entities = {}
        # each CSV row maps an entity to a Python literal describing its edges;
        # the with block closes the file, so no explicit close() is needed
        with open(os.path.join(topic_dir, doc), 'r',
                  encoding='utf-8-sig') as f:
            reader = csv.reader(f)
            entities = {row[0]: ast.literal_eval(row[1]) for row in reader}
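
The ranking step with package.pagerank is cut off here. For reference, PageRank over an adjacency mapping like entities (entity mapped to an iterable of neighbouring entities, assuming that is what the parsed CSV literals contain) can be computed by plain power iteration; the sketch below is generic and is not the package.pagerank implementation:

def pagerank_scores(adjacency, damping=0.85, iterations=50):
    """Generic power-iteration PageRank over {node: [neighbour, ...]}."""
    nodes = set(adjacency) | {n for nbrs in adjacency.values() for n in nbrs}
    if not nodes:
        return {}
    rank = {n: 1.0 / len(nodes) for n in nodes}
    for _ in range(iterations):
        new_rank = {n: (1.0 - damping) / len(nodes) for n in nodes}
        for node in nodes:
            neighbours = adjacency.get(node, [])
            if neighbours:
                share = damping * rank[node] / len(neighbours)
                for n in neighbours:
                    new_rank[n] += share
            else:
                # dangling node: spread its mass uniformly over all nodes
                for n in nodes:
                    new_rank[n] += damping * rank[node] / len(nodes)
        rank = new_rank
    return rank

The k highest-scoring entities per document would then be fed to eval_map and eval_recall, as in the other examples.
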