def __init__(self, candidates):
    self.G = nx.Graph()
    _candidate_filter(candidates)
    self.candidates = candidates
    neighbors = {}
    self.index_map = {}
    self.candidate_uris = set()
    for cand in candidates:
        for e in cand.entities:
            self.candidate_uris.add(e.uri)
            # merge page-link and co-occurrence counts into a single neighbor weight
            neighbors[e.uri] = defaultdict(lambda: 0)
            for n_uri, n_count in NgramService.get_wiki_title_pagelinks(e.uri).items():
                neighbors[e.uri][n_uri] = int(n_count)
            for n_uri, n_count in NgramService.get_wiki_links_cooccur(e.uri).items():
                neighbors[e.uri][n_uri] += int(n_count)
            # delete self-reference
            try:
                del neighbors[e.uri][e.uri]
            except KeyError:
                pass
    for cand in candidates:
        """
        :type cand: CandidateEntity
        """
        for e in cand.entities:
            for neighbor, weight in neighbors[e.uri].iteritems():
                if self.G.has_edge(e.uri, neighbor):
                    continue
                try:
                    self.G.add_edge(e.uri, neighbor, {'w': int(weight)})
                # happens because of malformed links
                except ValueError:
                    pass
            # always add candidates
            self.G.add_node(e.uri)
    # prune degree-1 nodes, except the original candidates
    to_remove = set()
    for node, degree in self.G.degree_iter():
        if degree <= 1:
            to_remove.add(node)
    to_remove = to_remove.difference(self.candidate_uris)
    self.G.remove_nodes_from(to_remove)
    if self.G.number_of_nodes() > 0:
        # row-normalized (L1) sparse adjacency matrix over the pruned graph
        self.matrix = nx.to_scipy_sparse_matrix(self.G, weight='w', dtype=np.float64)
        self.matrix = normalize(self.matrix, norm='l1', axis=1)
        for i, uri in enumerate(self.G.nodes()):
            self.index_map[uri] = i
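# Hedged sketch (not part of the original class): one way the row-normalized
# `self.matrix` and `self.index_map` built above could be used is as the
# transition matrix of a random walk with restart over the entity graph.
# The damping factor, iteration count and restart vector below are illustrative
# assumptions; the actual scoring method of SemanticGraph is not shown here.
import numpy as np

def random_walk_scores(matrix, index_map, seed_uris, alpha=0.85, iters=50):
    """Score every node by a random walk restarted at the seed URIs."""
    n = matrix.shape[0]
    restart = np.zeros(n)
    for uri in seed_uris:
        if uri in index_map:
            restart[index_map[uri]] = 1.0
    if restart.sum() == 0:
        return {}
    restart /= restart.sum()
    scores = restart.copy()
    for _ in range(iters):
        # matrix is row-stochastic (L1-normalized), so transposed multiplication
        # propagates probability mass along the weighted co-occurrence edges
        scores = alpha * matrix.T.dot(scores) + (1 - alpha) * restart
    return {uri: scores[i] for uri, i in index_map.items()}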
def closeness_pruning(candidates, pickle_dict=None):
    import itertools
    if pickle_dict is None:
        pickle_dict = {}
    for cand1, cand2 in itertools.combinations(candidates, 2):
        prev_max_count = 0
        new_cand1_entities = []
        new_cand2_entities = []
        if cand1.cand_string == cand2.cand_string:
            continue
        if len(cand1.entities) > len(cand2.entities):
            cand2, cand1 = cand1, cand2
        cand2_entities = set([e.uri for e in cand2.entities])
        if not cand2_entities:
            continue
        for entity in cand1.entities:
            if entity.uri in pickle_dict:
                related_uris = pickle_dict[entity.uri]
            else:
                related_uris = NgramService.get_wiki_links_cooccur(entity.uri)
                pickle_dict[entity.uri] = related_uris
            intersect = set(related_uris.keys()).intersection(cand2_entities)
            if intersect:
                for intersect_uri in intersect:
                    pickle_dict[intersect_uri] = NgramService.get_wiki_links_cooccur(intersect_uri)
                    max_count = int(related_uris[intersect_uri]) + \
                        int(pickle_dict[intersect_uri].get(entity.uri, 0))
                    if prev_max_count < max_count:
                        new_cand1_entities = [entity]
                        new_cand2_entities = [e for e in cand2.entities if e.uri == intersect_uri]
                        prev_max_count = max_count
                    elif prev_max_count == max_count:
                        new_cand1_entities.append(entity)
                        new_cand2_entities.extend([e for e in cand2.entities if e.uri == intersect_uri])
        if new_cand1_entities:
            cand1.entities = new_cand1_entities
        if new_cand2_entities:
            cand2.entities = new_cand2_entities
    for cand in candidates:
        if cand.cand_string.islower() and len(cand.entities) > 1:
            cand.entities = []
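# Hedged usage sketch (assumptions: import path and candidate pipeline mirror the
# tests below, and NgramService is already configured against HBase).
# closeness_pruning() mutates the candidate entity lists in place; pickle_dict
# acts as a reusable cache of co-occurrence lookups across documents.
from kilogram.entity_linking import extract_candidates

cooccur_cache = {}
candidates = extract_candidates([('Obama', 'NNP'), ('met', 'VBD'), ('Michelle', 'NNP')])
closeness_pruning(candidates, cooccur_cache)  # entities are pruned in place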
def _get_ngram_probs(self, context, filter_types=None):
    # context is expected to be a 5-token window centered on the token being typed;
    # build the pre, post and mid trigram/bigram windows around the substituted token
    context[2] = SUBSTITUTION_TOKEN
    trigrams = [context[:3], context[-3:], context[1:4]]
    bigrams = [context[1:3], None, context[2:4]]
    types = []
    for bigram, trigram in zip(bigrams, trigrams):
        type_values = NgramService.hbase_raw(self.hbase_table, " ".join(trigram), "ngram:value")
        # back off to the bigram if the trigram has no counts
        if not type_values and bigram:
            type_values = NgramService.hbase_raw(self.hbase_table, " ".join(bigram), "ngram:value")
        if type_values:
            type_values_unpacked = ListPacker.unpack(type_values)
            if filter_types:
                type_values_unpacked = [x for x in type_values_unpacked if x[0] in filter_types]
            if type_values_unpacked:
                types.append(type_values_unpacked)
    totals = [sum(int(x) for x in zip(*type_values)[1]) for type_values in types]
    probs = [[(entity_type, int(count)/totals[i]) for entity_type, count in type_values]
             for i, type_values in enumerate(types)]
    return probs
def _get_uri_counts_hbase(self):
    table = "wiki_anchor_ngrams"
    column = "ngram:value"
    prev_cand_string = None
    for cand_string in (self.cand_string, self.cand_string.title(), split_camel_case(self.cand_string)):
        if prev_cand_string and cand_string == prev_cand_string:
            continue
        res = NgramService.hbase_raw(table, cand_string, column)
        if res:
            candidates = ListPacker.unpack(res)
            self.cand_string = cand_string
            return [(uri, float(count)) for uri, count in candidates]
        prev_cand_string = cand_string
    res = NgramService.hbase_raw("wiki_anchor_ngrams_nospace",
                                 self.cand_string.replace(' ', '').lower(), column)
    if res:
        candidates = ListPacker.unpack(res)
        self.cand_string = self.cand_string.replace(' ', '').lower()
        return [(uri, float(count)) for uri, count in candidates]
    return None
def __init__(self, candidates):
    self.G = nx.Graph()
    _candidate_filter(candidates)
    self.candidates = candidates
    neighbors = {}
    self.index_map = {}
    #self.candidate_uris1 = set()
    #for cand in candidates:
    #    self.candidate_uris1.add(cand.cand_string)
    self.candidate_uris = set()
    for cand in candidates:
        total = sum([e.count for e in cand.entities])
        for e in cand.entities:
            mention_uri = _mention_uri(e.uri, cand.cand_string)
            self.candidate_uris.add(mention_uri)
            neighbors[mention_uri] = NgramService.get_wiki_link_mention_cooccur(mention_uri)
            # delete self-reference
            try:
                del neighbors[mention_uri][mention_uri]
            except KeyError:
                pass
            for neighbor, weight in neighbors[mention_uri].iteritems():
                #if neighbor.split('|')[0] not in self.candidate_uris1:
                #    continue
                if self.G.has_edge(mention_uri, neighbor):
                    continue
                try:
                    self.G.add_edge(mention_uri, neighbor, {'w': int(weight)})
                # happens because of malformed links
                except ValueError:
                    pass
            # always add candidates, with the mention prior as a node attribute
            self.G.add_node(mention_uri, {'prior': e.count/total})
    # prune degree-1 nodes, except the original candidates
    to_remove = set()
    for node, degree in self.G.degree_iter():
        if degree <= 1:
            to_remove.add(node)
    to_remove = to_remove.difference(self.candidate_uris)
    self.G.remove_nodes_from(to_remove)
    if self.G.number_of_nodes() > 0:
        # row-normalized (L1) sparse adjacency matrix over the pruned graph
        self.matrix = nx.to_scipy_sparse_matrix(self.G, weight='w', dtype=np.float64)
        self.matrix = normalize(self.matrix, norm='l1', axis=1)
        for i, uri in enumerate(self.G.nodes()):
            self.index_map[uri] = i
def __init__(self, candidates):
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("ipc:///tmp/wikipedia_signatures")
    self.G = nx.DiGraph()
    self.candidates = candidates
    neighbors = {}
    for cand in candidates:
        for e in cand.entities:
            neighbors[e.uri] = NgramService.get_wiki_edge_weights(e.uri)
            # delete self-reference
            try:
                del neighbors[e.uri][e.uri]
            except KeyError:
                pass
    for cand_i in candidates:
        """
        :type cand_i: CandidateEntity
        """
        for cand_j in candidates:
            # do not link same candidates
            if cand_i == cand_j:
                continue
            # skip edges between candidates originating from the same noun
            if cand_j.noun_index is not None and cand_i.noun_index is not None \
                    and cand_j.noun_index == cand_i.noun_index:
                continue
            for e_i in cand_i.entities:
                for e_j in cand_j.entities:
                    if not self.G.has_edge(e_i.uri, e_j.uri):
                        weight = int(neighbors[e_i.uri].get(e_j.uri, 0))
                        if weight > 0:
                            self.G.add_edge(e_i.uri, e_j.uri, {'w': weight})
    self.uri_fragment_counts = defaultdict(lambda: 0)
    # TODO: do not prune if no nodes?
    if self.G.number_of_nodes() == 0:
        return
    for cand in candidates:
        for e in cand.entities:
            if self.G.has_node(e.uri):
                self.uri_fragment_counts[e.uri] += 1
def _get_full_ngram_probs(self, context):
    # pre, post, mid bigrams
    context[2] = SUBSTITUTION_TOKEN
    trigrams = [context[:3], context[-3:], context[1:4]]
    bigrams = [context[1:3], 'NONE', context[2:4]]
    types = {2: [], 3: []}
    for bigram, trigram in zip(bigrams, trigrams):
        for ngram, order in zip((bigram, trigram), (2, 3)):
            type_values = NgramService.hbase_raw(self.hbase_table, " ".join(ngram), "ngram:value") or []
            type_values_unpacked = ListPacker.unpack(type_values)
            type_dict = {'ngram': ' '.join(ngram)}
            try:
                total = sum(int(x) for x in zip(*type_values_unpacked)[1])
                features = [{'name': e_type,
                             'count': int(count),
                             'prob': int(count)/total,
                             'pmi': int(count)/total/self.type_priors[e_type]}
                            for e_type, count in type_values_unpacked]
                features.sort(key=lambda x: x['pmi'], reverse=True)
                type_dict['values'] = features
            except:
                type_dict['values'] = []
            types[order].append(type_dict)
    return types
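# Hedged illustration of the structure returned by _get_full_ngram_probs above:
# a dict keyed by n-gram order, each entry holding the context window that was
# looked up and its candidate types ranked by PMI.  All concrete values below
# are invented for the example; '<SUB>' stands in for SUBSTITUTION_TOKEN.
example = {
    3: [{'ngram': 'president <SUB> said',
         'values': [{'name': 'Person', 'count': 120, 'prob': 0.6, 'pmi': 4.2}]}],
    2: [{'ngram': '<SUB> said', 'values': []}],
}
for order, windows in example.items():
    for window in windows:
        for feature in window['values'][:3]:  # e.g. keep the top-3 types per window
            print order, window['ngram'], feature['name'], feature['pmi']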
import unittest

from dataset.dbpedia import NgramEntityResolver
from dataset.entity_linking.msnbc import DataSet
from entity_linking import syntactic_subsumption, extract_candidates
from entity_linking.babelfy import link, SemanticGraph
from entity_linking.evaluation import Metrics
from kilogram import NgramService
import kilogram

NgramService.configure(hbase_host=('diufpc304', '9090'))
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver("/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
                          "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
msnbc_data = DataSet('../extra/data/msnbc/texts/', '../extra/data/msnbc/msnbc_truth.txt', ner)


class TestEntityLinking(unittest.TestCase):
    def test_extract_candidates(self):
        self.assertIsNotNone(extract_candidates([("Obama", "NNP")]))
        self.assertEquals(len(extract_candidates([('Obama', 'NNP'), ('went', 'VBD'), ('with', 'IN'),
                                                  ('me', 'PRP'), ('for', 'IN'), ('a', 'DT'),
                                                  ('walk', 'NN'), ('.', '.')])), 2)

    def test_entity_linking(self):
        print link("After his departure from Buffalo, Saban returned to coach college football teams "
                   "including Miami, Army and UCF.")
        print link("Barack and Michelle visited us today.")
        print link("GitHub experienced a massive DDoS attack yesterday evening.")
        print link("Saban, previously a head coach of NFL's Miami, is now coaching Crimson Tide. "
                   "His achievements include leading LSU to the BCS National Championship once "
                   "and Alabama three times.")
import unittest

from kilogram.dataset.dbpedia import NgramEntityResolver
from kilogram.dataset.entity_linking.msnbc import DataSet as MSNBC
from kilogram.dataset.entity_linking.microposts import DataSet as Tweets
from kilogram.entity_linking import syntactic_subsumption
from kilogram.entity_linking.evaluation import Metrics
from kilogram.entity_linking.rel_rw import SemanticGraph
from kilogram.entity_linking.util.ml import Feature
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
import kilogram

kilogram.DEBUG = False
NgramService.configure(hbase_host=('diufpc304', '9090'), subst_table='typogram')
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver("/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
                          "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
ngram_predictor = NgramTypePredictor('typogram')


class TestEntityLinkingKBMSNBC(unittest.TestCase):
    msnbc_data = MSNBC('../extra/data/msnbc/texts/', '../extra/data/msnbc/msnbc_truth.txt', ner)

    def test_d2kb(self):
        print 'REL-RW, D2KB'
        feature_file = open('features.txt', 'w')
        metric = Metrics()
def setUp(self):
    NgramService.configure(hbase_host=('diufpc304', 9090), subst_table='typogram')
    dbpedia_ontology = DBPediaOntology('fixtures/dbpedia_2015-04.owl')
    self.ngram_predictor = NgramTypePredictor('typogram', dbpedia_ontology)
import unittest

from dataset.dbpedia import NgramEntityResolver
from dataset.entity_linking.msnbc import DataSet as MSNBC
from kilogram.entity_linking import syntactic_subsumption
from kilogram.entity_linking.evaluation import Metrics
from kilogram.entity_linking.mention_rw import SemanticGraph
from kilogram.entity_linking.util.ml import Feature
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
import kilogram

kilogram.DEBUG = False
NgramService.configure(hbase_host=('diufpc304', '9090'), subst_table='typogram')
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver("/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
                          "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
ngram_predictor = NgramTypePredictor('typogram')


class TestEntityLinkingKBMSNBC(unittest.TestCase):
    msnbc_data = MSNBC('../extra/data/msnbc/texts/', '../extra/data/msnbc/msnbc_truth.txt', ner)

    def test_d2kb(self):
        print 'MENTION-RW, D2KB'
        feature_file = open('features.txt', 'w')
        metric = Metrics()
        feature_file.write(Feature.header()+'\n')
        for datafile in self.msnbc_data.data:
#!/usr/bin/env python
"""
./app.py typogram
"""
import sys

from flask import Flask, jsonify, request

from kilogram.dataset.dbpedia import DBPediaOntology
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
from kilogram.lang.unicode import strip_unicode

NgramService.configure(hbase_host=('diufpc304', 9090), subst_table=sys.argv[1])

app = Flask(__name__)
dbpedia_ontology = DBPediaOntology('dbpedia_2015-04.owl')
ngram_predictor = NgramTypePredictor(sys.argv[1], dbpedia_ontology)


@app.route('/predict/types/context', methods=['GET'])
def predict_ngram_from_context():
    context = strip_unicode(request.args.get('context').strip()).split()
    return jsonify({'types': ngram_predictor.predict_types_features(context)})


@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response


if __name__ == '__main__':
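# Hedged usage sketch for the /predict/types/context endpoint above; host and
# port are assumptions (the app.run() call is truncated above), and the context
# string is split on whitespace server-side, so it should be a plain token sequence.
import requests

resp = requests.get('http://localhost:5000/predict/types/context',
                    params={'context': 'the senator from new york'})
print resp.json()['types']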
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--dbpedia-data-dir', dest='dbpedia_data_dir', action='store', required=True,
                    help='DBpedia data directory')
parser.add_argument('--ner-host', dest='ner_host', action='store', required=True,
                    help='Hostname of the server where Stanford NER is running')
parser.add_argument('--types-table', dest='types_table', action='store', required=True,
                    help='Typed N-gram table in HBase')
parser.add_argument('--hbase-host', dest='hbase_host', action='store', required=True,
                    help='HBase gateway host')
parser.add_argument('--hbase-port', dest='hbase_port', action='store', default='9090',
                    help='HBase gateway port')
args = parser.parse_args()

NgramService.configure(hbase_host=(args.hbase_host, args.hbase_port))
kilogram.NER_HOSTNAME = args.ner_host
ner = NgramEntityResolver(os.path.join(args.dbpedia_data_dir, "dbpedia_data.txt"),
                          os.path.join(args.dbpedia_data_dir, "dbpedia_2015-04.owl"))
dbpedia_ontology = DBPediaOntology(os.path.join(args.dbpedia_data_dir, "dbpedia_2015-04.owl"))
ngram_predictor = NgramTypePredictor(args.types_table, dbpedia_ontology)

app = Flask(__name__)


@app.route('/entity-linking/d2kb/prior', methods=['POST'])
def d2kb_prior():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates
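# Hedged usage sketch for the /entity-linking/d2kb/prior endpoint defined above
# (the handler body is truncated).  Host/port are assumptions, and the exact
# schema of 'mentions' is determined by the DataSet constructor; a list of
# surface strings is assumed here for illustration.
import requests

payload = {'text': "Saban is now coaching Crimson Tide.",
           'mentions': ['Saban', 'Crimson Tide']}
resp = requests.post('http://localhost:5000/entity-linking/d2kb/prior', json=payload)
print resp.json()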