Code example #1
    def __init__(self, candidates):
        self.G = nx.Graph()
        _candidate_filter(candidates)
        self.candidates = candidates
        neighbors = {}
        self.index_map = {}

        self.candidate_uris = set()
        for cand in candidates:
            for e in cand.entities:
                self.candidate_uris.add(e.uri)
                neighbors[e.uri] = defaultdict(lambda: 0)
                for n_uri, n_count in NgramService.get_wiki_title_pagelinks(
                        e.uri).items():
                    neighbors[e.uri][n_uri] = int(n_count)
                for n_uri, n_count in NgramService.get_wiki_links_cooccur(
                        e.uri).items():
                    neighbors[e.uri][n_uri] += int(n_count)
                # delete self
                try:
                    del neighbors[e.uri][e.uri]
                except KeyError:
                    pass

        for cand in candidates:
            """
            :type cand: CandidateEntity
            """
            for e in cand.entities:
                for neighbor, weight in neighbors[e.uri].iteritems():
                    if self.G.has_edge(e.uri, neighbor):
                        continue
                    try:
                        self.G.add_edge(e.uri, neighbor, {'w': int(weight)})
                    # happens because of malformed links
                    except ValueError:
                        pass
                # always add candidates
                self.G.add_node(e.uri)

        # prune 1-degree edges except original candidates
        to_remove = set()
        for node, degree in self.G.degree_iter():
            if degree <= 1:
                to_remove.add(node)
        to_remove = to_remove.difference(self.candidate_uris)
        self.G.remove_nodes_from(to_remove)
        if self.G.number_of_nodes() > 0:
            self.matrix = nx.to_scipy_sparse_matrix(self.G,
                                                    weight='w',
                                                    dtype=np.float64)
            self.matrix = normalize(self.matrix, norm='l1', axis=1)
        for i, uri in enumerate(self.G.nodes()):
            self.index_map[uri] = i
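
Note: the constructor above ends with a row-stochastic transition matrix (L1-normalized adjacency) plus a URI-to-row index map, which is exactly the input a random walk needs. Below is a minimal sketch of a random walk with restart over such a matrix, using an invented 3-node graph and an assumed damping factor; this is illustrative code, not kilogram's.

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

# toy weighted adjacency standing in for the graph built above
adjacency = sp.csr_matrix(np.array([[0, 2, 1],
                                    [2, 0, 3],
                                    [1, 3, 0]], dtype=np.float64))
transition = normalize(adjacency, norm='l1', axis=1)  # rows sum to 1

restart = np.array([1.0, 0.0, 0.0])  # all restart mass on node 0
rank = restart.copy()
alpha = 0.85  # damping factor, an assumed value
for _ in range(100):
    # follow an edge with probability alpha, teleport back otherwise
    new_rank = alpha * transition.T.dot(rank) + (1 - alpha) * restart
    done = np.abs(new_rank - rank).sum() < 1e-9
    rank = new_rank
    if done:
        break
print(rank)  # stationary scores; well-connected nodes score highest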
Code example #2
File: __init__.py Project: dragoon/kilogram
def closeness_pruning(candidates, pickle_dict=None):
    import itertools
    if pickle_dict is None:
        pickle_dict = {}
    for cand1, cand2 in itertools.combinations(candidates, 2):
        prev_max_count = 0
        new_cand1_entities = []
        new_cand2_entities = []
        if cand1.cand_string == cand2.cand_string:
            continue
        if len(cand1.entities) > len(cand2.entities):
            cand2, cand1 = cand1, cand2
        cand2_entities = set([e.uri for e in cand2.entities])
        if not cand2_entities:
            continue
        for entity in cand1.entities:
            if entity.uri in pickle_dict:
                related_uris = pickle_dict[entity.uri]
            else:
                related_uris = NgramService.get_wiki_links_cooccur(entity.uri)
                pickle_dict[entity.uri] = related_uris
            intersect = set(related_uris.keys()).intersection(cand2_entities)
            if intersect:
                for intersect_uri in intersect:
                    pickle_dict[
                        intersect_uri] = NgramService.get_wiki_links_cooccur(
                            intersect_uri)
                    max_count = int(related_uris[intersect_uri]) + int(
                        pickle_dict[intersect_uri].get(entity.uri, 0))
                    if prev_max_count < max_count:
                        new_cand1_entities = [entity]
                        new_cand2_entities = [
                            e for e in cand2.entities if e.uri == intersect_uri
                        ]
                        prev_max_count = max_count
                    elif prev_max_count == max_count:
                        new_cand1_entities.append(entity)
                        new_cand2_entities.extend([
                            e for e in cand2.entities if e.uri == intersect_uri
                        ])

        if new_cand1_entities:
            cand1.entities = new_cand1_entities
        if new_cand2_entities:
            cand2.entities = new_cand2_entities

    for cand in candidates:
        if cand.cand_string.islower() and len(cand.entities) > 1:
            cand.entities = []
Code example #3
File: __init__.py Project: XI-lab/kilogram
def closeness_pruning(candidates, pickle_dict=None):
    import itertools
    if pickle_dict is None:
        pickle_dict = {}
    for cand1, cand2 in itertools.combinations(candidates, 2):
        prev_max_count = 0
        new_cand1_entities = []
        new_cand2_entities = []
        if cand1.cand_string == cand2.cand_string:
            continue
        if len(cand1.entities) > len(cand2.entities):
            cand2, cand1 = cand1, cand2
        cand2_entities = set([e.uri for e in cand2.entities])
        if not cand2_entities:
            continue
        for entity in cand1.entities:
            if entity.uri in pickle_dict:
                related_uris = pickle_dict[entity.uri]
            else:
                related_uris = NgramService.get_wiki_links_cooccur(entity.uri)
                pickle_dict[entity.uri] = related_uris
            intersect = set(related_uris.keys()).intersection(cand2_entities)
            if intersect:
                for intersect_uri in intersect:
                    pickle_dict[intersect_uri] = NgramService.get_wiki_links_cooccur(intersect_uri)
                    max_count = int(related_uris[intersect_uri]) + int(pickle_dict[intersect_uri].get(entity.uri, 0))
                    if prev_max_count < max_count:
                        new_cand1_entities = [entity]
                        new_cand2_entities = [e for e in cand2.entities if e.uri == intersect_uri]
                        prev_max_count = max_count
                    elif prev_max_count == max_count:
                        new_cand1_entities.append(entity)
                        new_cand2_entities.extend([e for e in cand2.entities if e.uri == intersect_uri])

        if new_cand1_entities:
            cand1.entities = new_cand1_entities
        if new_cand2_entities:
            cand2.entities = new_cand2_entities

    for cand in candidates:
        if cand.cand_string.islower() and len(cand.entities) > 1:
            cand.entities = []
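
A toy illustration of the idea behind closeness_pruning, with hand-rolled stand-ins for CandidateEntity and NgramService; all names and counts below are invented for the example.

class Entity(object):
    def __init__(self, uri):
        self.uri = uri


class Candidate(object):
    def __init__(self, cand_string, uris):
        self.cand_string = cand_string
        self.entities = [Entity(u) for u in uris]


# fake co-occurrence counts playing the role of
# NgramService.get_wiki_links_cooccur()
COOCCUR = {
    'Miami_Dolphins': {'Nick_Saban': '120'},
    'Miami': {'Nick_Saban': '3'},
    'Nick_Saban': {'Miami_Dolphins': '80', 'Miami': '2'},
}

saban = Candidate('Saban', ['Nick_Saban'])
miami = Candidate('Miami', ['Miami', 'Miami_Dolphins'])

# for the mention with fewer senses ('Saban'), keep the other mention's
# senses with the highest mutual co-occurrence count
best_uri, best_count = None, 0
miami_uris = set(e.uri for e in miami.entities)
for e1 in saban.entities:
    for uri, count in COOCCUR.get(e1.uri, {}).items():
        if uri not in miami_uris:
            continue
        mutual = int(count) + int(COOCCUR.get(uri, {}).get(e1.uri, 0))
        if mutual > best_count:
            best_uri, best_count = uri, mutual
print(best_uri)  # -> Miami_Dolphins (200 mutual links vs. 5 for Miami)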
Code example #4
File: __init__.py Project: dragoon/kilogram
    def __init__(self, candidates):
        self.G = nx.Graph()
        _candidate_filter(candidates)
        self.candidates = candidates
        neighbors = {}
        self.index_map = {}

        #self.candidate_uris1 = set()
        #for cand in candidates:
        #    self.candidate_uris1.add(cand.cand_string)

        self.candidate_uris = set()
        for cand in candidates:
            total = sum([e.count for e in cand.entities])
            for e in cand.entities:
                mention_uri = _mention_uri(e.uri, cand.cand_string)
                self.candidate_uris.add(mention_uri)
                neighbors[
                    mention_uri] = NgramService.get_wiki_link_mention_cooccur(
                        mention_uri)
                # delete self
                try:
                    del neighbors[mention_uri][mention_uri]
                except KeyError:
                    pass
                for neighbor, weight in neighbors[mention_uri].iteritems():
                    #if neighbor.split('|')[0] not in self.candidate_uris1:
                    #    continue
                    if self.G.has_edge(mention_uri, neighbor):
                        continue
                    try:
                        self.G.add_edge(mention_uri, neighbor,
                                        {'w': int(weight)})
                    # happens because of malformed links
                    except ValueError:
                        pass
                # always add candidates
                self.G.add_node(mention_uri, {'prior': e.count / total})

        # prune 1-degree edges except original candidates
        to_remove = set()
        for node, degree in self.G.degree_iter():
            if degree <= 1:
                to_remove.add(node)
        to_remove = to_remove.difference(self.candidate_uris)
        self.G.remove_nodes_from(to_remove)

        if self.G.number_of_nodes() > 0:
            self.matrix = nx.to_scipy_sparse_matrix(self.G,
                                                    weight='w',
                                                    dtype=np.float64)
            self.matrix = normalize(self.matrix, norm='l1', axis=1)
        for i, uri in enumerate(self.G.nodes()):
            self.index_map[uri] = i
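
The _mention_uri helper is not shown in this example. Judging from the commented-out neighbor.split('|')[0] above, mention URIs appear to have the form 'uri|mention'; the following is a guessed stand-in, not the actual kilogram implementation.

def _mention_uri(uri, cand_string):
    # hypothetical reconstruction based on the split('|') hint above
    return uri + '|' + cand_string.replace(' ', '_')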
Code example #5
File: prediction.py Project: dragoon/kilogram
    def _get_ngram_probs(self, context, filter_types=None):
        # pre, post, mid bigrams
        context[2] = SUBSTITUTION_TOKEN
        trigrams = [context[:3], context[-3:], context[1:4]]
        bigrams = [context[1:3], None, context[2:4]]

        types = []
        for bigram, trigram in zip(bigrams, trigrams):
            type_values = NgramService.hbase_raw(self.hbase_table, " ".join(trigram), "ngram:value")
            if not type_values and bigram:
                type_values = NgramService.hbase_raw(self.hbase_table, " ".join(bigram), "ngram:value")
            if type_values:
                type_values_unpacked = ListPacker.unpack(type_values)
                if filter_types:
                    type_values_unpacked = [x for x in type_values_unpacked if x[0] in filter_types]
                if type_values_unpacked:
                    types.append(type_values_unpacked)
        totals = [sum(int(x) for x in zip(*type_values)[1]) for type_values in types]
        probs = [[(entity_type, int(count)/totals[i]) for entity_type, count in type_values]
                 for i, type_values in enumerate(types)]
        return probs
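
For intuition, this is what the window slicing above produces on a 5-token context; '<SUB>' stands in for the real SUBSTITUTION_TOKEN constant defined in kilogram.

SUBSTITUTION_TOKEN = '<SUB>'  # stand-in for the real constant
context = ['the', 'president', 'Obama', 'said', 'that']
context[2] = SUBSTITUTION_TOKEN
trigrams = [context[:3], context[-3:], context[1:4]]  # pre, post, mid
bigrams = [context[1:3], None, context[2:4]]          # no mid bigram
print(trigrams)
# [['the', 'president', '<SUB>'], ['<SUB>', 'said', 'that'],
#  ['president', '<SUB>', 'said']]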
Code example #6
File: __init__.py Project: XI-lab/kilogram
    def _get_uri_counts_hbase(self):
        table = "wiki_anchor_ngrams"
        column = "ngram:value"
        prev_cand_string = None
        for cand_string in (self.cand_string, self.cand_string.title(), split_camel_case(self.cand_string)):
            if prev_cand_string and cand_string == prev_cand_string:
                continue
            res = NgramService.hbase_raw(table, cand_string, column)
            if res:
                candidates = ListPacker.unpack(res)
                self.cand_string = cand_string
                return [(uri, float(count)) for uri, count in candidates]
            prev_cand_string = cand_string

        res = NgramService.hbase_raw("wiki_anchor_ngrams_nospace",
                                     self.cand_string.replace(' ', '').lower(), column)
        if res:
            candidates = ListPacker.unpack(res)
            self.cand_string = self.cand_string.replace(' ', '').lower()
            return [(uri, float(count)) for uri, count in candidates]

        return None
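
A condensed sketch of the same fallback chain, with plain dicts standing in for the two HBase tables (illustrative only; the camel-case split is dropped for brevity):

def uri_counts(cand_string, table, nospace_table):
    # 1) try the string as-is, then Title Case, skipping duplicates
    prev = None
    for variant in (cand_string, cand_string.title()):
        if variant != prev and variant in table:
            return variant, table[variant]
        prev = variant
    # 2) fall back to the lower-cased, space-stripped form
    key = cand_string.replace(' ', '').lower()
    return key, nospace_table.get(key)

print(uri_counts('new york', {'New York': {'New_York': 9}}, {}))
# -> ('New York', {'New_York': 9})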
Code example #7
File: __init__.py Project: XI-lab/kilogram
    def __init__(self, candidates):
        self.G = nx.Graph()
        _candidate_filter(candidates)
        self.candidates = candidates
        neighbors = {}
        self.index_map = {}

        #self.candidate_uris1 = set()
        #for cand in candidates:
        #    self.candidate_uris1.add(cand.cand_string)

        self.candidate_uris = set()
        for cand in candidates:
            total = sum([e.count for e in cand.entities])
            for e in cand.entities:
                mention_uri = _mention_uri(e.uri, cand.cand_string)
                self.candidate_uris.add(mention_uri)
                neighbors[mention_uri] = NgramService.get_wiki_link_mention_cooccur(mention_uri)
                # delete self
                try:
                    del neighbors[mention_uri][mention_uri]
                except KeyError:
                    pass
                for neighbor, weight in neighbors[mention_uri].iteritems():
                    #if neighbor.split('|')[0] not in self.candidate_uris1:
                    #    continue
                    if self.G.has_edge(mention_uri, neighbor):
                        continue
                    try:
                        self.G.add_edge(mention_uri, neighbor, {'w': int(weight)})
                    # happens because of malformed links
                    except ValueError:
                        pass
                # always add candidates
                self.G.add_node(mention_uri, {'prior': e.count/total})

        # prune 1-degree edges except original candidates
        to_remove = set()
        for node, degree in self.G.degree_iter():
            if degree <= 1:
                to_remove.add(node)
        to_remove = to_remove.difference(self.candidate_uris)
        self.G.remove_nodes_from(to_remove)

        if self.G.number_of_nodes() > 0:
            self.matrix = nx.to_scipy_sparse_matrix(self.G, weight='w', dtype=np.float64)
            self.matrix = normalize(self.matrix, norm='l1', axis=1)
        for i, uri in enumerate(self.G.nodes()):
            self.index_map[uri] = i
Code example #8
File: __init__.py Project: dragoon/kilogram
    def _get_uri_counts_hbase(self):
        table = "wiki_anchor_ngrams"
        column = "ngram:value"
        prev_cand_string = None
        for cand_string in (self.cand_string, self.cand_string.title(),
                            split_camel_case(self.cand_string)):
            if prev_cand_string and cand_string == prev_cand_string:
                continue
            res = NgramService.hbase_raw(table, cand_string, column)
            if res:
                candidates = ListPacker.unpack(res)
                self.cand_string = cand_string
                return [(uri, float(count)) for uri, count in candidates]
            prev_cand_string = cand_string

        res = NgramService.hbase_raw("wiki_anchor_ngrams_nospace",
                                     self.cand_string.replace(' ', '').lower(),
                                     column)
        if res:
            candidates = ListPacker.unpack(res)
            self.cand_string = self.cand_string.replace(' ', '').lower()
            return [(uri, float(count)) for uri, count in candidates]

        return None
Code example #9
File: densest_subgraph.py Project: dragoon/kilogram
    def __init__(self, candidates):
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        socket.connect("ipc:///tmp/wikipedia_signatures")
        self.G = nx.DiGraph()
        self.candidates = candidates
        neighbors = {}

        for cand in candidates:
            for e in cand.entities:
                neighbors[e.uri] = NgramService.get_wiki_edge_weights(e.uri)
                # delete self
                try:
                    del neighbors[e.uri][e.uri]
                except KeyError:
                    pass

        for cand_i in candidates:
            """
            :type cand_i: CandidateEntity
            """
            for cand_j in candidates:
                # do not link same candidates
                if cand_i == cand_j:
                    continue
                # skip edges between candidates originating from the same noun
                if cand_j.noun_index is not None and cand_i.noun_index is not None \
                        and cand_j.noun_index == cand_i.noun_index:
                    continue
                for e_i in cand_i.entities:
                    for e_j in cand_j.entities:
                        if not self.G.has_edge(e_i.uri, e_j.uri):
                            weight = int(neighbors[e_i.uri].get(e_j.uri, 0))
                            if weight > 0:
                                self.G.add_edge(e_i.uri, e_j.uri, {'w': weight})
        self.uri_fragment_counts = defaultdict(lambda: 0)
        # TODO: do not prune if no nodes?
        if self.G.number_of_nodes() == 0:
            return
        for cand in candidates:
            for e in cand.entities:
                if self.G.has_node(e.uri):
                    self.uri_fragment_counts[e.uri] += 1
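
The file name suggests the graph built above feeds a densest-subgraph computation. For background, here is a minimal sketch of Charikar's greedy peeling heuristic, which such a graph could be fed into; this is textbook code, not taken from kilogram, and it treats the graph as unweighted and undirected for simplicity.

import networkx as nx

def greedy_densest_subgraph(G):
    # repeatedly peel the minimum-degree node, remembering the densest
    # intermediate subgraph (a classic 2-approximation)
    H = nx.Graph(G)
    best_nodes, best_density = list(H.nodes()), 0.0
    while H.number_of_nodes() > 0:
        density = H.number_of_edges() / float(H.number_of_nodes())
        if density > best_density:
            best_nodes, best_density = list(H.nodes()), density
        H.remove_node(min(H.nodes(), key=H.degree))
    return best_nodes, best_density

G = nx.complete_graph(4)
G.add_edge(4, 0)  # a pendant node that peeling removes first
print(greedy_densest_subgraph(G))  # -> ([0, 1, 2, 3], 1.5)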
Code example #10
File: prediction.py Project: dragoon/kilogram
    def _get_full_ngram_probs(self, context):
        # pre, post, mid bigrams
        context[2] = SUBSTITUTION_TOKEN
        trigrams = [context[:3], context[-3:], context[1:4]]
        bigrams = [context[1:3], 'NONE', context[2:4]]

        types = {2: [], 3: []}
        for bigram, trigram in zip(bigrams, trigrams):
            for ngram, order in zip((bigram, trigram), (2, 3)):
                type_values = NgramService.hbase_raw(self.hbase_table, " ".join(ngram), "ngram:value") or []
                type_values_unpacked = ListPacker.unpack(type_values)
                type_dict = {'ngram': ' '.join(ngram)}
                try:
                    total = sum(int(x) for x in zip(*type_values_unpacked)[1])
                    features = [{'name': e_type, 'count': int(count), 'prob': int(count)/total,
                                 'pmi': int(count)/total/self.type_priors[e_type]} for e_type, count in type_values_unpacked]
                    features.sort(key=lambda x: x['pmi'], reverse=True)
                    type_dict['values'] = features
                except:
                    type_dict['values'] = []
                types[order].append(type_dict)
        return types
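
To make the 'prob' and 'pmi' features above concrete, a toy calculation (type names, counts and priors are invented):

type_priors = {'Person': 0.2, 'Place': 0.1}
type_values = [('Person', '30'), ('Place', '10')]
total = float(sum(int(c) for _, c in type_values))
for e_type, count in type_values:
    prob = int(count) / total
    print('%s prob=%.2f pmi=%.2f' % (e_type, prob, prob / type_priors[e_type]))
# Person prob=0.75 pmi=3.75
# Place prob=0.25 pmi=2.50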
Code example #11
File: el_babel_tests.py Project: dragoon/kilogram
import unittest

from dataset.dbpedia import NgramEntityResolver
from dataset.entity_linking.msnbc import DataSet
from entity_linking import syntactic_subsumption, extract_candidates
from entity_linking.babelfy import link, SemanticGraph
from entity_linking.evaluation import Metrics
from kilogram import NgramService
import kilogram

NgramService.configure(hbase_host=('diufpc304', '9090'))
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver(
    "/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
    "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
msnbc_data = DataSet('../extra/data/msnbc/texts/',
                     '../extra/data/msnbc/msnbc_truth.txt', ner)


class TestEntityLinking(unittest.TestCase):
    def test_extract_candidates(self):
        self.assertIsNotNone(extract_candidates([("Obama", "NNP")]))
        self.assertEquals(
            len(
                extract_candidates([('Obama', 'NNP'), ('went', 'VBD'),
                                    ('with', 'IN'), ('me', 'PRP'),
                                    ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'),
                                    ('.', '.')])), 2)

    def test_entity_linking(self):
        print link(
            "After his departure from Buffalo, Saban returned to coach "
            "college football teams including Miami, Army and UCF.")
        print link("Barack and Michelle visited us today.")
        print link(
            "GitHub experienced a massive DDoS attack yesterday evening.")
        print link(
            "Saban, previously a head coach of NFL's Miami, is now coaching "
            "Crimson Tide. His achievements include leading LSU to the BCS "
            "National Championship once and Alabama three times.")
Code example #12
File: el_babel_tests.py Project: XI-lab/kilogram
import unittest

from dataset.dbpedia import NgramEntityResolver
from dataset.entity_linking.msnbc import DataSet
from entity_linking import syntactic_subsumption, extract_candidates
from entity_linking.babelfy import link, SemanticGraph
from entity_linking.evaluation import Metrics
from kilogram import NgramService
import kilogram

NgramService.configure(hbase_host=('diufpc304', '9090'))
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver("/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
                          "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
msnbc_data = DataSet('../extra/data/msnbc/texts/',
                        '../extra/data/msnbc/msnbc_truth.txt', ner)


class TestEntityLinking(unittest.TestCase):

    def test_extract_candidates(self):
        self.assertIsNotNone(extract_candidates([("Obama", "NNP")]))
        self.assertEquals(len(extract_candidates([('Obama', 'NNP'), ('went', 'VBD'), ('with', 'IN'), ('me', 'PRP'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'), ('.', '.')])), 2)

    def test_entity_linking(self):
        print link("After his departure from Buffalo, Saban returned to coach college football teams including Miami, Army and UCF.")
        print link("Barack and Michelle visited us today.")
        print link("GitHub experienced a massive DDoS attack yesterday evening.")
        print link("Saban, previously a head coach of NFL's Miami, is now coaching Crimson Tide. "
                   "His achievements include leading LSU to the BCS National Championship once and Alabama three times.")
Code example #13
File: el_relrw_tests.py Project: dragoon/kilogram
import unittest

from kilogram.dataset.dbpedia import NgramEntityResolver
from kilogram.dataset.entity_linking.msnbc import DataSet as MSNBC
from kilogram.dataset.entity_linking.microposts import DataSet as Tweets
from kilogram.entity_linking import syntactic_subsumption
from kilogram.entity_linking.evaluation import Metrics
from kilogram.entity_linking.rel_rw import SemanticGraph
from kilogram.entity_linking.util.ml import Feature
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
import kilogram

kilogram.DEBUG = False

NgramService.configure(hbase_host=('diufpc304', '9090'),
                       subst_table='typogram')
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver(
    "/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
    "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
ngram_predictor = NgramTypePredictor('typogram')


class TestEntityLinkingKBMSNBC(unittest.TestCase):
    msnbc_data = MSNBC('../extra/data/msnbc/texts/',
                       '../extra/data/msnbc/msnbc_truth.txt', ner)

    def test_d2kb(self):
        print 'REL-RW, D2KB'
        feature_file = open('features.txt', 'w')
        metric = Metrics()
Code example #14
    def setUp(self):
        NgramService.configure(hbase_host=('diufpc304', 9090),
                               subst_table='typogram')
        dbpedia_ontology = DBPediaOntology('fixtures/dbpedia_2015-04.owl')
        self.ngram_predictor = NgramTypePredictor('typogram', dbpedia_ontology)
Code example #15
File: mention_rw_tests.py Project: XI-lab/kilogram
import unittest

from dataset.dbpedia import NgramEntityResolver
from dataset.entity_linking.msnbc import DataSet as MSNBC
from kilogram.entity_linking import syntactic_subsumption
from kilogram.entity_linking.evaluation import Metrics
from kilogram.entity_linking.mention_rw import SemanticGraph
from kilogram.entity_linking.util.ml import Feature
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
import kilogram

kilogram.DEBUG = False

NgramService.configure(hbase_host=('diufpc304', '9090'), subst_table='typogram')
kilogram.NER_HOSTNAME = 'diufpc54.unifr.ch'
ner = NgramEntityResolver("/Users/dragoon/Downloads/dbpedia/dbpedia_data.txt",
                          "/Users/dragoon/Downloads/dbpedia/dbpedia_2015-04.owl")
ngram_predictor = NgramTypePredictor('typogram')


class TestEntityLinkingKBMSNBC(unittest.TestCase):
    msnbc_data = MSNBC('../extra/data/msnbc/texts/',
                        '../extra/data/msnbc/msnbc_truth.txt', ner)

    def test_d2kb(self):
        print 'MENTION-RW, D2KB'
        feature_file = open('features.txt', 'w')
        metric = Metrics()
        feature_file.write(Feature.header()+'\n')
        for datafile in self.msnbc_data.data:
Code example #16
    def setUp(self):
        NgramService.configure(hbase_host=('diufpc304', 9090), subst_table='typogram')
        dbpedia_ontology = DBPediaOntology('fixtures/dbpedia_2015-04.owl')
        self.ngram_predictor = NgramTypePredictor('typogram', dbpedia_ontology)
Code example #17
File: app.py Project: dragoon/kilogram
#!/usr/bin/env python
"""
./app.py typogram
"""
import sys
from flask import Flask, jsonify, request
from kilogram.dataset.dbpedia import DBPediaOntology
from kilogram.entity_types.prediction import NgramTypePredictor
from kilogram import NgramService
from kilogram.lang.unicode import strip_unicode

NgramService.configure(hbase_host=('diufpc304', 9090), subst_table=sys.argv[1])

app = Flask(__name__)
dbpedia_ontology = DBPediaOntology('dbpedia_2015-04.owl')
ngram_predictor = NgramTypePredictor(sys.argv[1], dbpedia_ontology)


@app.route('/predict/types/context', methods=['GET'])
def predict_ngram_from_context():
    context = strip_unicode(request.args.get('context').strip()).split()
    return jsonify({'types': ngram_predictor.predict_types_features(context)})


@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response


if __name__ == '__main__':
    app.run()  # assumed entry point; the original snippet is truncated here
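
A hypothetical client call against the /predict/types/context endpoint defined above, assuming the service runs on Flask's default port:

import requests

resp = requests.get('http://localhost:5000/predict/types/context',
                    params={'context': 'the president said that today'})
print(resp.json())  # -> {'types': [...]}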
Code example #18
import argparse
import os

from flask import Flask, request

import kilogram
from kilogram import NgramService
from kilogram.dataset.dbpedia import NgramEntityResolver, DBPediaOntology
from kilogram.entity_types.prediction import NgramTypePredictor
# note: the DataSet class used below is imported in the full file; its
# module is not shown in this snippet

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--dbpedia-data-dir', dest='dbpedia_data_dir', action='store', required=True,
                    help='DBpedia data directory')
parser.add_argument('--ner-host', dest='ner_host', action='store', required=True,
                    help='Hostname of the server where Stanford NER is running')
parser.add_argument('--types-table', dest='types_table', action='store', required=True,
                    help='Typed N-gram table in HBase')
parser.add_argument('--hbase-host', dest='hbase_host', action='store', required=True,
                    help='HBase gateway host')
parser.add_argument('--hbase-port', dest='hbase_port', action='store',
                    default='9090', help='HBase gateway port')


args = parser.parse_args()

NgramService.configure(hbase_host=(args.hbase_host, args.hbase_port))
kilogram.NER_HOSTNAME = args.ner_host
ner = NgramEntityResolver(os.path.join(args.dbpedia_data_dir, "dbpedia_data.txt"),
                          os.path.join(args.dbpedia_data_dir, "dbpedia_2015-04.owl"))


dbpedia_ontology = DBPediaOntology(os.path.join(args.dbpedia_data_dir, "dbpedia_2015-04.owl"))
ngram_predictor = NgramTypePredictor(args.types_table, dbpedia_ontology)

app = Flask(__name__)


@app.route('/entity-linking/d2kb/prior', methods=['POST'])
def d2kb_prior():
    result = request.get_json(force=True)
    candidates = DataSet(result['text'], result['mentions'], ner).candidates