Beispiel #1
0
class ClusterMember:
    def __init__(self, model, uri, label=None, type_=None, debug_info=None):
        """Create a lazily populated view of one cluster member.

        Only the values passed in are stored here; every other attribute
        starts empty and is filled on first access by the properties below.

        Args:
            model: Object exposing ``graph`` (read here) plus ``sparql`` and
                ``get_cluster`` (used by other methods of this class).
            uri: Member URI; wrapped in ``URIRef``.
            label: Optional pre-fetched label.
            type_: Optional pre-fetched type URI.
            debug_info: Optional mapping read by ``_init_member`` in place of
                the link-target / Freebase queries.
        """
        self.model = model
        self.uri = URIRef(uri)

        # Values the caller may already know.
        self.__label = label
        self.__type = type_
        self.__debug_info = debug_info

        # Caches filled lazily.
        self.__id = None
        self.__all_labels = None
        self.__targets = None
        self.__freebases = None
        self.__qids = None
        self.__q_labels = None
        self.__q_aliases = None
        self.__q_urls = None
        self.__source = None
        self.__context_pos = []
        self.__context_extractor = None
        self.__cluster: Cluster = None

        # Fragments that wrap a query body in a named graph when one is set.
        graph = model.graph
        self.__open_clause = 'GRAPH <%s> {' % graph if graph else ''
        self.__close_clause = '}' if graph else ''

    @property
    def id(self):
        """Return the URI with the two known entity prefixes removed (cached)."""
        if self.__id:
            return self.__id
        short = self.uri
        for prefix in ('http://www.isi.edu/gaia/entities/',
                       'http://www.columbia.edu/entities/'):
            short = short.replace(prefix, '')
        self.__id = short
        return self.__id

    @property
    def label(self):
        """Return the member label, loading it via ``_init_member`` if unset."""
        if self.__label:
            return self.__label
        self._init_member()
        return self.__label

    @property
    def all_labels(self):
        if not self.__all_labels:
            self.__all_labels = Counter()
            query = """
                SELECT ?label (COUNT(?label) AS ?n)
                WHERE {
                  ?member aida:justifiedBy/skos:prefLabel ?label .
                }
                GROUP BY ?label
                ORDER BY DESC(?n)
            """
            for label, n in self.model.sparql.query(query, namespaces,
                                                    {'member': self.uri}):
                if label:
                    label = " ".join(label.split())  # remove double spaces
                    self.__all_labels[label] = int(n)

            query = """
                SELECT ?label (COUNT(?label) AS ?n)
                    WHERE {
                      ?member aida:hasName ?label .
                    }
                    GROUP BY ?label
                    ORDER BY DESC(?n)
                """
            for label, n in self.model.sparql.query(query, namespaces,
                                                    {'member': self.uri}):
                if label:
                    label = " ".join(label.split())  # remove double spaces
                    if label in self.__all_labels:
                        self.__all_labels[label] += int(n)
                    else:
                        self.__all_labels[label] = int(n)

        return self.__all_labels.most_common()

    @property
    def type(self):
        """Return the member type, loading it via ``_init_member`` if unset."""
        if self.__type:
            return self.__type
        self._init_member()
        return self.__type

    @property
    def type_text(self):
        """Return the second element of ``split_uri(self.type)``."""
        _namespace, local_name = split_uri(self.type)
        return local_name

    @property
    def targets(self):
        """Return the target -> score dict, loading it on first access."""
        if self.__targets is not None:
            return self.__targets
        self._init_member()
        return self.__targets

    @property
    def freebases(self):
        """Return the Freebase id -> score dict, loading it on first access."""
        if self.__freebases is not None:
            return self.__freebases
        self._init_member()
        return self.__freebases

    @property
    def qids(self):
        """Return the qid -> score dict, or ``None`` if there are no Freebase ids.

        The Wikidata lookup only runs when nothing is cached yet and the
        member has at least one Freebase id.
        """
        lookup_needed = self.__qids is None and self.freebases
        if lookup_needed:
            self._init_qnode()
        return self.__qids

    @property
    def q_urls(self):
        """Return the qid -> Wikidata URL dict, or ``None`` if nothing was loaded.

        Note that the "already loaded" test looks at the qid cache, which
        ``_init_qnode`` fills together with the URL cache.
        """
        lookup_needed = self.__qids is None and self.freebases
        if lookup_needed:
            self._init_qnode()
        return self.__q_urls

    @property
    def q_labels(self):
        """Return the qid -> English label dict, or ``None`` if nothing was loaded."""
        lookup_needed = self.__q_labels is None and self.freebases
        if lookup_needed:
            self._init_qnode()
        return self.__q_labels

    @property
    def q_aliases(self):
        """Return the qid -> comma-joined aliases dict, or ``None`` if not loaded."""
        lookup_needed = self.__q_aliases is None and self.freebases
        if lookup_needed:
            self._init_qnode()
        return self.__q_aliases

    def _init_qnode(self):
        """Resolve this member's Freebase ids to Wikidata items.

        Resets the four q-node caches and then, for every Freebase id that
        does not contain ``":NIL"``, queries ``wikidata_sparql`` for the item
        carrying that id (property ``wdt:P646``), its English label and its
        English aliases.

        An alias entry is only stored when the alias query returned rows;
        previously an id without aliases produced an entry under the key
        ``None``.
        """
        self.__qids = {}  # qid to score
        self.__q_urls = {}
        self.__q_labels = {}
        self.__q_aliases = {}

        # The query texts do not depend on the id, so build them once.
        label_query = """
                    SELECT ?qid ?label WHERE {
                      ?qid wdt:P646 ?freebase .
                      ?qid rdfs:label ?label filter (lang(?label) = "en") .
                    }
                    LIMIT 1
                """
        alias_query = """
                    SELECT ?qid ?alias WHERE {
                      ?qid wdt:P646 ?freebase .
                      ?qid skos:altLabel ?alias filter (lang(?alias) = "en") .
                    }
                """

        for fbid, score in self.freebases.items():
            if ":NIL" in fbid:
                continue
            # "prefix:m.0abc" -> "/m/0abc"
            fbid = '/' + fbid[fbid.find(':') + 1:].replace('.', '/')

            for q_url, label in wikidata_sparql.query(
                    label_query, namespaces, {'freebase': Literal(fbid)}):
                qid = str(q_url).rsplit('/', 1)[1]
                self.__qids[qid] = score
                self.__q_urls[qid] = str(q_url)
                self.__q_labels[qid] = str(label)

            aliases = []
            qid = None
            for q_url, alias in wikidata_sparql.query(
                    alias_query, namespaces, {'freebase': Literal(fbid)}):
                qid = str(q_url).rsplit('/', 1)[1]
                aliases.append(str(alias))
            if qid is not None:
                self.__q_aliases[qid] = ', '.join(aliases)

    @property
    def context_extractor(self):
        """Return the cached ``LTFSourceContext`` built from ``self.source``."""
        if self.__context_extractor is not None:
            return self.__context_extractor
        self.__context_extractor = LTFSourceContext(self.source)
        return self.__context_extractor

    @property
    def roles(self):
        """Yield ``(role, ClusterMember)`` for statements whose subject is this member.

        The role is the predicate text after its first underscore (the whole
        predicate when it has none). An object without a name is labelled
        with the local part of its type URI.
        """
        query = """
        SELECT ?pred ?obj ?objtype (MIN(?objlbl) AS ?objlabel)
        WHERE {
            ?statement rdf:subject ?event ;
                       rdf:predicate ?pred ;
                       rdf:object ?obj .
            ?objstate rdf:subject ?obj ;
                      rdf:predicate rdf:type ;
                      rdf:object ?objtype .
            OPTIONAL { ?obj aida:hasName ?objlbl }
        }
        GROUP BY ?pred ?obj ?objtype
        """
        rows = self.model.sparql.query(query, namespaces, {'event': self.uri})
        for predicate, argument, argument_type, argument_label in rows:
            if not argument_label:
                _, argument_label = split_uri(argument_type)
            role = predicate[predicate.find('_') + 1:]
            yield role, ClusterMember(self.model, argument, argument_label,
                                      argument_type)

    @property
    def events_by_role(self):
        """Yield ``(role, ClusterMember)`` for events that have this member as object.

        The role is the predicate text after its first underscore. An event
        without a label is labelled with the local part of its type URI.
        """
        query = """
      SELECT ?pred ?event ?event_type (MIN(?lbl) AS ?label)
      WHERE {
          ?event a aida:Event .
          ?statement rdf:subject ?event ;
                    rdf:predicate ?pred ;
                    rdf:object ?obj .
          ?event_state rdf:subject ?event ;
                    rdf:predicate rdf:type ;
                    rdf:object ?event_type .
          OPTIONAL { ?event aida:justifiedBy/skos:prefLabel ?lbl }
      }
      GROUP BY ?pred ?event ?event_type
      """
        rows = self.model.sparql.query(query, namespaces, {'obj': self.uri})
        for predicate, event_uri, kind, caption in rows:
            if not caption:
                _, caption = split_uri(kind)
            role = predicate[predicate.find('_') + 1:]
            yield role, ClusterMember(self.model, event_uri, caption, kind)

    @property
    def entity_relations(self):
        """Yield ``(relation_type, other_argument, label)`` for this member's relations.

        For every ``aida:Relation`` that has this member as an object, each
        further statement about that relation (excluding its type statement
        and the statement that matched this member) contributes one tuple:
        the local part of the relation type, that statement's object, and
        the minimum ``aida:hasName`` of that object (may be unbound).

        The aggregation keyword is spelled ``GROUP BY``; the previous
        ``groupby`` is not valid SPARQL.
        """
        query = """
        SELECT ?relation ?pred2 ?obj2 ?relation_type (min(?lbl) as ?label)
        WHERE {
            ?relation a aida:Relation .
            ?s1 rdf:subject ?relation ;
                        rdf:predicate ?pred ;
                        rdf:object ?obj .
            ?s2 rdf:subject ?relation ;
                        rdf:predicate rdf:type ;
                        rdf:object ?relation_type .
            ?s3 rdf:subject ?relation ;
                        rdf:predicate ?pred2 ;
                        rdf:object ?obj2 .
            OPTIONAL {?obj2 aida:hasName ?lbl}
            filter(?s3 != ?s2 && ?s3 != ?s1)
        }
        GROUP BY ?relation ?pred2 ?obj2 ?relation_type
          """
        # The relation URI and the second predicate are selected but were
        # never part of the yielded tuple, so they are ignored here.
        for _relation, _pred, obj, relation_type, label in self.model.sparql.query(
                query, namespaces, {'obj': self.uri}):
            _, relation_type = split_uri(relation_type)
            yield relation_type, obj, label

    @property
    def cluster(self):
        """Return the cluster this member belongs to (cached), or ``None``.

        Clusters that list this member as their prototype are excluded by the
        query. If several rows come back, the last one is kept.
        """
        if self.__cluster is not None:
            return self.__cluster
        query = "SELECT ?cluster WHERE { %s ?membership aida:cluster ?cluster ; aida:clusterMember ?member . MINUS {?cluster aida:prototype ?member} %s}" % (
            self.__open_clause, self.__close_clause)
        rows = self.model.sparql.query(query, namespaces, {'member': self.uri})
        for (cluster_uri,) in rows:
            self.__cluster = self.model.get_cluster(cluster_uri)
        return self.__cluster

    def _init_member(self):
        """Load label, type, link targets and Freebase ids for this member.

        * Label and type come from one SPARQL row; without a label the local
          part of the type URI is used.
        * Targets and Freebase ids are taken from ``debug_info`` (with their
          scores) when it is truthy, otherwise queried from the store with a
          score of 0.

        A private-data JSON blob without a ``freebase_link`` entry is now
        skipped instead of raising ``AttributeError``.
        """
        query = """
SELECT ?label ?type
WHERE {
  OPTIONAL { ?member aida:hasName ?label }
  OPTIONAL { ?member aida:justifiedBy ?justification .
    ?justification skos:prefLabel ?label }
  ?statement rdf:subject ?member ;
             rdf:predicate rdf:type ;
             rdf:object ?type .
}
LIMIT 1 """
        for label, type_ in self.model.sparql.query(query, namespaces,
                                                    {'member': self.uri}):
            if not label:
                _, label = split_uri(type_)
            self.__label = label
            self.__type = type_

        debug = self.__debug_info

        self.__targets = {}
        if debug:
            if debug['targets']:
                # 'targets' and 'target_scores' are read as parallel lists.
                for target, score in zip(debug['targets'],
                                         debug['target_scores']):
                    self.__targets[target] = score
        else:
            query = """
                SELECT ?target
                WHERE {
                  ?member aida:link/aida:linkTarget ?target 
                } """
            for target, in self.model.sparql.query(query, namespaces,
                                                   {'member': self.uri}):
                self.__targets[str(target)] = 0

        self.__freebases = {}
        if debug:
            if debug['fbid']:
                # 'fbid' and 'fbid_score_avg' are read as parallel lists.
                for fbid, score in zip(debug['fbid'], debug['fbid_score_avg']):
                    self.__freebases[fbid] = score
        else:
            query = """
                SELECT DISTINCT ?fbid {
                   ?member aida:privateData [
                        aida:jsonContent ?fbid ;
                        aida:system <http://www.rpi.edu/EDL_Freebase>
                    ]
                }
            """

            for j_fbid, in self.model.sparql.query(query, namespaces,
                                                   {'member': self.uri}):
                # A missing 'freebase_link' key yields no ids.
                links = json.loads(j_fbid).get('freebase_link') or {}
                for fbid in links:
                    self.__freebases[fbid] = 0

    def _init_source(self):
        """Load the source id and justification offsets for this member.

        Every returned row appends one ``(start, end)`` pair to the context
        positions; the stored source is the one from the last row.
        """
        query = """
SELECT DISTINCT ?source ?start ?end
WHERE {
  ?member aida:justifiedBy ?justification .
  ?justification aida:source ?source ;
                 aida:startOffset ?start ;
                 aida:endOffsetInclusive ?end .
}
ORDER BY ?start """
        rows = self.model.sparql.query(query, namespaces, {'member': self.uri})
        for document, begin, finish in rows:
            self.__source = str(document)
            span = (int(begin), int(finish))
            self.__context_pos.append(span)

    @property
    def source(self):
        """Return the source id, loading it via ``_init_source`` if unset."""
        if self.__source:
            return self.__source
        self._init_source()
        return self.__source

    @property
    def mention(self):
        if self.context_extractor.doc_exists():
            for start, end in self.__context_pos:
                res = self.context_extractor.query_context(start, end)
                if not res:
                    continue
                yield res

    def __hash__(self):
        """Hash a member by its URI."""
        return hash(self.uri)
Beispiel #2
0
class Cluster:
    def __init__(self, model, uri):
        """Create a lazily populated view of one cluster.

        Args:
            model: Object exposing ``graph`` (read here); other methods of
                this class also use ``sparql``, ``repo`` and ``pickled``.
            uri: Cluster URI; wrapped in ``URIRef``.
        """
        self.model = model
        self.uri = URIRef(uri)

        # Prototype / member data.
        self.__prototype = None
        self.__type = None
        self.__members = []
        self.__all_labels = None

        # Neighbouring clusters.
        self.__forward = None
        self.__backward = None

        # Link targets and Freebase / Wikidata lookups.
        self.__targets = None
        self.__selected_targets = None
        self.__target_wiki = None
        self.__freebases = None
        self.__qids = Counter()
        self.__selected_qnodes = None
        self.__q_urls = {}

        # Ground truth and debug data.
        self.__groundtruth = None
        self.__debug_info = None

        # Fragments that wrap a query body in a named graph when one is set.
        graph = model.graph
        self.__open_clause = 'GRAPH <%s> {' % graph if graph else ''
        self.__close_clause = '}' if graph else ''

    @property
    def href(self):
        res = self.uri.replace('http://www.isi.edu/gaia',
                               '/cluster').replace('http://www.columbia.edu',
                                                   '/cluster')
        res = res.replace('/entities/', '/entities/' + self.model.repo + '/')
        res = res.replace('/events/', '/events/' + self.model.repo + '/')
        if self.model.graph:
            res = res + '?g=' + self.model.graph
        return res

    @property
    def label(self):
        if self.uri in self.model.pickled and 'label' in self.model.pickled[
                self.uri]:
            return self.model.pickled[self.uri]['label']
        return self.prototype.label

    @property
    def all_labels(self):
        if not self.__all_labels:
            self.__all_labels = Counter()
            for m in self.members:
                for l, c in m.all_labels:
                    if l in self.__all_labels:
                        self.__all_labels[l] += c
                    else:
                        self.__all_labels[l] = c
        return self.__all_labels.most_common()

    @property
    def prototype(self):
        """Return the prototype member, loading it on first access."""
        if self.__prototype:
            return self.__prototype
        self._init_cluster_prototype()
        return self.__prototype

    @property
    def type(self):
        if self.uri in self.model.pickled and 'type' in self.model.pickled[
                self.uri]:
            return self.model.pickled[self.uri]['type']
        if not self.__type:
            self._init_cluster_prototype()
        return self.__type

    @property
    def members(self):
        """Return the member list, loading it while it is still empty."""
        if self.__members:
            return self.__members
        self._init_cluster_members()
        return self.__members

    @property
    def targets(self):
        if self.__targets is None:
            self._init_cluster_members()
        return self.__targets.most_common()

    @property
    def selected_targets(self):
        if self.__selected_targets is None:
            self.__selected_targets = self.debug_info.selected_targets
        return self.__selected_targets

    def get_target_stats(self, target):
        return self.debug_info.target_statistics[target]

    @property
    def target_wiki(self):
        """Return the target -> Wikidata info dict, loading it on first access."""
        if self.__target_wiki is not None:
            return self.__target_wiki
        self._init_cluster_members()
        return self.__target_wiki

    @property
    def freebases(self):
        if self.__freebases is None:
            self._init_cluster_members()
        return self.__freebases.most_common()

    @property
    def targetsSize(self):
        return len(self.targets)

    @property
    def qids(self):
        if not self.__qids:
            self._init_qnodes()
        return self.__qids.most_common()

    @property
    def selected_qnodes(self):
        if not self.__selected_qnodes:
            self.__selected_qnodes = self.debug_info.selected_qnodes
        return self.__selected_qnodes

    def get_qnode_stats(self, qurl):
        if qurl in self.debug_info.qnode_statistics:
            return self.debug_info.qnode_statistics[qurl]
        else:
            return None

    @property
    def q_urls(self):
        """Return the qid -> Wikidata URL dict, loading it while still empty."""
        if self.__q_urls:
            return self.__q_urls
        self._init_qnodes()
        return self.__q_urls

    @property
    def size(self):
        if self.__members:
            return len(self.__members)
        return self._query_for_size()

    @property
    def forward(self):
        """Return the set of outgoing super-edges, loading it on first access."""
        if self.__forward is not None:
            return self.__forward
        # Assign the empty set first: the loader adds edges into it.
        self.__forward = set()
        self._init_forward_clusters()
        return self.__forward

    @property
    def backward(self):
        """Return the set of incoming super-edges, loading it on first access."""
        if self.__backward is not None:
            return self.__backward
        # Assign the empty set first: the loader adds edges into it.
        self.__backward = set()
        self._init_backward_clusters()
        return self.__backward

    @property
    def neighbors(self):
        """Return a new set holding both outgoing and incoming super-edges."""
        outgoing = self.forward
        incoming = self.backward
        return outgoing | incoming

    def neighborhood(self, hop=1):
        """Return the super-edges within ``hop`` steps of this cluster.

        With ``hop == 1`` on a non-relation cluster, the direct edges are
        extended with the edges of any neighbouring subject whose prototype
        is an ``AIDA.Relation``. With ``hop <= 1`` otherwise, only the direct
        edges are returned. For larger values the neighbourhoods of both
        endpoints of every direct edge are merged recursively.
        """
        if hop == 1 and self.prototype.type != AIDA.Relation:
            reachable = self.neighbors
            for edge in self.neighbors:
                if edge.subject.prototype.type == AIDA.Relation:
                    reachable |= edge.subject.neighbors
            return reachable
        if hop <= 1:
            return self.neighbors
        reachable = set()
        for edge in self.neighbors:
            for endpoint in (edge.subject, edge.object):
                reachable |= endpoint.neighborhood(hop - 1)
        return reachable

    @property
    def img(self):
        """Return the image name for this cluster's neighbourhood graph.

        If ``static/img/<name>.svg`` already exists the URI's local name is
        returned directly; otherwise a ``SuperEdgeBasedGraph`` is built,
        its ``dot()`` method is called, and the graph's ``name`` is returned.
        """
        import os.path
        _, name = split_uri(self.uri)
        if os.path.isfile('static/img/' + name + '.svg'):
            return name

        from graph import SuperEdgeBasedGraph
        rendered = SuperEdgeBasedGraph(self.model, self.neighborhood(), self,
                                       self.uri)
        # The return value of dot() is not needed here.
        rendered.dot()
        return rendered.name

    @classmethod
    def ask(cls, sparql, graph, uri):
        """Return the first ASK result for "``uri`` is an ``aida:SameAsCluster``".

        The pattern is wrapped in ``GRAPH <graph> { ... }`` when ``graph`` is
        truthy. ``False`` is returned when the query yields nothing.
        """
        wrapper = ('GRAPH <%s> {' % graph, '}') if graph else ('', '')
        query = "ASK { %s ?cluster a aida:SameAsCluster %s}" % wrapper
        for answer in sparql.query(query, namespaces, {'cluster': URIRef(uri)}):
            return answer
        return False

    @property
    def groundtruth(self):
        """Return the ground-truth comparison, loading it on first access."""
        if self.__groundtruth is not None:
            return self.__groundtruth
        self._init_groundtruth()
        return self.__groundtruth

    @property
    def has_debug(self):
        """Return ``debug.has_debug`` for this model's repo and graph."""
        model = self.model
        return debug.has_debug(model.repo, model.graph)

    @property
    def debug_info(self):
        """Return the debug data for this cluster, or ``False`` when unavailable.

        ``None`` marks "not looked up yet"; ``False`` is stored when the repo
        and graph have no debug data.
        """
        if self.__debug_info is not None:
            return self.__debug_info
        if debug.has_debug(self.model.repo, self.model.graph):
            self._init_debug_info()
        else:
            self.__debug_info = False
        return self.__debug_info

    def _init_cluster_prototype(self):
        """Load the prototype member and the cluster type.

        The prototype's label falls back to the local part of its category
        URI when it has no name. The cluster type is set to the category.
        If several rows come back, the last one wins.
        """
        query = """
SELECT ?prototype (MIN(?label) AS ?mlabel) ?type ?category
WHERE {
    %s
    ?cluster aida:prototype ?prototype .
    ?prototype a ?type .
    OPTIONAL { ?prototype aida:hasName ?label } .
    OPTIONAL { ?statement a rdf:Statement ;
               rdf:subject ?prototype ;
               rdf:predicate rdf:type ;
               rdf:object ?category ; }
    %s
}
GROUP BY ?prototype ?type ?category """ % (self.__open_clause,
                                           self.__close_clause)
        rows = self.model.sparql.query(query, namespaces,
                                       {'cluster': self.uri})
        for proto_uri, name, rdf_type, category in rows:
            if category and not name:
                _, name = split_uri(category)
            self.__prototype = ClusterMember(self.model, proto_uri, name,
                                             rdf_type)
            self.__type = category

    def _init_cluster_members(self):
        """Load the members and aggregate their targets and Freebase ids.

        For each non-prototype member a ``ClusterMember`` is created and
        appended; each of its targets and Freebase ids adds one to the
        matching counter. Every distinct target is then looked up on Wikidata
        by the text after its first colon (matched against ``wdt:P1566``).

        Changes from the previous version:

        * ``self.debug_info`` is ``False`` when no debug data exists; members
          are then created with ``debug_info=None`` instead of failing with
          ``AttributeError``.
        * A target without a colon is looked up as a whole instead of raising
          ``ValueError``.
        """
        self.__targets = Counter()
        self.__target_wiki = {}
        self.__freebases = Counter()
        query = """
SELECT ?member (MIN(?label) AS ?mlabel) ?type
WHERE {
    %s
    ?membership aida:cluster ?cluster ;
                aida:clusterMember ?member .
    MINUS {?cluster aida:prototype ?member}
    %s
    OPTIONAL { ?member aida:hasName ?label } .
    OPTIONAL {?statement a rdf:Statement ;
              rdf:subject ?member ;
              rdf:predicate rdf:type ;
              rdf:object ?type }.
     
}
GROUP BY ?member ?type """ % (self.__open_clause, self.__close_clause)
        debug_data = self.debug_info  # False when no debug data is available
        for member, label, type_ in self.model.sparql.query(
                query, namespaces, {'cluster': self.uri}):
            member_uri = str(member)
            # NOTE(review): assumes every member has an entry in the debug
            # data when it is present - confirm against DebugInfo.
            raw_debug = (debug_data.members[member_uri]['raw_object']
                         if debug_data else None)
            m = ClusterMember(
                model=self.model,
                uri=member_uri,
                label=label,
                type_=type_,
                debug_info=raw_debug)
            self.__members.append(m)
            for target in m.targets:
                self.__targets[target] += 1
            for freebase in m.freebases:
                self.__freebases[freebase] += 1

        query = '''
SELECT ?qnode ?qnodeLabel 
WHERE 
{
    ?qnode wdt:P1566 ?target .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} '''
        for target in self.__targets:
            # find() returns -1 when there is no colon, so the whole string
            # is used in that case.
            target_t = target[target.find(':') + 1:]
            for qnode, qnodeLabel in wikidata_sparql.query(
                    query, namespaces, {'target': Literal(target_t)}):
                url = str(qnode)
                self.__target_wiki[target] = {
                    'qnode': url[url.rfind('/') + 1:],
                    'url': url,
                    'label': str(qnodeLabel),
                }

    def _init_qnodes(self):
        for fbid, count in self.freebases:
            if ":NIL" not in fbid:
                fbid = '/' + fbid.replace('.', '/')
                query = """
                    SELECT ?qid ?label WHERE {
                      ?qid wdt:P646 ?freebase .  
                      ?qid rdfs:label ?label filter (lang(?label) = "en") .
                    }
                    LIMIT 1
                """
                for qid, label in wikidata_sparql.query(
                        query, namespaces, {'freebase': Literal(fbid)}):
                    qnodeURL = str(qid)
                    qid = qnodeURL.rsplit('/', 1)[1]
                    self.__qids[qid] = count
                    if qid not in self.__q_urls:
                        self.__q_urls[qid] = qnodeURL

    def _init_groundtruth(self):
        """Compare this cluster's members with the ground-truth service.

        Members are sent to the service one at a time until one returns a
        non-empty list, which becomes the ground-truth set. The result is a
        ``Groundtruth`` built from that set, the members found in it, the
        members not found in it, and a mapping from each ground-truth member
        missing here to the cluster it currently belongs to. ``False`` is
        stored when the service answers 404 or no member yields data.

        The request parameters are passed via ``params=`` so that member and
        graph URIs are URL-encoded; previously they were concatenated into
        the URL unescaped.
        """
        # query to find cluster of the missing member
        query = '''
            SELECT ?cluster 
            WHERE {
                %s
                ?membership aida:cluster ?cluster ;
                aida:clusterMember ?member .
                %s
            }
        ''' % (self.__open_clause, self.__close_clause)

        member_set = {str(m.uri) for m in self.members}
        gt_set = set()
        endpoint = groundtruth_url + '/' + self.model.repo
        for m in member_set:
            params = {'g': self.model.graph} if self.model.graph else {}
            params['e'] = m
            res = requests.get(endpoint, params=params)
            if res.status_code == 404:
                self.__groundtruth = False
                return
            payload = res.json()
            if len(payload) > 0:
                gt_set = set(payload)
                break

        if not gt_set:
            self.__groundtruth = False
            return

        hit = member_set.intersection(gt_set)
        miss = member_set.difference(gt_set)
        missing = gt_set.difference(member_set)

        # Cluster (short id) currently holding each missing member.
        missing_dict = {}
        for m in missing:
            for c, in self.model.sparql.query(query, namespaces,
                                              {'member': URIRef(m)}):
                missing_dict[m] = str(c).replace(
                    'http://www.isi.edu/gaia/entities/', '')

        self.__groundtruth = Groundtruth(gt_set, hit, miss, missing_dict)

    def _init_debug_info(self):
        """Fetch this cluster's debug data; store ``False`` when there is none."""
        raw = debug.get_debug_for_cluster(self.model.repo, self.model.graph,
                                          str(self.uri))
        self.__debug_info = DebugInfo(raw) if raw else False

    def _init_forward_clusters(self):
        """Add a ``SuperEdge`` for every statement from this prototype to another.

        The edge count is the value the query derives from the statement
        confidence, converted to ``int``.
        """
        query = """
SELECT ?p ?o ?cnt
WHERE {
    %s
  ?s aida:prototype ?proto1 .
  ?o aida:prototype ?proto2 .
  ?se rdf:subject ?proto1 ;
      rdf:predicate ?p ;
      rdf:object ?proto2 ;
      aida:confidence/aida:confidenceValue ?conf .
  BIND(ROUND(1/(2*(1-?conf))) as ?cnt)
  %s
} """ % (self.__open_clause, self.__close_clause)
        rows = self.model.sparql.query(query, namespaces, {'s': self.uri})
        for predicate, target, weight in rows:
            edge = SuperEdge(self, Cluster(self.model, target), predicate,
                             int(float(str(weight))))
            self.__forward.add(edge)

    def _init_backward_clusters(self):
        """Add a ``SuperEdge`` for every statement from another prototype to this one.

        The edge count is the value the query derives from the statement
        confidence, converted to ``int``.
        """
        query = """
SELECT ?s ?p ?cnt
WHERE {
    %s
  ?s aida:prototype ?proto1 .
  ?o aida:prototype ?proto2 .
  ?se rdf:subject ?proto1 ;
      rdf:predicate ?p ;
      rdf:object ?proto2 ;
      aida:confidence/aida:confidenceValue ?conf .
  BIND(ROUND(1/(2*(1-?conf))) as ?cnt)
    %s
} """ % (self.__open_clause, self.__close_clause)
        rows = self.model.sparql.query(query, namespaces, {'o': self.uri})
        for origin, predicate, weight in rows:
            edge = SuperEdge(Cluster(self.model, origin), self, predicate,
                             int(float(str(weight))))
            self.__backward.add(edge)

    def _query_for_size(self):
        if self.uri in self.model.pickled and 'size' in self.model.pickled[
                self.uri]:
            return self.model.pickled[self.uri]['size']
        query = """
SELECT (COUNT(?member) AS ?size)
WHERE {
    %s
    ?membership aida:cluster ?cluster ;
                aida:clusterMember ?member .
    MINUS {?cluster aida:prototype ?member}
    %s
}  """ % (self.__open_clause, self.__close_clause)
        for size, in self.model.sparql.query(query, namespaces,
                                             {'cluster': self.uri}):
            return int(size)
        return 0

    def __hash__(self):
        """Hash a cluster by its URI."""
        return hash(self.uri)

    def __eq__(self, other):
        """Two clusters are equal when their URIs have the same string form."""
        if not isinstance(other, Cluster):
            return False
        return str(self.uri) == str(other.uri)
Beispiel #3
0
class ClusterMember:
    def __init__(self, uri, label=None, type_=None, target=None):
        """Create a lazily populated view of one cluster member.

        Args:
            uri: Member URI; wrapped in ``URIRef``.
            label: Optional pre-fetched label.
            type_: Optional pre-fetched type URI.
            target: Optional pre-fetched link target.
        """
        self.uri = URIRef(uri)

        # Values the caller may already know.
        self.__label = label
        self.__type = type_
        self.__target = target

        # Caches filled lazily.
        self.__all_labels = None
        self.__qid = None
        self.__qLabel = None
        self.__qAliases = None
        self.__qURL = None
        self.__source = None
        self.__context_pos = []
        self.__context_extractor = None
        self.__cluster: Cluster = None

    @property
    def label(self):
        """Return the member label, loading it via ``_init_member`` if unset."""
        if self.__label:
            return self.__label
        self._init_member()
        return self.__label

    @property
    def all_labels(self):
        """Return the member's labels as one ``"label(xN), ..."`` string.

        Labels come from the justification ``skos:prefLabel`` values, most
        frequent first. The result is cached; the cache test is ``is None``
        so that an empty result is cached as well instead of re-running the
        query on every access.
        """
        if self.__all_labels is None:
            query = """
                SELECT ?label (COUNT(?label) AS ?n)
                WHERE {
                  ?member aida:justifiedBy/skos:prefLabel ?label .
                }
                GROUP BY ?label
                ORDER BY DESC(?n)
            """
            self.__all_labels = ", ".join(
                '{}(x{})'.format(label, n)
                for label, n in sparql.query(query, namespaces,
                                             {'member': self.uri}))

        return self.__all_labels

    @property
    def type(self):
        """Return the member type, loading it via ``_init_member`` if unset."""
        if self.__type:
            return self.__type
        self._init_member()
        return self.__type

    @property
    def type_text(self):
        """Return the second element of ``split_uri(self.type)``."""
        _namespace, local_name = split_uri(self.type)
        return local_name

    @property
    def target(self):
        """Return the link target, loading it while it is still ``None``."""
        if self.__target is not None:
            return self.__target
        self._init_member()
        return self.__target

    @property
    def qid(self):
        """Return the Wikidata id; looked up only when unset and a target exists."""
        lookup_needed = self.__qid is None and self.target
        if lookup_needed:
            self._init_qNode()
        return self.__qid

    @property
    def qLabel(self):
        """Return the Wikidata label; looked up only when unset and a target exists."""
        lookup_needed = self.__qLabel is None and self.target
        if lookup_needed:
            self._init_qNode()
        return self.__qLabel

    @property
    def qAliases(self):
        """Return the Wikidata aliases; looked up only when unset and a target exists."""
        lookup_needed = self.__qAliases is None and self.target
        if lookup_needed:
            self._init_qNode()
        return self.__qAliases

    def _init_qNode(self):
        """Resolve the link target to a Wikidata item, label and aliases.

        The three caches are first set to ``False`` ("looked up, nothing
        found"). Nothing else happens when there is no target or it contains
        ``":NIL"``. Otherwise the target is turned into a Freebase id
        (``"prefix:m.0abc"`` -> ``"/m/0abc"``) and matched against
        ``wdt:P646``.
        """
        link = self.target
        self.__qid = False
        self.__qLabel = False
        self.__qAliases = False

        if not link or ":NIL" in link:
            return

        freebase_id = '/' + link[link.find(':')+1:].replace('.', '/')

        query = """
                SELECT ?qid ?label WHERE {
                  ?qid wdt:P646 ?freebase .
                  ?qid rdfs:label ?label filter (lang(?label) = "en") .
                }
                LIMIT 1
            """
        for item, label in wikidata_sparql.query(query, namespaces, {'freebase': Literal(freebase_id)}):
            self.__qURL = str(item)
            self.__qid = self.__qURL.rsplit('/', 1)[1]
            self.__qLabel = label

        query = """
                SELECT ?qid ?alias WHERE {
                  ?qid wdt:P646 ?freebase .
                  ?qid skos:altLabel ?alias filter (lang(?alias) = "en") .
                }
            """
        rows = wikidata_sparql.query(query, namespaces, {'freebase': Literal(freebase_id)})
        self.__qAliases = ', '.join([str(alias) for _item, alias in rows])

    @property
    def context_extractor(self):
        """Return the cached ``SourceContext`` built from ``self.source``."""
        if self.__context_extractor is not None:
            return self.__context_extractor
        self.__context_extractor = SourceContext(self.source)
        return self.__context_extractor

    @property
    def roles(self):
        """Yield ``(role, ClusterMember)`` for statements whose subject is this member.

        The role is the predicate text after its first underscore (the whole
        predicate when it has none). An object without a name is labelled
        with the local part of its type URI.
        """
        query = """
        SELECT ?pred ?obj ?objtype (MIN(?objlbl) AS ?objlabel)
        WHERE {
            ?statement rdf:subject ?event ;
                       rdf:predicate ?pred ;
                       rdf:object ?obj .
            ?objstate rdf:subject ?obj ;
                      rdf:predicate rdf:type ;
                      rdf:object ?objtype .
            OPTIONAL { ?obj aida:hasName ?objlbl }
        }
        GROUP BY ?pred ?obj ?objtype
        """
        rows = sparql.query(query, namespaces, {'event': self.uri})
        for predicate, argument, argument_type, argument_label in rows:
            if not argument_label:
                _, argument_label = split_uri(argument_type)
            role = predicate[predicate.find('_') + 1:]
            yield role, ClusterMember(argument, argument_label, argument_type)

    @property
    def events_by_role(self):
        """Yield ``(role, ClusterMember)`` for events that have this member as object.

        The role is the predicate text after its first underscore. An event
        without a label is labelled with the local part of its type URI.
        """
        query = """
      SELECT ?pred ?event ?event_type (MIN(?lbl) AS ?label)
      WHERE {
          ?event a aida:Event .
          ?statement rdf:subject ?event ;
                    rdf:predicate ?pred ;
                    rdf:object ?obj .
          ?event_state rdf:subject ?event ;
                    rdf:predicate rdf:type ;
                    rdf:object ?event_type .
          OPTIONAL { ?event aida:justifiedBy/skos:prefLabel ?lbl }
      }
      GROUP BY ?pred ?event ?event_type
      """
        rows = sparql.query(query, namespaces, {'obj': self.uri})
        for predicate, event_uri, kind, caption in rows:
            if not caption:
                _, caption = split_uri(kind)
            role = predicate[predicate.find('_') + 1:]
            yield role, ClusterMember(event_uri, caption, kind)

    @property
    def cluster(self):
        """Return the cluster this member belongs to (cached), or ``None``.

        If several membership rows come back, the last one is kept.
        """
        if self.__cluster is not None:
            return self.__cluster
        query = "SELECT ?cluster WHERE { ?membership aida:cluster ?cluster ; aida:clusterMember ?member . }"
        for (cluster_uri,) in sparql.query(query, namespaces, {'member': self.uri}):
            self.__cluster = get_cluster(cluster_uri)
        return self.__cluster

    def _init_member(self):
        """Load label, type and link target for this member from one SPARQL row.

        Without a label, the local part of the type URI is used. A missing
        target is stored as ``False`` so it is not looked up again.

        The link-target pattern is anchored on ``?member``. It previously
        used the unbound variable ``?obj`` and could therefore pick up the
        link target of any node in the store.
        """
        query = """
SELECT ?label ?type ?target
WHERE {
  OPTIONAL { ?member aida:hasName ?label }
  OPTIONAL { ?member aida:justifiedBy ?justification .
    ?justification skos:prefLabel ?label }
  OPTIONAL { ?member aida:link/aida:linkTarget ?target }
  ?statement rdf:subject ?member ;
             rdf:predicate rdf:type ;
             rdf:object ?type .
}
LIMIT 1 """
        for label, type_, target in sparql.query(query, namespaces, {'member': self.uri}):
            if not label:
                _, label = split_uri(type_)
            self.__label = label
            self.__type = type_
            self.__target = target if target else False

    def _init_source(self):
        """Load the source id and justification offsets for this member.

        Every returned row appends one ``(start, end)`` pair to the context
        positions; the stored source is the one from the last row.
        """
        query = """
SELECT DISTINCT ?source ?start ?end
WHERE {
  ?member aida:justifiedBy ?justification .
  ?justification aida:source ?source ;
                 aida:startOffset ?start ;
                 aida:endOffsetInclusive ?end .
}
ORDER BY ?start """
        rows = sparql.query(query, namespaces, {'member': self.uri})
        for document, begin, finish in rows:
            self.__source = str(document)
            span = (int(begin), int(finish))
            self.__context_pos.append(span)

    @property
    def source(self):
        """Return the source id, loading it via ``_init_source`` if unset."""
        if self.__source:
            return self.__source
        self._init_source()
        return self.__source

    @property
    def mention(self):
        if self.context_extractor.doc_exists():
            for start, end in self.__context_pos:
                res = self.context_extractor.query_context(start, end)
                if not res:
                    continue
                yield res

    def __hash__(self):
        """Hash a member by its URI."""
        return hash(self.uri)
Beispiel #4
0
class Cluster:
    """A cluster node identified by its URI, populated lazily via SPARQL.

    All expensive attributes (prototype, members, link targets, Wikidata
    nodes, neighbouring edges) are fetched on first access and cached on the
    instance.  Two clusters compare equal when their URIs are equal.
    """

    def __init__(self, uri):
        """Create a cluster wrapper for ``uri``; no query is issued here."""
        self.uri = URIRef(uri)
        self.__prototype = None
        self.__type = None
        self.__members = []
        # Set once the member query has run.  Needed because "no members" and
        # "no targets" are legitimate results that must not trigger a reload.
        self.__members_loaded = False
        self.__forward = None
        self.__backward = None
        self.__targets = Counter()
        self.__qnodes = Counter()
        self.__qnodesURL = {}
        # Set once the Wikidata lookups have run (see _init_qnodes).
        self.__qnodes_loaded = False

    @property
    def href(self):
        """Return the URI with the known host prefixes replaced by ``/cluster``."""
        return self.uri.replace('http://www.isi.edu/gaia', '/cluster').replace('http://www.columbia.edu', '/cluster')

    @property
    def label(self):
        """Return the pickled label if one exists, else the prototype's label."""
        if self.uri in pickled and 'label' in pickled[self.uri]:
            return pickled[self.uri]['label']
        return self.prototype.label

    @property
    def prototype(self):
        """Return the prototype member, querying for it while none is cached."""
        if not self.__prototype:
            self._init_cluster_prototype()
        return self.__prototype

    @property
    def type(self):
        """Return the pickled type if one exists, else the queried category."""
        if self.uri in pickled and 'type' in pickled[self.uri]:
            return pickled[self.uri]['type']
        if not self.__type:
            self._init_cluster_prototype()
        return self.__type

    @property
    def members(self):
        """Return the list of cluster members, loading it on first access."""
        if not self.__members_loaded:
            self._init_cluster_members()
        return self.__members

    @property
    def targets(self):
        """Return ``(target, count)`` pairs, most frequent first.

        The counts are collected by the member query, which is run at most
        once even when no member carries a link target.
        """
        if not self.__members_loaded:
            self._init_cluster_members()
        return self.__targets.most_common()

    @property
    def targetsSize(self):
        """Return the number of distinct link targets."""
        return len(self.targets)

    @property
    def qnodes(self):
        """Return ``(qid, count)`` pairs, highest count first."""
        if not self.__qnodes_loaded:
            self._init_qnodes()
        return self.__qnodes.most_common()

    @property
    def qnodesURL(self):
        """Return the mapping from qid to the full node URL."""
        if not self.__qnodes_loaded:
            self._init_qnodes()
        return self.__qnodesURL

    @property
    def size(self):
        """Return the member count.

        Uses the loaded member list when it is non-empty; otherwise falls back
        to the pickled size or a COUNT query (see ``_query_for_size``).
        """
        if self.__members:
            return len(self.__members)
        return self._query_for_size()

    @property
    def forward(self):
        """Return the set of edges whose subject is this cluster."""
        if self.__forward is None:
            self.__forward = set()
            self._init_forward_clusters()
        return self.__forward

    @property
    def backward(self):
        """Return the set of edges whose object is this cluster."""
        if self.__backward is None:
            self.__backward = set()
            self._init_backward_clusters()
        return self.__backward

    @property
    def neighbors(self):
        """Return a new set holding both forward and backward edges."""
        return self.forward | self.backward

    def neighborhood(self, hop=1):
        """Return the set of edges within ``hop`` steps of this cluster.

        For ``hop == 1`` on a cluster whose prototype is not an
        ``AIDA.Relation``, the direct edges are extended with the edges of any
        neighbouring subject cluster whose prototype is a relation.  For any
        other ``hop <= 1`` only the direct edges are returned.  For larger
        values the neighbourhoods of both ends of every direct edge are
        collected recursively with ``hop - 1``.
        """
        if hop == 1 and self.prototype.type != AIDA.Relation:
            # ``neighbors`` builds a fresh set on every access, so extending
            # ``hood`` in place neither touches the cached edge sets nor the
            # set being iterated below.
            hood = self.neighbors
            for neighbor in self.neighbors:
                if neighbor.subject.prototype.type == AIDA.Relation:
                    hood |= neighbor.subject.neighbors
            return hood
        if hop <= 1:
            return self.neighbors
        hood = set()
        for neighbor in self.neighbors:
            hood |= neighbor.subject.neighborhood(hop-1)
            hood |= neighbor.object.neighborhood(hop-1)
        return hood

    @property
    def img(self):
        """Return the image name for this cluster, rendering it if needed.

        When ``static/img/<name>.svg`` already exists the local name of the
        URI is returned directly.  Otherwise a ``SuperEdgeBasedGraph`` of the
        one-hop neighbourhood is built, its ``dot()`` method is called, and
        the graph's name is returned.
        """
        import os.path
        _, name = split_uri(self.uri)
        svgpath = 'static/img/' + name + '.svg'
        if os.path.isfile(svgpath):
            return name

        from graph import SuperEdgeBasedGraph
        graph = SuperEdgeBasedGraph(self.neighborhood(), self, self.uri)
        # Called for its effect only; the returned value is not needed here.
        graph.dot()
        return graph.name

    @classmethod
    def ask(cls, uri):
        """Return the answer of an ASK query for ``uri`` being a SameAsCluster.

        Returns ``False`` when the query produces no answer at all.
        """
        query = "ASK { ?cluster a aida:SameAsCluster }"
        for ans in sparql.query(query, namespaces, {'cluster': URIRef(uri)}):
            return ans
        return False

    def _init_cluster_prototype(self):
        """Query the prototype of this cluster and cache it with its category.

        When the prototype has no name, the local part of the category URI is
        used as its label.  If several rows are returned the last one wins.
        """
        query = """
SELECT ?prototype (MIN(?label) AS ?mlabel) ?type ?category
WHERE {
    ?cluster aida:prototype ?prototype .
    ?prototype a ?type .
    OPTIONAL { ?prototype aida:hasName ?label } .
    ?statement a rdf:Statement ;
               rdf:subject ?prototype ;
               rdf:predicate rdf:type ;
               rdf:object ?category ;
}
GROUP BY ?prototype ?type ?category """
        for prototype, label, type_, cate in sparql.query(query, namespaces, {'cluster': self.uri}):
            if not label:
                _, label = split_uri(cate)
            self.__prototype = ClusterMember(prototype, label, type_)
            self.__type = cate

    def _init_cluster_members(self):
        """Query all members of this cluster and tally their link targets.

        The member list and the target counter are rebuilt from scratch and
        swapped in at the end, so calling this method again can never
        duplicate members or inflate the counts.
        """
        query = """
SELECT ?member (MIN(?label) AS ?mlabel) ?type ?target
WHERE {
  ?membership aida:cluster ?cluster ;
              aida:clusterMember ?member .
  OPTIONAL { ?member aida:hasName ?label } .
  OPTIONAL { ?member aida:link/aida:linkTarget ?target } .
  ?statement a rdf:Statement ;
             rdf:subject ?member ;
             rdf:predicate rdf:type ;
             rdf:object ?type .
}
GROUP BY ?member ?type ?target """
        members = []
        targets = Counter()
        for member, label, type_, target in sparql.query(query, namespaces, {'cluster': self.uri}):
            members.append(ClusterMember(member, label, type_, target))
            if target:
                targets[str(target)] += 1
        self.__members = members
        self.__targets = targets
        self.__members_loaded = True

    def _init_qnodes(self):
        """Resolve link targets to Wikidata nodes through their Freebase ids.

        Targets containing ``:NIL`` are skipped.  For the others, the part
        after the first ``:`` is turned into a Freebase id (``.`` becomes
        ``/`` and a leading ``/`` is added) and looked up via ``wdt:P646``.
        The qid is the last path segment of the returned node URL; it
        receives the count of the target it was found for.
        """
        query = """
                    SELECT ?qid ?label WHERE {
                      ?qid wdt:P646 ?freebase .
                      ?qid rdfs:label ?label filter (lang(?label) = "en") .
                    }
                    LIMIT 1
                """
        for target, count in self.targets:
            if ":NIL" in target:
                continue
            fbid = '/' + target[target.find(':')+1:].replace('.', '/')
            for qid, _label in wikidata_sparql.query(query, namespaces, {'freebase': Literal(fbid)}):
                qnodeURL = str(qid)
                qid = qnodeURL.rsplit('/', 1)[1]
                self.__qnodes[qid] = count
                if qid not in self.__qnodesURL:
                    self.__qnodesURL[qid] = qnodeURL
        # Remember that the lookups ran, even if they produced nothing, so the
        # remote queries are not repeated on every property access.
        self.__qnodes_loaded = True

    def _init_forward_clusters(self):
        """Add an edge for every statement from this cluster's prototype.

        ``?s`` is mapped to this cluster; each row yields a ``SuperEdge`` from
        this cluster to ``Cluster(o)`` with predicate ``p`` and the integer
        count derived from the confidence value in the query.
        """
        query = """
SELECT ?p ?o ?cnt
WHERE {
  ?s aida:prototype ?proto1 .
  ?o aida:prototype ?proto2 .
  ?se rdf:subject ?proto1 ;
      rdf:predicate ?p ;
      rdf:object ?proto2 ;
      aida:confidence/aida:confidenceValue ?conf .
  BIND(ROUND(1/(2*(1-?conf))) as ?cnt)
} """
        for p, o, cnt in sparql.query(query, namespaces, {'s': self.uri}):
            self.__forward.add(SuperEdge(self, Cluster(o), p, int(cnt)))

    def _init_backward_clusters(self):
        """Add an edge for every statement pointing at this cluster's prototype.

        ``?o`` is mapped to this cluster; each row yields a ``SuperEdge`` from
        ``Cluster(s)`` to this cluster with predicate ``p`` and the integer
        count derived from the confidence value in the query.
        """
        query = """
SELECT ?s ?p ?cnt
WHERE {
  ?s aida:prototype ?proto1 .
  ?o aida:prototype ?proto2 .
  ?se rdf:subject ?proto1 ;
      rdf:predicate ?p ;
      rdf:object ?proto2 ;
      aida:confidence/aida:confidenceValue ?conf .
  BIND(ROUND(1/(2*(1-?conf))) as ?cnt)
} """
        for s, p, cnt in sparql.query(query, namespaces, {'o': self.uri}):
            self.__backward.add(SuperEdge(Cluster(s), self, p, int(cnt)))

    def _query_for_size(self):
        """Return the member count from the pickled data or a COUNT query.

        Returns ``0`` when the query yields no row.
        """
        if self.uri in pickled and 'size' in pickled[self.uri]:
            return pickled[self.uri]['size']
        query = """
SELECT (COUNT(?member) AS ?size)
WHERE {
    ?membership aida:cluster ?cluster ;
                aida:clusterMember ?member .
}  """
        for size, in sparql.query(query, namespaces, {'cluster': self.uri}):
            return int(size)
        return 0

    def __hash__(self):
        """Hash by URI so that equal clusters share a hash."""
        return self.uri.__hash__()

    def __eq__(self, other):
        """Return True when ``other`` is a Cluster with the same URI string."""
        return isinstance(other, Cluster) and str(self.uri) == str(other.uri)
Beispiel #5
0
 def __hash__(self):
     """Return the hash that ``URIRef.__hash__`` computes for this object."""
     # Calls URIRef's implementation explicitly rather than going through
     # normal method resolution on this object's own class.
     return URIRef.__hash__(self)