def profile_people():
    '''Generate a list of :class:`RdfPerson` with profiles on the site.

    Queries the RDF dataset for all schema.org Person resources whose
    URI starts with the current configured site's domain (i.e., people
    with locally-generated profile URIs), ordered by family name.

    :returns: list of :class:`RdfPerson`
    '''
    g = rdf_data()
    start = time.time()
    # the current site domain is used to restrict results to local profile URIs
    current_site = Site.objects.get(id=settings.SITE_ID)
    res = g.query('''
        PREFIX schema: <%(schema)s>
        PREFIX rdf: <%(rdf)s>
        SELECT DISTINCT ?person
        WHERE {
            ?person rdf:type schema:Person .
            ?person schema:familyName ?name .
            FILTER regex(str(?person), "^http://%(site)s")
        } ORDER BY ?name
        ''' % {'schema': rdfns.SCHEMA_ORG, 'rdf': rdflib.RDF,
               'site': current_site.domain}
    )
    # FIXME: should be possible to filter at this level
    # on presence of a dbpedia description or a local schema description
    # but can't get the query to work...
    logger.debug('Found %d people in %.02f sec' % (len(res),
                 time.time() - start))
    # people = [RdfPerson(g.get_context(r['person']), r['person']) for r in res]
    people = [RdfPerson(g, r['person']) for r in res]
    return people
def handle(self, *args, **options): graph = rdf_data() verbosity = options.get('verbosity', self.v_normal) if verbosity >= self.v_normal: print >> sys.stderr, "Exporting %d triples" % len(graph) print graph.serialize(format=options['format'])
def handle(self, filename, *args, **options):
    '''Load triples from *filename* into the RDF dataset and report how
    many triples were added, unless verbosity is lowered.'''
    verbosity = options.get('verbosity', self.v_normal)
    g = rdf_data()
    # snapshot the size beforehand so we can report only the newly added triples
    initial_size = len(g)
    g.parse(filename, format=options['format'])
    if verbosity >= self.v_normal:
        print >> sys.stderr, "Loaded %d triples" % (len(g) - initial_size)
def egograph_node_info(request, id): """HTML snippet to provide information about a node in the egograph. Intended to be loaded and displayed via AJAX. Some overlap with :meth:`belfast.network.views.node_info`. """ # id is the person to whom this node is connected uri = local_uri(reverse("people:profile", args=[id]), request) g = rdf_data() ego_person = RdfPerson(g, rdflib.URIRef(uri)) # NOTE: some overlap here with networks node_info view # id param is the node we want information node_id = request.GET.get("id", None) if node_id is None: raise Http404 node_uri = rdflib.URIRef(node_id) # TODO: better to get relations from gexf or rdf ? graph = gexf.read_gexf(settings.GEXF_DATA["full"]) node = graph.node[node_id] context = {"node": node} if node.get("type", None) == "Person": # init rdf person person = RdfPerson(rdf_data(), rdflib.URIRef(node_id)) context["person"] = person # determine relation between node and ego-center rels = set(g.predicates(ego_person.identifier, node_uri)) # TODO: may want to display other relationships? # special case: if "mentions", should be a poem; find for display/link if rdfns.SCHEMA_ORG.mentions in rels: txts = set(g.subjects(rdfns.SCHEMA_ORG.mentions, node_uri)) - set([ego_person.identifier]) if txts: poems = [RdfPoem(g, p) for p in txts] # explicitly skip any non-poems, just in case context["poems"] = [p for p in poems if rdfns.FREEBASE["book/poem"] in p.rdf_types] return render(request, "network/node_info.html", context)
def connections(self, rdftype=None, resource=rdflib.resource.Resource):
    '''Generate connected entities (direct neighbors in the network
    graph) with their relationship terms (edge labels).

    Optionally, takes an RDF type to filter the entities (e.g., restrict
    only to People or Organizations), and a subclass of
    :class:`rdflib.resource.Resource` to initialize each entity as.

    :param rdftype: optional RDF type URI; neighbors without this type
        in the RDF graph are skipped
    :param resource: class used to wrap each neighbor (default
        :class:`rdflib.resource.Resource`)
    :returns: list of ``(resource, (rels, weight))`` tuples sorted by
        descending total edge weight; empty list if this node is not
        in the network
    '''
    network = network_data()
    graph = rdf_data()
    if self.nx_node_id not in network.nodes():
        # previously returned {} here while the normal path returns a
        # list; return an empty list for a consistent type
        return []
    # this also works...
    # neighbors = network.neighbors(self.nx_node_id)
    ego_graph = self.ego_graph()
    neighbors = ego_graph.nodes()
    connections = {}
    for node in neighbors:
        weight = 0
        # don't include the current person in their own connections
        if node == self.nx_node_id:
            continue
        uriref = rdflib.URIRef(node)
        # if an rdf type was specified, filter out items that do not
        # match that type.
        if rdftype is not None and \
           (uriref, rdflib.RDF.type, rdftype) not in graph:
            continue
        res = resource(graph, uriref)
        rels = set()
        # find any edges between this node and me
        # include data to simplify accessing edge label
        # use edges & labels from original multidigraph
        all_edges = network.out_edges(node, data=True) + \
            network.in_edges(node, data=True)
        for edge in all_edges:
            src, target, data = edge
            if self.nx_node_id in edge and 'label' in data:
                weight += data.get('weight', 1)  # assume default of 1 if not set
                rels.add(data['label'])
        # correspondence implies acquaintance; drop the weaker 'knows' label
        if 'knows' in rels and 'correspondedWith' in rels:
            rels.remove('knows')
        # connections[res] = {'rels': rels, 'weight': weight}
        connections[res] = (rels, weight)
    # sort by weight so strongest connections will be listed first
    conn = sorted(connections.items(), key=lambda x: x[1][1], reverse=True)
    return conn
def egograph_js(request, id):
    '''Egograph information as JSON for a single person, suitable for
    client-side network visualization.'''
    uri = local_uri(reverse("people:profile", args=[id]), request)
    person = RdfPerson(rdf_data(), rdflib.URIRef(uri))
    ego = person.ego_graph(radius=1,
                           types=["Person", "Organization", "Place"])
    # annotate nodes in graph with degree
    # NOTE: not a directional graph, so in/out degree not available
    ego = annotate_graph(
        ego,
        fields=["degree", "in_degree", "out_degree",
                "betweenness_centrality", "eigenvector_centrality"],
    )
    payload = json.dumps(json_graph.node_link_data(ego))
    return HttpResponse(payload, content_type="application/json")
def profile(request, id):
    "Display a profile page for a single person associated with the Belfast Group."
    uri = local_uri(reverse("people:profile", args=[id]), request)
    g = rdf_data()
    uriref = rdflib.URIRef(uri)
    # 404 unless the generated URI is actually a person in our rdf dataset
    if (uriref, rdflib.RDF.type, rdfns.SCHEMA_ORG.Person) not in g:
        raise Http404
    # TODO: move groupsheet lookup to rdfperson class
    sheets = get_rdf_groupsheets(author=uri)
    context = {
        "person": RdfPerson(g, uriref),
        "groupsheets": sheets,
        "page_rdf_type": "schema:ProfilePage",
    }
    return render(request, "people/profile.html", context)
def _network_graph(min_degree=1, **kwargs):
    '''Build a filtered copy of the full network graph.

    Keeps only nodes of selected types, replaces BelfastGroupSheet nodes
    with an inferred author-to-group edge, and then drops nodes below
    the requested minimum degree.

    :param min_degree: minimum degree a node must have (after type
        filtering) to remain in the graph; defaults to 1
    :returns: filtered :mod:`networkx` graph
    '''
    graph = network_data().copy()  # don't modify the original network
    rdfgraph = rdf_data()
    # filter graph by type of node
    types = ["Person", "Organization", "Place", "BelfastGroupSheet"]
    # iterate over a snapshot of the node list, since nodes are removed
    # during iteration (nx 1.x returns a list, but nx 2.x node views
    # would raise if mutated while iterating)
    for n in list(graph.nodes()):
        if "type" not in graph.node[n] or graph.node[n]["type"] not in types:
            graph.remove_node(n)
            continue
        # use groupsheets to infer a connection between the author
        # of the groupsheet and the group itself
        # FIXME: this needs to be in data prep/clean, NOT here
        # TODO: should be handled in prep now; confirm and then remove this logic
        if graph.node[n]["type"] == "BelfastGroupSheet":
            sheet = RdfGroupSheet(rdfgraph, rdflib.URIRef(n))
            # FIXME: error handling when author is not in the graph?
            # should probably at least log this...
            if sheet.author and unicode(sheet.author.identifier) in graph:
                graph.add_edge(unicode(sheet.author.identifier),
                               BELFAST_GROUP_URI, weight=4)
            # remove the groupsheet itself from the network, to avoid
            # cluttering up the graph with too much information
            # graph.add_edge(n, BELFAST_GROUP_URI, weight=5)
            graph.remove_node(n)
    # AFTER filtering by type, filter out by requested minimum degree;
    # again iterate a snapshot because nodes are removed as we go
    removed = 0
    for n in list(graph.nodes()):
        if graph.degree(n) < min_degree:
            removed += 1
            graph.remove_node(n)
    logger.info("removed %d nodes with degree less than %d" %
                (removed, min_degree))
    return graph
def node_info(request):
    """Return an HTML snippet with brief information about a node in the
    network (e.g., name, number of Group sheets, link to profile page if
    there is one).  Intended to be called via AJAX and displayed with
    the network graphs.

    Expects a url parameter ``id`` with the node identifier.
    """
    node_id = request.GET.get("id", None)
    if node_id is None:
        # no id specified: nothing to look up
        raise Http404
    # TODO: better to get from gexf or rdf ?
    graph = gexf.read_gexf(settings.GEXF_DATA["full"])
    node = graph.node[node_id]
    context = {"node": node}
    # TODO: handle other types? location, organization
    if node.get("type", None) == "Person":
        # init rdf person
        context["person"] = RdfPerson(rdf_data(), rdflib.URIRef(node_id))
    return render(request, "network/node_info.html", context)
def find_places():
    'Generate a list of :class:`RdfLocation` associated with Belfast Group people.'
    graph = rdf_data()
    # every subject typed as a schema.org Place becomes an RdfLocation
    place_uris = graph.subjects(predicate=rdflib.RDF.type,
                                object=rdfns.SCHEMA_ORG.Place)
    return [RdfLocation(graph, place) for place in place_uris]
def BelfastGroup():
    '''Convenience method to initialize and return an
    :class:`RdfOrganization` for the Belfast Group.'''
    graph = rdf_data()
    return RdfOrganization(graph, rdfns.BELFAST_GROUP_URIREF)
def rdfcollection(self):
    ''':class:`belfast.groupsheets.rdfodels.RdfArchivalCollection` this
    image came from, if :attr:`collection_uri` is set; otherwise None.'''
    if self.collection_uri is None:
        return None
    return RdfArchivalCollection(rdf_data(),
                                 rdflib.URIRef(self.collection_uri))
def rdfperson(self):
    'associated :class:`~belfast.people.rdfmodels.RdfPerson`'
    uriref = rdflib.URIRef(self.person_uri)
    return RdfPerson(rdf_data(), uriref)
def handle(self, *args, **options):
    '''Run the RDF data-preparation pipeline.

    Steps (harvest, queens, identify, smush, related, connect, gexf) run
    in order; if any step options are given, only those steps run,
    otherwise all of them do.  ``--clear`` wipes the RDF database first.
    '''
    self.verbosity = options['verbosity']
    # harvest from the current configured site
    current_site = Site.objects.get(id=settings.SITE_ID)
    self.harvest_urls.extend(['http://%s/groupsheets/%s/' %
                              (current_site.domain.rstrip('/'), i)
                              for i in self.tei_ids])
    # if specific steps are specified, run only those
    # otherwise, run all steps
    all_steps = not any([options['harvest'], options['queens'],
                         options['related'], options['smush'],
                         options['gexf'], options['identify'],
                         options['connect']])
    # initialize graph persistence
    # graph = rdflib.ConjunctiveGraph('Sleepycat')
    # graph.open(settings.RDF_DATABASE, create=True)
    graph = rdf_data()
    # if clear is specified, remove the entire db
    if options['clear']:
        if self.verbosity >= self.v_normal:
            print 'Removing %d contexts and %d triples from the current RDF graph' % \
                (len(list(graph.contexts())), len(graph))
        # can't find a reliable way to remove all triples and contexts
        # so close the graph, remove everything, and start over
        graph.close()
        shutil.rmtree(settings.RDF_DATABASE)
        graph.open(settings.RDF_DATABASE, create=True)
    if all_steps or options['harvest']:
        self.stdout.write('-- Harvesting RDF from EmoryFindingAids related to the Belfast Group')
        # inaccurate; also harvesting tei from local site
        HarvestRdf(self.harvest_urls, find_related=True,
                   verbosity=self.verbosity, graph=graph,
                   no_cache=options['no_cache'])
        # local info from RDF data - additional bios, Group sheet in private collection
        self.stdout.write('-- Adding RDF data from local fixtures')
        LocalRDF(graph, self.local_rdf_fixtures)
    if all_steps or options['queens']:
        self.stdout.write('-- Converting Queens University Belfast Group collection description to RDF')
        QUB(self.QUB_input, verbosity=self.verbosity, graph=graph,
            url=QUB.QUB_BELFAST_COLLECTION)
    if all_steps or options['identify']:
        # identify groupsheets in the data and add local groupsheet type if not present
        self.stdout.write('-- Identifying groupsheets')
        IdentifyGroupSheets(graph)
    if all_steps or options['smush']:
        # smush any groupsheets in the data
        self.stdout.write('-- Smushing groupsheet URIs and generating local profile URIs')
        # NOTE: might be nice to smush *after* cleaning up author names, but for some reason
        # that results in a number of authors/groupsheets getting dropped
        SmushGroupSheets(graph)
        ProfileUris(graph)
    if all_steps or options['related']:
        self.stdout.write('-- Annotating graph with related information from VIAF, GeoNames, and DBpedia')
        Annotate(graph)
    if all_steps or options['connect']:
        # infer connections
        self.stdout.write('-- Inferring connections: groupsheet time period, owner, authors affiliated with group')
        InferConnections(graph)
        # TODO: groupsheet owner based on source collection
    if all_steps or options['gexf']:
        # generate gexf
        self.stdout.write('-- Generating network graphs and saving as GEXF')
        nx.Rdf2Gexf(graph, settings.GEXF_DATA['full'])
        nx.BelfastGroupGexf(graph, settings.GEXF_DATA['bg1'])
    # set last-modification time
    set_site_lastmodified(graph)
    graph.close()