def connected_graph(self, graph, uri):
    """Return the subgraph of `graph` reachable from `uri`.

    Walks triples in both directions (uri as subject and as object),
    copying each visited triple into a fresh Graph.  What happens when
    the graph is unconnected is controlled by the module-level
    `unconnectedAction` setting:

    * 'ignore' -- return `graph` unchanged, no connectivity check
    * 'warn'   -- print a warning, return only the reachable subgraph
    * 'drop'   -- silently return only the reachable subgraph
    * 'raise'  -- raise OreException

    :param graph: source rdflib Graph to check/trim
    :param uri: node to start the traversal from
    :raises OreException: graph unconnected and unconnectedAction == 'raise'
    :raises ValueError: unknown unconnectedAction value
    """
    if unconnectedAction == 'ignore':
        return graph
    g = Graph()
    # Literals can never be subjects, so only resource nodes count
    # towards the connectivity comparison below.
    # (list comprehension instead of filter(): keeps len() valid on Py3)
    all_nodes = [y for y in graph.all_nodes()
                 if not isinstance(y, Literal)]
    discovered = set()           # set instead of dict-as-set: O(1) membership
    visiting = [uri]
    while visiting:
        x = visiting.pop()
        discovered.add(x)
        # outgoing edges: x as subject
        for (p, new_x) in graph.predicate_objects(subject=x):
            g.add((x, p, new_x))
            if (isinstance(new_x, (URIRef, BNode))
                    and new_x not in discovered
                    and new_x not in visiting):
                visiting.append(new_x)
        # incoming edges: x as object
        for (new_x, p) in graph.subject_predicates(object=x):
            g.add((new_x, p, x))
            if (isinstance(new_x, (URIRef, BNode))
                    and new_x not in discovered
                    and new_x not in visiting):
                visiting.append(new_x)
    if len(discovered) != len(all_nodes):
        if unconnectedAction == 'warn':
            print("Warning: Graph is unconnected, some nodes being dropped")
        elif unconnectedAction == 'raise':
            raise OreException('Graph to be serialized is unconnected')
        elif unconnectedAction != 'drop':
            raise ValueError('Unknown unconnectedAction setting: %s'
                             % unconnectedAction)
    return g
def merge_graphs(self, rem, page=-1):
    """Merge the resource map, its aggregation and their resources
    into a single rdflib Graph, then connectivity-check it.

    Stamps dcterms:created (if missing) and dcterms:modified on the
    resource map, and adds the library's default creator agent when
    none is set.  When `page` != -1 only that 1-based page of the
    aggregation's resources (pageSize per page) is serialised.
    Triples aggregated by nested Aggregations, and (when self.public)
    internal bookkeeping predicates, are removed from the result.

    :raises OreException: the aggregation aggregates nothing
    """
    g = Graph()
    # Put in some sort of recognition of library?
    n = now()
    if not rem.created:
        rem._dcterms.created = n
    rem._dcterms.modified = n
    if not rem._dcterms.creator:
        rem.add_agent(foresiteAgent, 'creator')
    aggr = rem.aggregation
    stack = [rem, aggr]
    if page != -1:
        # first page is 1, then 2, 3 ...
        start = (page - 1) * pageSize
        tosrlz = aggr._resources_[start:start + pageSize]
    else:
        tosrlz = aggr._resources_
    # Triples to strip afterwards: ore:aggregates of *nested* aggregations
    remove = []
    for (r, p) in tosrlz:
        if isinstance(r, Aggregation):
            for a in r._ore.aggregates:
                remove.append(
                    (r._uri_, namespaces['ore']['aggregates'], a))
        stack.extend([r, p])
    # Breadth-first merge of every reachable object's graph.
    # `done` stays a list on purpose: the queued project objects are not
    # known to be hashable, so membership relies on equality only.
    done = []
    while stack:
        what = stack.pop(0)
        if what is None or what in done:  # was `what == None`
            continue
        done.append(what)
        g += what._graph_
        for at in what._triples_.values():
            stack.append(at)
        for who in what._agents_.values():
            stack.append(who)
    if self.public:
        # Remove internal methods
        for p in internalPredicates:
            for (s, o) in g.subject_objects(p):
                g.remove((s, p, o))
    for trip in remove:
        g.remove(trip)
    if not aggr._resources_:
        raise OreException("Aggregation must aggregate something")
    g = self.connected_graph(g, aggr._uri_)
    return g
def serialize(self, rem, page=-1):
    """Serialise the resource map `rem` as an Atom *feed* document.

    The feed carries the aggregation's metadata (title, subtitle,
    authors, categories, links) and one atom:entry per aggregated
    resource.  Returns a ReMDocument wrapping the generated XML.

    :raises OreException: aggregation has no dc:title
    :raises ValueError: an aggregated resource has no dc:title
    """
    aggr = rem._aggregation_
    # Check entire graph is connected
    g = self.merge_graphs(rem)
    # lxml cannot serialise a default ('') namespace prefix in nsmap
    if '' in namespaces:
        del namespaces[u'']
    root = Element("feed", nsmap=namespaces)
    #namespaces[''] = myNamespace

    ## Aggregation Info
    e = SubElement(root, 'id')
    e.text = str(aggr.uri)
    if not aggr._dc.title:
        raise OreException(
            "Atom Serialisation requires title on aggregation")
    else:
        e = SubElement(root, 'title')
        e.text = str(aggr._dc.title[0])
    if aggr._dc.description:
        e = SubElement(root, 'subtitle')
        e.text = str(aggr._dc.description[0])
    for who in aggr._dcterms.creator:
        e = SubElement(root, 'author')
        agent = aggr._agents_[who]
        self.make_agent(e, agent)
    for bn in aggr._dcterms.contributor:
        e = SubElement(root, 'contributor')
        agent = aggr._agents_[bn]
        self.make_agent(e, agent)
    for t in aggr._ore.similarTo:
        self.make_link(root, 'related', t, g)
    for t in aggr._dcterms.rights:
        self.make_link(root, 'license', t, g)
    for t in aggr._rdf.type:
        e = SubElement(root, 'category', term=str(t))
        # scheme/label are optional annotations: missing -> IndexError
        try:
            scheme = list(g.objects(t, namespaces['rdfs']['isDefinedBy']))[0]
            e.set('scheme', str(scheme))
        except IndexError:
            pass
        try:
            label = list(g.objects(t, namespaces['rdfs']['label']))[0]
            e.set('label', str(label))
        except IndexError:
            pass
    orms = []
    for orm in aggr._resourceMaps_:
        if orm != rem:
            self.make_link(root, 'alternate', orm.uri, g)
            orms.append(orm.uri)
    for t in aggr._ore.isDescribedBy:
        # check not in orms (already linked above)
        if not t in orms:
            self.make_link(root, 'alternate', t, g)
    self.generate_rdf(root, aggr)

    ## ReM Info
    self.make_link(root, 'self', rem.uri, g)
    e = SubElement(root, 'updated')
    e.text = now()
    # ReM Author
    if rem._dcterms.creator:
        uri = rem._dcterms.creator[0]
        e = SubElement(root, 'generator', uri=str(uri))
        agent = rem._agents_[uri]
        n = agent._foaf.name[0]
        e.text = str(n)
        self.done_triples.append((uri, namespaces['foaf']['name'], n))
    # if no logo, put in nice ORE icon
    e = SubElement(root, 'icon')
    if aggr._foaf.logo:
        e.text = str(aggr._foaf.logo[0])
    elif rem._foaf.logo:
        e.text = str(rem._foaf.logo[0])
    else:
        e.text = "http://www.openarchives.org/ore/logos/ore_icon.png"
    if rem._dc.rights:
        e = SubElement(root, 'rights')
        e.text = str(rem._dc.rights[0])  # str() for consistency with other fields
    self.generate_rdf(root, rem)

    ## Process Entries
    for (res, proxy) in aggr._resources_:
        entry = SubElement(root, 'entry')
        e = SubElement(entry, 'id')
        if proxy:
            e.text = str(proxy.uri)
        else:
            e.text = "urn:uuid:%s" % gen_uuid()
        e = SubElement(entry, 'link', rel="alternate", href=str(res.uri))
        # type = dc:format
        fmt = list(g.objects(res.uri, namespaces['dc']['format']))
        if fmt:
            e.set('type', str(fmt[0]))
        if not res._dc.title:
            raise ValueError(
                "All entries must have a title for ATOM serialisation")
        else:
            e = SubElement(entry, 'title')
            e.text = str(res._dc.title[0])
        for t in res._rdf.type:
            e = SubElement(entry, 'category', term=str(t))
            try:
                scheme = list(
                    g.objects(t, namespaces['rdfs']['isDefinedBy']))[0]
                e.set('scheme', str(scheme))
            except IndexError:
                pass
            try:
                label = list(g.objects(t, namespaces['rdfs']['label']))[0]
                e.set('label', str(label))
            except IndexError:
                pass
        for a in res._dcterms.creator:
            e = SubElement(entry, 'author')
            agent = res._agents_[a]
            self.make_agent(e, agent)
        for a in res._dcterms.contributor:
            e = SubElement(entry, 'contributor')
            agent = res._agents_[a]
            self.make_agent(e, agent)
        if res._dcterms.abstract:
            e = SubElement(entry, 'summary')
            e.text = str(res._dcterms.abstract[0])
        # Not sure about this at object level?
        for oa in res._ore.isAggregatedBy:
            if oa != aggr._uri_:
                e = SubElement(entry, 'link', rel="related", href=str(oa))
        e = SubElement(entry, 'updated')
        e.text = now()
        if proxy and proxy._ore.lineage:
            e = SubElement(entry, 'link', rel="via",
                           href=str(proxy._ore.lineage[0]))
        res._currProxy_ = proxy
        self.generate_rdf(entry, res)
        res._currProxy_ = None

    data = etree.tostring(root)
    data = data.replace('\n', '')
    data = self.spacesub.sub('', data)
    uri = str(rem._uri_)
    self.done_triples = []
    return ReMDocument(uri, data)
def serialize(self, rem, page=-1):
    """Serialise the resource map `rem` as a single Atom *entry*.

    Builds an atom:entry with the aggregation's metadata, links for
    all aggregated resources, and an ore:triples section holding the
    remaining RDF.  Triples already expressed in Atom markup are
    recorded in self.done_triples so generate_rdf can skip them.
    Returns a ReMDocument wrapping the generated XML.

    :raises OreException: aggregation has no dc/dcterms title
    """
    aggr = rem._aggregation_
    g = self.merge_graphs(rem)

    # make nsmap better: declare a prefix for every predicate /
    # rdf:type namespace actually present in the graph
    nm = g.namespace_manager
    nsmap = {'atom': str(namespaces['atom'])}
    poss = uniq(g.predicates()) + uniq(g.objects(None, RDF.type))
    for pred in poss:
        pf, ns, l = nm.compute_qname(pred)
        nsmap[pf] = ns
    root = Element("{%s}entry" % namespaces['atom'], nsmap=nsmap)

    # entry/id == tag for entry == ReM dc:identifier
    # if not exist, generate Yet Another uuid
    e = SubElement(root, '{%s}id' % namespaces['atom'])
    if rem._dc.identifier:
        dcid = rem._dc.identifier[0]
        e.text = str(dcid)
        self.done_triples.append(
            (rem._uri_, namespaces['dc']['identifier'], dcid))
    else:
        e.text = "urn:uuid:%s" % gen_uuid()

    # entry/title == Aggr's dc:title (falling back to dcterms:title)
    title = aggr._dc.title
    tns = 'dc'
    if not title:
        title = aggr._dcterms.title
        tns = 'dcterms'
    if not title:
        raise OreException(
            "Atom Serialisation requires title on aggregation")
    else:
        e = SubElement(root, '{%s}title' % namespaces['atom'])
        dctit = title[0]
        e.text = str(dctit)
        self.done_triples.append(
            (aggr._uri_, namespaces[tns]['title'], dctit))

    # entry/author == Aggr's dcterms:creator
    for who in aggr._dcterms.creator:
        e = SubElement(root, '{%s}author' % namespaces['atom'])
        agent = aggr._agents_[who]
        self.make_agent(e, agent)
        self.done_triples.append(
            (aggr._uri_, namespaces['dcterms']['creator'], agent._uri_))

    # entry/contributor == Aggr's dcterms:contributor
    for bn in aggr._dcterms.contributor:
        e = SubElement(root, '{%s}contributor' % namespaces['atom'])
        # BUGFIX: was aggr._agents_[who], reusing the creator loop's
        # variable instead of this loop's `bn`
        agent = aggr._agents_[bn]
        self.make_agent(e, agent)
        self.done_triples.append(
            (aggr._uri_, namespaces['dcterms']['contributor'], agent._uri_))

    # entry/category[@scheme="(magic)"][@term="(datetime)"]
    for t in aggr._dcterms.created:
        t = t.strip()
        e = SubElement(
            root, '{%s}category' % namespaces['atom'], term=str(t),
            scheme="http://www.openarchives.org/ore/terms/datetime/created"
        )
    for t in aggr._dcterms.modified:
        t = t.strip()
        e = SubElement(
            root, '{%s}category' % namespaces['atom'], term=str(t),
            scheme="http://www.openarchives.org/ore/terms/datetime/modified"
        )

    # entry/category == Aggr's rdf:type
    for t in aggr._rdf.type:
        e = SubElement(root, '{%s}category' % namespaces['atom'],
                       term=str(t))
        # scheme/label annotations are optional: missing -> IndexError
        try:
            scheme = list(g.objects(t, namespaces['rdfs']['isDefinedBy']))[0]
            e.set('scheme', str(scheme))
            self.done_triples.append(
                (t, namespaces['rdfs']['isDefinedBy'], scheme))
        except IndexError:
            pass
        try:
            label = list(g.objects(t, namespaces['rdfs']['label']))[0]
            e.set('label', str(label))
            self.done_triples.append(
                (t, namespaces['rdfs']['label'], label))
        except IndexError:
            pass
        self.done_triples.append(
            (aggr._uri_, namespaces['rdf']['type'], t))

    # entry/summary
    if aggr._dc.description:
        e = SubElement(root, '{%s}summary' % namespaces['atom'])
        desc = aggr._dc.description[0]
        e.text = str(desc)
        self.done_triples.append(
            (aggr._uri_, namespaces['dc']['description'], desc))

    # All remaining aggr links (predicates already handled are skipped)
    done = [
        namespaces['rdf']['type'],
        namespaces['ore']['aggregates'],
        namespaces['dcterms']['creator'],
        namespaces['dcterms']['contributor'],
        namespaces['dc']['title'],
        namespaces['dc']['description']
    ]
    for (p, o) in g.predicate_objects(aggr.uri):
        if not p in done:
            if isinstance(o, URIRef):
                self.make_link(root, p, o, g)
                self.done_triples.append((aggr._uri_, p, o))

    # entry/content // link[@rel="alternate"]
    # Do we have a splash page?
    altDone = 0
    # BUGFIX: was aggr._rdf._type; every other access uses _rdf.type
    atypes = aggr._rdf.type
    possAlts = []
    for (r, p) in aggr.resources:
        mytypes = r._rdf.type
        if namespaces['eurepo']['humanStartPage'] in mytypes:
            altDone = 1
            self.make_link(root, 'alternate', r.uri, g)
            break
        # check if share non Aggregation type
        # eg aggr == article and aggres == article, likely
        # to be good alternate
        for m in mytypes:
            if m != namespaces['ore']['Aggregation'] and \
               m in atypes:
                # BUGFIX: was possAlt.append(...) -> NameError
                possAlts.append(r.uri)
    if not altDone and possAlts:
        # XXX more intelligent algorithm here
        self.make_link(root, '{%s}alternate' % namespaces['atom'],
                       possAlts[0], g)
        altDone = 1
    if not altDone and build_html_atom_content:
        e = SubElement(root, '{%s}content' % namespaces['atom'])
        e.set('type', 'html')
        # make some representative html
        # this can get VERY LONG so default to not doing this
        html = ['<ul>']
        for (r, p) in aggr.resources:
            html.append('<li><a href="%s">%s</a></li>'
                        % (r.uri, r.title[0]))
        html.append('</ul>')
        e.text = '\n'.join(html)
    else:
        # NOTE(review): this branch also fires when altDone == 1, so a
        # "No Content" element is emitted alongside the alternate link;
        # preserved as original behaviour -- confirm intended.
        e = SubElement(root, '{%s}content' % namespaces['atom'])
        e.set('type', 'html')
        e.text = "No Content"

    # entry/link[@rel='self'] == URI-R
    self.make_link(root, 'self', rem._uri_, g)
    # entry/link[@rel='ore:describes'] == URI-A
    self.make_link(root, namespaces['ore']['describes'], aggr._uri_, g)

    ### These are generated automatically in merge_graphs
    # entry/published == ReM's dcterms:created
    if rem._dcterms.created:
        e = SubElement(root, '{%s}published' % namespaces['atom'])
        c = rem._dcterms.created[0]
        md = str(c)
        if md.find('Z') == -1:
            # append Z
            md += "Z"
        e.text = md
        self.done_triples.append(
            (rem._uri_, namespaces['dcterms']['created'], c))
    # entry/updated == ReM's dcterms:modified
    e = SubElement(root, '{%s}updated' % namespaces['atom'])
    if rem._dcterms.modified:
        c = rem._dcterms.modified[0]
        md = str(c)
        if md.find('Z') == -1:
            # append Z
            md += "Z"
        e.text = str(md)
        self.done_triples.append(
            (rem._uri_, namespaces['dcterms']['modified'], c))
    else:
        e.text = now()
    # entry/rights == ReM's dc:rights
    if rem._dc.rights:
        e = SubElement(root, '{%s}rights' % namespaces['atom'])
        r = rem._dc.rights[0]
        e.text = str(r)
        self.done_triples.append(
            (rem._uri_, namespaces['dc']['rights'], r))

    # entry/source/author == ReM's dcterms:creator
    if rem._dcterms.creator:
        # Should at least be our generator! (right?)
        src = SubElement(root, '{%s}source' % namespaces['atom'])
        for who in rem._dcterms.creator:
            e = SubElement(src, '{%s}author' % namespaces['atom'])
            agent = rem._agents_[who]
            self.make_agent(e, agent)
            self.done_triples.append(
                (rem._uri_, namespaces['dcterms']['creator'], agent._uri_))
        for who in rem._dcterms.contributor:
            e = SubElement(src, '{%s}contributor' % namespaces['atom'])
            agent = rem._agents_[who]
            self.make_agent(e, agent)
            self.done_triples.append(
                (rem._uri_, namespaces['dcterms']['contributor'],
                 agent._uri_))
        e = SubElement(src, '{%s}generator' % namespaces['atom'],
                       uri=str(libraryUri), version=str(libraryVersion))
        e.text = str(libraryName)

    # Remove aggregation, resource map props already done
    # All of agg res needs to be done
    for (r, p) in aggr.resources:
        self.make_link(root, namespaces['ore']['aggregates'], r.uri, g)
        self.done_triples.append(
            (aggr._uri_, namespaces['ore']['aggregates'], r._uri_))

    # Now create ore:triples and populate with rdf/xml
    trips = SubElement(root, '{%s}triples' % namespaces['ore'])
    self.generate_rdf(trips, g)

    data = etree.tostring(root, pretty_print=True)
    #data = data.replace('\n', '')
    #data = self.spacesub.sub('', data)
    uri = str(rem._uri_)
    self.done_triples = []
    return ReMDocument(uri, data, format='atom', mimeType=self.mimeType)
def parse(self, doc):
    """Parse an Atom-entry serialisation back into a ResourceMap.

    Extracts the resource map / aggregation URIs from the entry's
    self and ore:describes links, populates aggregation metadata from
    the Atom markup, then folds the embedded ore:triples RDF into the
    matching objects.  Returns the reconstructed ResourceMap.

    :param doc: a ReMDocument-like object with a .data XML string
    :raises OreException: strict mode and no atom:updated present
    """
    root = etree.fromstring(doc.data)
    self.curr_root = root
    # first construct aggr and rem
    self.all_objects = {}
    uri_a = root.xpath("/atom:entry/atom:link[@rel='http://www.openarchives.org/ore/terms/describes']/@href", namespaces=namespaces)
    uri_r = root.xpath("/atom:entry/atom:link[@rel='self']/@href", namespaces=namespaces)
    rem = ResourceMap(uri_r[0])
    aggr = Aggregation(uri_a[0])
    rem.set_aggregation(aggr)
    self.all_objects[rem._uri_] = rem
    self.all_objects[aggr._uri_] = aggr

    # Aggregation Info
    title = root.xpath("/atom:entry/atom:title/text()", namespaces=namespaces)
    aggr._dc.title = title[0]
    for auth in root.xpath('/atom:entry/atom:author', namespaces=namespaces):
        self.handle_person(auth, aggr, 'creator')
    for auth in root.xpath('/atom:entry/atom:contributor', namespaces=namespaces):
        self.handle_person(auth, aggr, 'contributor')
    for cat in root.xpath('/atom:entry/atom:category', namespaces=namespaces):
        self.handle_category(cat, aggr)
    for link in root.xpath('/atom:entry/atom:link', namespaces=namespaces):
        self.handle_link(link, aggr)
    summary = root.xpath("/atom:entry/atom:summary/text()", namespaces=namespaces)
    if summary:
        aggr._dc.description = summary[0]

    # Resource Map Info
    # atom:id is recorded as a resource with dcterms:hasVersion -> ReM
    aid = root.xpath("/atom:entry/atom:id/text()", namespaces=namespaces)
    at = ArbitraryResource(aid[0])
    at._dcterms.hasVersion = rem._uri_
    rem.add_triple(at)
    self.all_objects[at._uri_] = at
    updated = root.xpath("/atom:entry/atom:updated/text()", namespaces=namespaces)
    if updated:
        rem._dcterms.modified = updated[0]
    elif self.strict:
        raise OreException("Graph does not have mandatory 'ResourceMap dcterms:modified timestamp' triple")
    published = root.xpath("/atom:entry/atom:published/text()", namespaces=namespaces)
    if published:
        rem._dcterms.created = published[0]
    rights = root.xpath("/atom:entry/atom:rights/text()", namespaces=namespaces)
    if rights:
        rem._dc.rights = rights[0]
    lic = root.xpath("/atom:entry/atom:link[@rel='license']/@href", namespaces=namespaces)
    if lic:
        rem._dcterms.rights = URIRef(lic[0])
    for rauth in root.xpath('/atom:entry/atom:source/atom:author', namespaces=namespaces):
        self.handle_person(rauth, rem, 'creator')
    for rauth in root.xpath('/atom:entry/atom:source/atom:contributor', namespaces=namespaces):
        self.handle_person(rauth, rem, 'contributor')

    # Fold the embedded RDF descriptions into their objects
    for rdf in root.xpath('/atom:entry/ore:triples/rdf:Description', namespaces=namespaces):
        try:
            about = URIRef(rdf.attrib['{%s}about' % namespaces['rdf']])
        except KeyError:
            # no rdf:about -- probably a blank node; a missing
            # rdf:nodeID here propagates as KeyError (as before)
            about = BNode(rdf.attrib['{%s}nodeID' % namespaces['rdf']])
        if about in self.all_objects:
            self.handle_rdf(rdf, self.all_objects[about])
        else:
            self.handle_rdf(rdf, aggr)
    self.all_objects = {}
    return rem
def process_graph(self, graph):
    """Rebuild a ResourceMap object tree from a parsed RDF graph.

    Locates the mandatory ore:describes triple to identify the
    resource map and aggregation, validates them (strictly when
    self.strict), reconstructs aggregated resources, proxies and
    agents, and attaches any remaining subjects as ArbitraryResources
    linked to the nearest main object.  Returns the ResourceMap.

    :raises OreException: mandatory triples missing or, in strict
        mode, malformed URIs / unconnected input (per unconnectedAction)
    """
    # take graph and find objects, split up stuff into graph
    # Find ReM/Aggr
    lres = list(graph.query("PREFIX ore: <%s> SELECT ?a ?b WHERE {?a ore:describes ?b .}" % namespaces['ore']))
    try:
        uri_r = lres[0][0]
        uri_a = lres[0][1]
    except IndexError:
        raise OreException("Graph does not have mandatory ore:describes triple")
    if self.strict and len(lres) != 1:
        raise OreException("Graph must contain exactly one ore:describes triple")
    if self.strict and not protocolUriRe.match(uri_r):
        raise OreException("Resource Map URI must be protocol-based URI: %s" % uri_r)
    if self.strict and not protocolUriRe.match(uri_a):
        raise OreException("Aggregation URI must be protocol-based URI: %s" % uri_a)
    remc = list(graph.query("PREFIX dcterms: <%s> SELECT ?a WHERE { <%s> dcterms:creator ?a .}" % (namespaces['dcterms'], uri_r)))
    if self.strict and not remc:
        raise OreException("Graph does not have mandatory 'ResourceMap dcterms:creator ?x' triple")
    remc = list(graph.query("PREFIX dcterms: <%s> SELECT ?a WHERE { <%s> dcterms:modified ?a .}" % (namespaces['dcterms'], uri_r)))
    if self.strict and not remc:
        raise OreException("Graph does not have mandatory 'ResourceMap dcterms:modified timestamp' triple")

    rem = ResourceMap(uri_r)
    aggr = Aggregation(uri_a)
    rem.set_aggregation(aggr)
    self.set_fields(rem, graph)
    self.set_fields(aggr, graph)
    things = {uri_r: rem, uri_a: aggr}

    # Aggregated resources and their (optional) proxies
    res2 = graph.query("PREFIX ore: <http://www.openarchives.org/ore/terms/> SELECT ?b WHERE {<%s> ore:aggregates ?b .}" % uri_a)
    for uri_ar in res2:
        uri_ar = uri_ar[0]
        if self.strict and not protocolUriRe.match(uri_ar):
            raise OreException("Aggregated Resource URI must be protocol-based URI: %s" % uri_ar)
        res = AggregatedResource(uri_ar)
        things[uri_ar] = res
        proxy = list(graph.query("PREFIX ore: <http://www.openarchives.org/ore/terms/> SELECT ?a WHERE {?a ore:proxyFor <%s> .}" % uri_ar))
        try:
            uri_p = proxy[0][0]
            p = Proxy(uri_p)
            p.set_forIn(res, aggr)
            things[uri_p] = p
            aggr.add_resource(res, p)
            self.set_fields(res, graph)
            self.set_fields(p, graph)
        except IndexError:
            # no proxy for this resource
            aggr.add_resource(res, None)
            self.set_fields(res, graph)

    allThings = things.copy()
    # Anything that looks like an agent: has foaf:name / foaf:mbox or
    # appears as a dcterms:creator / dcterms:contributor object
    agents = list(graph.query("PREFIX foaf: <%s> PREFIX dcterms: <%s> SELECT ?a WHERE { { ?a foaf:name ?b } UNION { ?a foaf:mbox ?b } UNION { ?b dcterms:creator ?a } UNION { ?b dcterms:contributor ?a } }" % (namespaces['foaf'], namespaces['dcterms'])))
    for a_uri in agents:
        a_uri = a_uri[0]
        a = Agent(a_uri)
        allThings[a_uri] = a
        self.set_fields(a, graph)
        for (subj, pred) in graph.subject_predicates(URIRef(a_uri)):
            if subj in things:
                # direct manipulation, as will have already added
                # predicate in set_fields
                what = things[subj]
                what._agents_[a_uri] = a

    # rem and aggr will have default rdf:type triples already
    allThings.update(rem.triples)
    allThings.update(aggr.triples)
    for subj in graph.subjects():
        if subj not in allThings:
            # triple needed
            ar = ArbitraryResource(subj)
            allThings[subj] = ar
            # find our graph
            for (pred, obj) in graph.predicate_objects(subj):
                ar.graph.add((subj, pred, obj))
            # find shortest distance to main object to link to main graph
            # Breadth First Search
            found = 0
            checked = set()
            tocheck = list(graph.subject_predicates(subj))
            while tocheck:
                subsubj = tocheck.pop(0)[0]
                checked.add(subsubj)
                if subsubj in things:
                    things[subsubj]._triples_[ar.uri] = ar
                    found = 1
                    break
                else:
                    extd = list(graph.subject_predicates(subsubj))
                    if extd:
                        # NOTE(review): only the first (subject,
                        # predicate) pair is expanded, and its
                        # predicate is also queued -- preserved from
                        # the original; confirm this BFS is intended
                        for e in extd[0]:
                            if e not in checked:
                                tocheck.append(e)
            if not found:
                if unconnectedAction == 'ignore':
                    # Input graph is not connected!
                    rem._triples_[ar.uri] = ar
                elif unconnectedAction == 'warn':
                    print("Input Graph Not Connected at: %s" % subj)
                elif unconnectedAction == 'raise':
                    raise OreException("Input Graph Not Connected at: %s" % subj)
    return rem