Example #1
0
 def connected_graph(self, graph, uri):
     """Return the part of ``graph`` that is connected to ``uri``.

     Starting at ``uri`` the graph is walked along triples in both
     directions (node as subject and node as object) and every triple
     touched is copied into a new graph.  If the number of nodes reached
     differs from the number of non-literal nodes in ``graph`` the input is
     treated as unconnected and the module-level ``unconnectedAction``
     setting decides what happens:

     * ``'ignore'`` -- no check is made at all; ``graph`` itself is returned.
     * ``'drop'``   -- the connected part is returned silently.
     * ``'warn'``   -- a warning is printed and the connected part returned.
     * ``'raise'``  -- ``OreException`` is raised.

     Any other setting raises ``ValueError`` (only when the graph turns out
     to be unconnected).
     """
     if unconnectedAction == 'ignore':
         return graph

     g = Graph()
     # Literals are never traversed from, so they are left out of the node
     # count that the reachable set is compared against.
     all_nodes = [node for node in graph.all_nodes()
                  if not isinstance(node, Literal)]

     # ``seen`` holds every node that has ever been queued; each queued node
     # is visited exactly once, so its final size is the number of nodes
     # reached.  A set gives constant-time membership tests.
     seen = set([uri])
     visiting = [uri]
     while visiting:
         x = visiting.pop()
         for (p, new_x) in graph.predicate_objects(subject=x):
             g.add((x, p, new_x))
             if isinstance(new_x, (URIRef, BNode)) and new_x not in seen:
                 seen.add(new_x)
                 visiting.append(new_x)
         for (new_x, p) in graph.subject_predicates(object=x):
             g.add((new_x, p, x))
             if isinstance(new_x, (URIRef, BNode)) and new_x not in seen:
                 seen.add(new_x)
                 visiting.append(new_x)

     if len(seen) != len(all_nodes):
         if unconnectedAction == 'warn':
             print("Warning: Graph is unconnected, some nodes being dropped")
         elif unconnectedAction == 'raise':
             raise OreException('Graph to be serialized is unconnected')
         elif unconnectedAction != 'drop':
             raise ValueError('Unknown unconnectedAction setting: %s' %
                              unconnectedAction)
     return g
Example #2
0
    def merge_graphs(self, rem, page=-1):
        """Merge a resource map and everything attached to it into one graph.

        Side effects on ``rem``: ``dcterms:modified`` is always set to the
        current time, ``dcterms:created`` is set when missing, and
        ``foresiteAgent`` is added as creator when there is none.

        :param rem: the resource map whose graphs are merged.
        :param page: 1-based page number selecting a ``pageSize`` slice of
            the aggregated resources; ``-1`` (default) selects all of them.
        :returns: the merged graph after it has been passed through
            ``self.connected_graph`` rooted at the aggregation URI.
        :raises OreException: if the aggregation aggregates nothing.
        """
        g = Graph()
        # Put in some sort of recognition of library?

        n = now()
        if not rem.created:
            rem._dcterms.created = n
        rem._dcterms.modified = n
        if not rem._dcterms.creator:
            rem.add_agent(foresiteAgent, 'creator')

        aggr = rem.aggregation
        stack = [rem, aggr]

        if page != -1:
            # first is 1, 2, 3 ...
            start = (page - 1) * pageSize
            tosrlz = aggr._resources_[start:start + pageSize]
        else:
            tosrlz = aggr._resources_

        # ore:aggregates triples of nested aggregations are collected here
        # and stripped from the merged graph further down.
        remove = []
        for (r, p) in tosrlz:
            if isinstance(r, Aggregation):
                for a in r._ore.aggregates:
                    remove.append(
                        (r._uri_, namespaces['ore']['aggregates'], a))
            stack.extend([r, p])

        done = []
        while stack:
            what = stack.pop(0)
            # Stack entries may be None; skip those and anything already
            # merged.
            if what is None or what in done:
                continue
            done.append(what)
            g += what._graph_
            stack.extend(what._triples_.values())
            stack.extend(what._agents_.values())

        if self.public:
            # Remove internal methods
            for p in internalPredicates:
                # Take a snapshot first: removing triples while the graph's
                # own generator is still being consumed is not safe.
                for (s, o) in list(g.subject_objects(p)):
                    g.remove((s, p, o))
        for trip in remove:
            g.remove(trip)

        if not aggr._resources_:
            raise OreException("Aggregation must aggregate something")
        return self.connected_graph(g, aggr._uri_)
Example #3
0
    def serialize(self, rem, page=-1):
        """Serialise a resource map as an Atom ``feed`` document.

        The aggregation supplies the feed-level id, title, subtitle, authors,
        contributors, categories and links; the resource map supplies the
        ``self`` link, generator, icon and rights; every aggregated resource
        becomes one ``entry`` element.

        :param rem: resource map to serialise; its aggregation is read from
            ``rem._aggregation_``.
        :param page: accepted for interface compatibility; this method does
            not use it.
        :returns: ``ReMDocument`` built from the resource map URI and the
            serialised XML.
        :raises OreException: if the aggregation has no ``dc:title``.
        :raises ValueError: if an aggregated resource has no ``dc:title``.
        """
        aggr = rem._aggregation_

        def add_category(parent, rdf_type):
            # Emit <category term="..."> and decorate it with scheme/label
            # when the merged graph describes the type.  Both attributes are
            # optional, so a failure to produce one is deliberately ignored
            # (but KeyboardInterrupt/SystemExit are no longer swallowed).
            cat = SubElement(parent, 'category', term=str(rdf_type))
            for attr, pred in (('scheme', 'isDefinedBy'), ('label', 'label')):
                try:
                    value = list(
                        g.objects(rdf_type, namespaces['rdfs'][pred]))[0]
                    cat.set(attr, str(value))
                except Exception:
                    pass

        try:
            # Check entire graph is connected
            g = self.merge_graphs(rem)

            # lxml is handed the namespace map without an empty-prefix entry.
            if '' in namespaces:
                del namespaces[u'']
            root = Element("feed", nsmap=namespaces)

            ## Aggregation Info
            e = SubElement(root, 'id')
            e.text = str(aggr.uri)
            if not aggr._dc.title:
                raise OreException(
                    "Atom Serialisation requires title on aggregation")
            e = SubElement(root, 'title')
            e.text = str(aggr._dc.title[0])
            if aggr._dc.description:
                e = SubElement(root, 'subtitle')
                e.text = str(aggr._dc.description[0])

            for who in aggr._dcterms.creator:
                e = SubElement(root, 'author')
                agent = aggr._agents_[who]
                self.make_agent(e, agent)

            for bn in aggr._dcterms.contributor:
                e = SubElement(root, 'contributor')
                agent = aggr._agents_[bn]
                self.make_agent(e, agent)

            for t in aggr._ore.similarTo:
                self.make_link(root, 'related', t, g)

            for t in aggr._dcterms.rights:
                self.make_link(root, 'license', t, g)

            for t in aggr._rdf.type:
                add_category(root, t)

            orms = []
            for orm in aggr._resourceMaps_:
                if orm != rem:
                    self.make_link(root, 'alternate', orm.uri, g)
                    orms.append(orm.uri)
            for t in aggr._ore.isDescribedBy:
                # check not in orms
                if t not in orms:
                    self.make_link(root, 'alternate', t, g)

            self.generate_rdf(root, aggr)

            ## ReM Info
            self.make_link(root, 'self', rem.uri, g)

            e = SubElement(root, 'updated')
            e.text = now()

            # ReM Author
            if rem._dcterms.creator:
                uri = rem._dcterms.creator[0]
                e = SubElement(root, 'generator', uri=str(uri))
                agent = rem._agents_[uri]
                n = agent._foaf.name[0]
                e.text = str(n)
                self.done_triples.append(
                    (uri, namespaces['foaf']['name'], n))

            # if no logo, put in nice ORE icon
            e = SubElement(root, 'icon')
            if aggr._foaf.logo:
                e.text = str(aggr._foaf.logo[0])
            elif rem._foaf.logo:
                e.text = str(rem._foaf.logo[0])
            else:
                e.text = "http://www.openarchives.org/ore/logos/ore_icon.png"

            if rem._dc.rights:
                e = SubElement(root, 'rights')
                e.text = rem._dc.rights[0]

            self.generate_rdf(root, rem)

            ## Process Entries
            for (res, proxy) in aggr._resources_:
                entry = SubElement(root, 'entry')

                e = SubElement(entry, 'id')
                if proxy:
                    e.text = str(proxy.uri)
                else:
                    e.text = "urn:uuid:%s" % gen_uuid()
                e = SubElement(entry, 'link', rel="alternate",
                               href=str(res.uri))
                # type = dc:format
                fmt = list(g.objects(res.uri, namespaces['dc']['format']))
                if fmt:
                    e.set('type', str(fmt[0]))

                if not res._dc.title:
                    raise ValueError(
                        "All entries must have a title for ATOM serialisation")
                e = SubElement(entry, 'title')
                e.text = str(res._dc.title[0])
                for t in res._rdf.type:
                    add_category(entry, t)
                for a in res._dcterms.creator:
                    e = SubElement(entry, 'author')
                    agent = res._agents_[a]
                    self.make_agent(e, agent)
                for a in res._dcterms.contributor:
                    e = SubElement(entry, 'contributor')
                    agent = res._agents_[a]
                    self.make_agent(e, agent)
                if res._dcterms.abstract:
                    e = SubElement(entry, 'summary')
                    e.text = str(res._dcterms.abstract[0])

                # Not sure about this at object level?
                for oa in res._ore.isAggregatedBy:
                    if oa != aggr._uri_:
                        e = SubElement(entry, 'link', rel="related",
                                       href=str(oa))

                e = SubElement(entry, 'updated')
                e.text = now()

                if proxy and proxy._ore.lineage:
                    e = SubElement(entry,
                                   'link',
                                   rel="via",
                                   href=str(proxy._ore.lineage[0]))
                # The proxy is exposed on the resource only for the duration
                # of the generate_rdf call.
                res._currProxy_ = proxy
                self.generate_rdf(entry, res)
                res._currProxy_ = None

            data = etree.tostring(root)
            data = data.replace('\n', '')
            data = self.spacesub.sub('', data)
            uri = str(rem._uri_)
        finally:
            # Always clear the per-run bookkeeping, including when
            # serialisation fails part-way, so stale entries cannot leak into
            # the next call.
            self.done_triples = []

        return ReMDocument(uri, data)
Example #4
0
    def serialize(self, rem, page=-1):
        """Serialise a resource map as a single Atom ``entry`` document.

        Aggregation properties are mapped onto Atom elements (id, title,
        author, contributor, category, summary, link), resource map
        properties onto published/updated/rights/source, and each aggregated
        resource onto an ``ore:aggregates`` link.  Every triple expressed
        this way is recorded in ``self.done_triples``; finally
        ``self.generate_rdf`` is called to fill an ``ore:triples`` element
        from the merged graph.

        :param rem: resource map to serialise; its aggregation is read from
            ``rem._aggregation_``.
        :param page: accepted for interface compatibility; this method does
            not use it.
        :returns: ``ReMDocument`` with ``format='atom'`` and this
            serialiser's ``mimeType``.
        :raises OreException: if the aggregation has neither ``dc:title``
            nor ``dcterms:title``.
        """
        aggr = rem._aggregation_
        try:
            g = self.merge_graphs(rem)
            atom_ns = namespaces['atom']

            # make nsmap better
            nm = g.namespace_manager
            nsmap = {'atom': str(atom_ns)}
            poss = uniq(g.predicates()) + uniq(g.objects(None, RDF.type))
            for pred in poss:
                pf, ns, _local = nm.compute_qname(pred)
                nsmap[pf] = ns

            root = Element("{%s}entry" % atom_ns, nsmap=nsmap)

            # entry/id == tag for entry == ReM dc:identifier
            # if not exist, generate Yet Another uuid
            e = SubElement(root, '{%s}id' % atom_ns)
            if rem._dc.identifier:
                dcid = rem._dc.identifier[0]
                e.text = str(dcid)
                self.done_triples.append(
                    (rem._uri_, namespaces['dc']['identifier'], dcid))
            else:
                e.text = "urn:uuid:%s" % gen_uuid()

            # entry/title == Aggr's dc:title (falling back to dcterms:title)
            title = aggr._dc.title
            tns = 'dc'
            if not title:
                title = aggr._dcterms.title
                tns = 'dcterms'
            if not title:
                raise OreException(
                    "Atom Serialisation requires title on aggregation")
            e = SubElement(root, '{%s}title' % atom_ns)
            dctit = title[0]
            e.text = str(dctit)
            self.done_triples.append(
                (aggr._uri_, namespaces[tns]['title'], dctit))

            # entry/author == Aggr's dcterms:creator
            for who in aggr._dcterms.creator:
                e = SubElement(root, '{%s}author' % atom_ns)
                agent = aggr._agents_[who]
                self.make_agent(e, agent)
                self.done_triples.append(
                    (aggr._uri_, namespaces['dcterms']['creator'],
                     agent._uri_))

            # entry/contributor == Aggr's dcterms:contributor
            for bn in aggr._dcterms.contributor:
                e = SubElement(root, '{%s}contributor' % atom_ns)
                # Look the agent up by this loop's own variable, not by the
                # variable left over from the creator loop.
                agent = aggr._agents_[bn]
                self.make_agent(e, agent)
                self.done_triples.append(
                    (aggr._uri_, namespaces['dcterms']['contributor'],
                     agent._uri_))

            # entry/category[@scheme="(magic)"][@term="(datetime)"]
            for t in aggr._dcterms.created:
                t = t.strip()
                SubElement(
                    root,
                    '{%s}category' % atom_ns,
                    term=str(t),
                    scheme="http://www.openarchives.org/ore/terms/datetime/created"
                )
            for t in aggr._dcterms.modified:
                t = t.strip()
                SubElement(
                    root,
                    '{%s}category' % atom_ns,
                    term=str(t),
                    scheme="http://www.openarchives.org/ore/terms/datetime/modified"
                )

            # entry/category == Aggr's rdf:type
            for t in aggr._rdf.type:
                e = SubElement(root, '{%s}category' % atom_ns, term=str(t))
                # scheme and label are optional decorations: failing to
                # produce one is deliberately ignored.
                try:
                    scheme = list(
                        g.objects(t, namespaces['rdfs']['isDefinedBy']))[0]
                    e.set('scheme', str(scheme))
                    self.done_triples.append(
                        (t, namespaces['rdfs']['isDefinedBy'], scheme))
                except Exception:
                    pass
                try:
                    label = list(g.objects(t, namespaces['rdfs']['label']))[0]
                    e.set('label', str(label))
                    self.done_triples.append(
                        (t, namespaces['rdfs']['label'], label))
                except Exception:
                    pass
                self.done_triples.append(
                    (aggr._uri_, namespaces['rdf']['type'], t))

            # entry/summary
            if aggr._dc.description:
                e = SubElement(root, '{%s}summary' % atom_ns)
                desc = aggr._dc.description[0]
                e.text = str(desc)
                self.done_triples.append(
                    (aggr._uri_, namespaces['dc']['description'], desc))

            # All aggr links: predicates already expressed above are skipped.
            done = [
                namespaces['rdf']['type'], namespaces['ore']['aggregates'],
                namespaces['dcterms']['creator'],
                namespaces['dcterms']['contributor'],
                namespaces['dc']['title'],
                namespaces['dc']['description']
            ]
            for (p, o) in g.predicate_objects(aggr.uri):
                if p not in done and isinstance(o, URIRef):
                    self.make_link(root, p, o, g)
                    self.done_triples.append((aggr._uri_, p, o))

            # entry/content   //  link[@rel="alternate"]
            # Do we have a splash page?
            altDone = 0
            # Same accessor as every other rdf:type lookup in this method.
            atypes = aggr._rdf.type
            possAlts = []
            for (r, p) in aggr.resources:
                mytypes = r._rdf.type
                if namespaces['eurepo']['humanStartPage'] in mytypes:
                    altDone = 1
                    self.make_link(root, 'alternate', r.uri, g)
                    break
                # check if share non Aggregation type
                # eg aggr == article and aggres == article, likely
                # to be good alternate
                for m in mytypes:
                    if m != namespaces['ore']['Aggregation'] and m in atypes:
                        possAlts.append(r.uri)

            if not altDone and possAlts:
                # XXX more intelligent algorithm here
                # Same rel value as the splash-page alternate link above.
                self.make_link(root, 'alternate', possAlts[0], g)
                altDone = 1

            # NOTE(review): the "No Content" element is also emitted when an
            # alternate link was produced above -- confirm this is intended.
            e = SubElement(root, '{%s}content' % atom_ns)
            e.set('type', 'html')
            if not altDone and build_html_atom_content:
                # make some representative html
                # this can get VERY LONG so default to not doing this
                html = ['<ul>']
                for (r, p) in aggr.resources:
                    html.append('<li><a href="%s">%s</a></li>' %
                                (r.uri, r.title[0]))
                html.append('</ul>')
                e.text = '\n'.join(html)
            else:
                e.text = "No Content"

            # entry/link[@rel='self'] == URI-R
            self.make_link(root, 'self', rem._uri_, g)
            # entry/link[@rel='ore:describes'] == URI-A
            self.make_link(root, namespaces['ore']['describes'],
                           aggr._uri_, g)

            ### These are generated automatically in merge_graphs

            # entry/published == ReM's dcterms:created
            if rem._dcterms.created:
                e = SubElement(root, '{%s}published' % atom_ns)
                c = rem._dcterms.created[0]
                md = str(c)
                if md.find('Z') == -1:
                    # append Z
                    md += "Z"
                e.text = md
                self.done_triples.append(
                    (rem._uri_, namespaces['dcterms']['created'], c))

            # entry/updated == ReM's dcterms:modified
            e = SubElement(root, '{%s}updated' % atom_ns)
            if rem._dcterms.modified:
                c = rem._dcterms.modified[0]
                md = str(c)
                if md.find('Z') == -1:
                    # append Z
                    md += "Z"
                e.text = str(md)

                self.done_triples.append(
                    (rem._uri_, namespaces['dcterms']['modified'], c))
            else:
                e.text = now()

            # entry/rights == ReM's dc:rights
            if rem._dc.rights:
                e = SubElement(root, '{%s}rights' % atom_ns)
                r = rem._dc.rights[0]
                e.text = str(r)
                self.done_triples.append(
                    (rem._uri_, namespaces['dc']['rights'], r))

            # entry/source/author == ReM's dcterms:creator
            if rem._dcterms.creator:
                # Should at least be our generator! (right?)
                src = SubElement(root, '{%s}source' % atom_ns)
                for who in rem._dcterms.creator:
                    e = SubElement(src, '{%s}author' % atom_ns)
                    agent = rem._agents_[who]
                    self.make_agent(e, agent)
                    self.done_triples.append(
                        (rem._uri_, namespaces['dcterms']['creator'],
                         agent._uri_))
                for who in rem._dcterms.contributor:
                    e = SubElement(src, '{%s}contributor' % atom_ns)
                    agent = rem._agents_[who]
                    self.make_agent(e, agent)
                    self.done_triples.append(
                        (rem._uri_, namespaces['dcterms']['contributor'],
                         agent._uri_))
                e = SubElement(src,
                               '{%s}generator' % atom_ns,
                               uri=str(libraryUri),
                               version=str(libraryVersion))
                e.text = str(libraryName)

            # Remove aggregation, resource map props already done
            # All of agg res needs to be done

            for (r, p) in aggr.resources:
                self.make_link(root, namespaces['ore']['aggregates'],
                               r.uri, g)
                self.done_triples.append(
                    (aggr._uri_, namespaces['ore']['aggregates'], r._uri_))

            # Now create ore:triples
            # and populate with rdf/xml

            trips = SubElement(root, '{%s}triples' % namespaces['ore'])
            self.generate_rdf(trips, g)

            data = etree.tostring(root, pretty_print=True)
            uri = str(rem._uri_)
        finally:
            # Always clear the per-run bookkeeping, including when
            # serialisation fails part-way, so stale entries cannot leak into
            # the next call.
            self.done_triples = []

        return ReMDocument(uri, data, format='atom', mimeType=self.mimeType)
Example #5
0
    def parse(self, doc):
        """Build a ``ResourceMap`` from an Atom entry document.

        ``doc.data`` is parsed as XML.  The ``self`` link gives the resource
        map URI and the ``ore:describes`` link the aggregation URI; title,
        authors, contributors, categories, links and summary are applied to
        the aggregation, while id, updated, published, rights, license and
        the ``atom:source`` people are applied to the resource map.  Each
        ``rdf:Description`` under ``ore:triples`` is handed to
        ``self.handle_rdf`` for the object it is about, or for the
        aggregation when that object is not known.

        :param doc: document object whose ``data`` attribute holds the XML.
        :returns: the populated ``ResourceMap``.
        :raises OreException: if the self link, describes link, title or id
            is missing, or -- in strict mode -- if ``atom:updated`` is
            missing.
        """
        root = etree.fromstring(doc.data)
        self.curr_root = root
        self.all_objects = {}

        def required(path, what):
            # First XPath match, or a descriptive OreException rather than
            # a bare IndexError when the element is absent.
            found = root.xpath(path, namespaces=namespaces)
            if not found:
                raise OreException(
                    "Atom entry does not have mandatory %s" % what)
            return found[0]

        # first construct aggr and rem
        uri_a = required("/atom:entry/atom:link[@rel='http://www.openarchives.org/ore/terms/describes']/@href", "ore:describes link")
        uri_r = required("/atom:entry/atom:link[@rel='self']/@href", "self link")

        rem = ResourceMap(uri_r)
        aggr = Aggregation(uri_a)
        rem.set_aggregation(aggr)

        self.all_objects[rem._uri_] = rem
        self.all_objects[aggr._uri_] = aggr

        # Aggregation Info
        aggr._dc.title = required("/atom:entry/atom:title/text()", "title")

        for auth in root.xpath('/atom:entry/atom:author', namespaces=namespaces):
            self.handle_person(auth, aggr, 'creator')
        for auth in root.xpath('/atom:entry/atom:contributor', namespaces=namespaces):
            self.handle_person(auth, aggr, 'contributor')
        for cat in root.xpath('/atom:entry/atom:category', namespaces=namespaces):
            self.handle_category(cat, aggr)
        for link in root.xpath('/atom:entry/atom:link', namespaces=namespaces):
            self.handle_link(link, aggr)

        summary = root.xpath("/atom:entry/atom:summary/text()", namespaces=namespaces)
        if summary:
            aggr._dc.description = summary[0]

        # Resource Map Info
        at = ArbitraryResource(required("/atom:entry/atom:id/text()", "id"))
        at._dcterms.hasVersion = rem._uri_
        rem.add_triple(at)
        self.all_objects[at._uri_] = at

        updated = root.xpath("/atom:entry/atom:updated/text()", namespaces=namespaces)
        if updated:
            rem._dcterms.modified = updated[0]
        elif self.strict:
            raise OreException("Graph does not have mandatory 'ResourceMap dcterms:modified timestamp' triple")

        published = root.xpath("/atom:entry/atom:published/text()", namespaces=namespaces)
        if published:
            rem._dcterms.created = published[0]

        rights = root.xpath("/atom:entry/atom:rights/text()", namespaces=namespaces)
        if rights:
            rem._dc.rights = rights[0]

        lic = root.xpath("/atom:entry/atom:link[@rel='license']/@href", namespaces=namespaces)
        if lic:
            rem._dcterms.rights = URIRef(lic[0])

        for rauth in root.xpath('/atom:entry/atom:source/atom:author', namespaces=namespaces):
            self.handle_person(rauth, rem, 'creator')

        for rauth in root.xpath('/atom:entry/atom:source/atom:contributor', namespaces=namespaces):
            self.handle_person(rauth, rem, 'contributor')

        about_attr = '{%s}about' % namespaces['rdf']
        node_attr = '{%s}nodeID' % namespaces['rdf']
        for rdf in root.xpath('/atom:entry/ore:triples/rdf:Description', namespaces=namespaces):
            if about_attr in rdf.attrib:
                about = URIRef(rdf.attrib[about_attr])
            else:
                # probably a blank node; a description carrying neither
                # attribute still fails with KeyError.
                about = BNode(rdf.attrib[node_attr])
            if about in self.all_objects:
                self.handle_rdf(rdf, self.all_objects[about])
            else:
                self.handle_rdf(rdf, aggr)
        self.all_objects = {}

        return rem
Example #6
0
    def process_graph(self, graph):
        """Split an RDF graph into ORE objects and return the resource map.

        The single ``ore:describes`` triple identifies the resource map and
        the aggregation.  Aggregated resources (with their proxies, when
        present) and agents are then created and populated through
        ``self.set_fields``.  Every remaining subject becomes an
        ``ArbitraryResource`` that is attached to the nearest resource map,
        aggregation, aggregated resource or proxy found by walking incoming
        triples breadth first.  Subjects that cannot be linked are handled
        according to the module-level ``unconnectedAction`` setting:
        ``'ignore'`` attaches them to the resource map, ``'warn'`` prints a
        message, ``'raise'`` raises ``OreException``; any other value leaves
        them unattached.

        :param graph: the RDF graph to process.
        :returns: the populated ``ResourceMap``.
        :raises OreException: if the ``ore:describes`` triple is missing, if
            a strict-mode check fails, or for an unconnected subject when
            ``unconnectedAction`` is ``'raise'``.
        """
        # take graph and find objects, split up stuff into graph
        # Find ReM/Aggr
        lres = list(graph.query("PREFIX ore: <%s> SELECT ?a ?b WHERE {?a ore:describes ?b .}" % namespaces['ore']))
        try:
            uri_r = lres[0][0]
            uri_a = lres[0][1]
        except IndexError:
            raise OreException("Graph does not have mandatory ore:describes triple")
        if self.strict and len(lres) != 1:
            raise OreException("Graph must contain exactly one ore:describes triple")

        if self.strict and not protocolUriRe.match(uri_r):
            raise OreException("Resource Map URI must be protocol-based URI: %s" % uri_r)
        if self.strict and not protocolUriRe.match(uri_a):
            raise OreException("Aggregation URI must be protocol-based URI: %s" % uri_a)

        remc = list(graph.query("PREFIX dcterms: <%s> SELECT ?a WHERE { <%s> dcterms:creator ?a .}" % (namespaces['dcterms'], uri_r)))
        if self.strict and not remc:
            raise OreException("Graph does not have mandatory 'ResourceMap dcterms:creator ?x' triple")

        remc = list(graph.query("PREFIX dcterms: <%s> SELECT ?a WHERE { <%s> dcterms:modified ?a .}" % (namespaces['dcterms'], uri_r)))
        if self.strict and not remc:
            raise OreException("Graph does not have mandatory 'ResourceMap dcterms:modified timestamp' triple")

        rem = ResourceMap(uri_r)
        aggr = Aggregation(uri_a)
        rem.set_aggregation(aggr)
        self.set_fields(rem, graph)
        self.set_fields(aggr, graph)

        # ``things`` holds the core ORE objects that arbitrary resources and
        # agents may be attached to.
        things = {uri_r: rem, uri_a: aggr}

        res2 = graph.query("PREFIX ore: <http://www.openarchives.org/ore/terms/> SELECT ?b WHERE {<%s> ore:aggregates ?b .}" % uri_a)
        for uri_ar in res2:
            uri_ar = uri_ar[0]
            if self.strict and not protocolUriRe.match(uri_ar):
                raise OreException("Aggregated Resource URI must be protocol-based URI: %s" % uri_ar)

            res = AggregatedResource(uri_ar)
            things[uri_ar] = res
            proxy = list(graph.query("PREFIX ore: <http://www.openarchives.org/ore/terms/> SELECT ?a WHERE {?a ore:proxyFor <%s> .}" % uri_ar))
            # Test for a proxy explicitly rather than catching IndexError
            # around the whole branch: an IndexError raised inside one of
            # these calls must not add the resource a second time.
            if proxy:
                uri_p = proxy[0][0]
                p = Proxy(uri_p)
                p.set_forIn(res, aggr)
                things[uri_p] = p
                aggr.add_resource(res, p)
                self.set_fields(res, graph)
                self.set_fields(p, graph)
            else:
                aggr.add_resource(res, None)
                self.set_fields(res, graph)

        allThings = things.copy()

        agents = list(graph.query("PREFIX foaf: <%s> PREFIX dcterms: <%s> SELECT ?a WHERE { { ?a foaf:name ?b } UNION { ?a foaf:mbox ?b } UNION { ?b dcterms:creator ?a } UNION { ?b dcterms:contributor ?a } }" % (namespaces['foaf'], namespaces['dcterms'])))
        for a_uri in agents:
            a_uri = a_uri[0]
            a = Agent(a_uri)
            allThings[a_uri] = a
            self.set_fields(a, graph)
            for (subj, pred) in graph.subject_predicates(URIRef(a_uri)):
                if subj in things:
                    # direct manipulation, as will have already added predicate in set_fields
                    what = things[subj]
                    what._agents_[a_uri] = a

        # rem and aggr will have default rdf:type triples already
        allThings.update(rem.triples)
        allThings.update(aggr.triples)

        for subj in graph.subjects():
            if subj in allThings:
                continue

            # triple needed
            ar = ArbitraryResource(subj)
            allThings[subj] = ar
            # find our graph
            for (pred, obj) in graph.predicate_objects(subj):
                ar.graph.add((subj, pred, obj))

            # find shortest distance to main object to link to main graph
            # Breadth First Search over the subjects of incoming triples.
            # Each candidate is queued at most once.
            found = 0
            queued = set()
            tocheck = []
            for (parent, pred) in graph.subject_predicates(subj):
                if parent not in queued:
                    queued.add(parent)
                    tocheck.append(parent)
            while tocheck:
                subsubj = tocheck.pop(0)
                if subsubj in things:
                    things[subsubj]._triples_[ar.uri] = ar
                    found = 1
                    break
                # Not a core object: carry on with the nodes pointing at it.
                for (parent, pred) in graph.subject_predicates(subsubj):
                    if parent not in queued:
                        queued.add(parent)
                        tocheck.append(parent)

            if not found:
                if unconnectedAction == 'ignore':
                    # Input graph is not connected!
                    rem._triples_[ar.uri] = ar
                elif unconnectedAction == 'warn':
                    print("Input Graph Not Connected at: %s" % subj)
                elif unconnectedAction == 'raise':
                    raise OreException("Input Graph Not Connected at: %s" % subj)

        return rem