def index_aggregate(a):
    """Build a Xapian document for the aggregate graph *a*.

    Returns a ``(docid, doc)`` pair where ``docid`` is the unique
    ``URI``-prefixed term identifying the document and ``doc`` is the
    populated :class:`xapian.Document`.
    """
    doc = xapian.Document()
    # store the aggregate URI both as a value slot and as a unique id term
    doc.add_value(VAL_URI, a.identifier)
    docid = u"URI" + a.identifier
    doc.add_term(docid)
    log.debug("Aggregate: %s" % a.identifier)

    def add_value(g, val_id, subject, predicate):
        # Collect matching literal objects (English / untagged only) into a
        # single comma-joined value slot.  Returns the joined string, or the
        # empty list (falsy) when nothing matched.
        val = []
        for s, p, o in g.triples((subject, predicate, None)):
            if not o.language or o.language == "en":  ### TODO: fix this
                val.append(o)
        if val:
            val = u", ".join(val)
            doc.add_value(val_id, val)
        return val

    ## create an abbreviated graph to store in the xapian database
    extract = Graph()
    add_value(a, VAL_LABEL, a.identifier, RDFS.label)
    for g in a.contexts():
        log.debug("Indexing: %s" % g.identifier)
        # copy a small set of descriptive statements into the extract
        for pred in (RDF.type, RDFS.label, RDFS.comment, DC.title,
                     DC.description, FOAF.name):
            for statement in a.triples((g.identifier, pred, None)):
                extract.add(statement)
        # boolean-style terms for title (ZT) and name (NA), truncated to
        # keep within xapian's term-length limits
        title = add_value(g, VAL_TITLE, g.identifier, DC.title)
        if title:
            doc.add_term(u"ZT" + title[:160])
        name = add_value(g, VAL_NAME, g.identifier, FOAF.name)
        if name:
            doc.add_term(u"NA" + name[:160])
    doc.set_data(extract.serialize(format="n3"))

    ## take any fields that contain text, stem them according to their
    ## language (or english if unsupported or unspecified) and put them
    ## in the index
    termgen = xapian.TermGenerator()
    termgen.set_document(doc)
    for pred in (RDFS.label, RDFS.comment, DC.title, DC.description,
                 FOAF.name, FOAF.first_name, FOAF.last_name, FOAF.surname):
        for s, p, o in a.triples((None, pred, None)):
            # separate term positions so phrases don't span fields
            termgen.increase_termpos()
            if o.language:
                try:
                    stemmer = xapian.Stem(o.language)
                except xapian.InvalidArgumentError:
                    # unsupported language -> fall back to English stemming
                    stemmer = xapian.Stem("en")
            else:
                stemmer = xapian.Stem("en")
            termgen.set_stemmer(stemmer)
            termgen.index_text(o)
    return docid, doc
def create(self, data):
    """Create a fresh content graph from submitted form *data*."""
    content = Graph()
    # stamp the new object with this form's RDF type
    content.add((content.identifier, RDF['type'], self.rdftype))
    # apply the main form's changes first, then each sub-group's
    for source in [self] + list(self.groups):
        form.applyChanges(source, content, data)
    return content
def get(self, identifier):
    """Check out the graph named *identifier*.

    Simple check-out mechanism: repeated calls hand back the same
    cached copy until the graph is put back.
    """
    # EAFP: serve from the cache when the graph is already checked out
    try:
        return self._cache[identifier]
    except KeyError:
        pass
    source = self.store.get_context(identifier)
    # hand out a private copy so callers can't mutate the store directly
    checkout = Graph(identifier=identifier)
    for triple in source:
        checkout.add(triple)
    self._cache[identifier] = checkout
    return checkout
def work(self, marc):
    """Derive the Work graph for a MARC record and register it."""
    proc = self.process()
    proc.use(marc.identifier)
    work = Graph(identifier=URIRef(marc.identifier + "/work"))
    work.add((work.identifier, RDF["type"], OBP["Work"]))
    # carry over the simple descriptive properties
    for predicate in (DC["title"], DC["description"],
                      BIBO["lccn"], OBP["scn"]):
        work += self.rewrite(marc, work, predicate)
    # link out to each contributor's own graph
    for person in self.contributors(marc):
        work.add((work.identifier, DC["contributor"], person.identifier))
    for subj in self.subjects(marc):
        work.add((work.identifier, DC["subject"], subj.identifier))
        # person subjects live in their own graphs; merge the rest inline
        if not subj.exists((subj.identifier, RDF["type"], FOAF["Person"])):
            work += subj
    manif = self.manifestation(marc)
    work.add((work.identifier, OBP["hasManifestation"], manif.identifier))
    proc.result(work)
    self.context.add(work)
def contributors(self, marc):
    """Split each dc:contributor node into its own named graph."""
    graphs = []
    matches = marc.triples((marc.identifier, DC["contributor"], None))
    for n, (s, p, o) in enumerate(matches):
        proc = self.process()
        proc.use(marc.identifier)
        uri = URIRef(marc.identifier + "/contributor/%d" % n)
        person = Graph(identifier=uri)
        # lift the contributor's bnode closure, re-rooted at the new URI
        person += marc.bnc((o, None, None)).replace(
            (o, None, None), (uri, None, None))
        # default to foaf:Person when the record gives no type
        if not person.exists((uri, RDF["type"], None)):
            person.add((uri, RDF["type"], FOAF["Person"]))
        proc.result(person)
        self.context.add(person)
        graphs.append(person)
    return graphs
def manifestation(self, marc):
    """Derive the Manifestation graph for a MARC record."""
    proc = self.process()
    proc.use(marc.identifier)
    manif = Graph(identifier=URIRef(marc.identifier + "/manifestation"))
    manif.add((manif.identifier, RDF["type"], OBP["Manifestation"]))
    publisher = self.publisher(marc)
    manif.add((manif.identifier, DC["publisher"], publisher.identifier))
    # pull the publisher's dc:spatial coverage up onto the manifestation
    for _s, _p, pub in marc.triples(
            (marc.identifier, DC["publisher"], None)):
        for _ps, _pp, place in marc.triples((pub, DC["spatial"], None)):
            manif.add((manif.identifier, DC["spatial"], place))
    # carry over the identifier / physical-description properties
    for predicate in (BIBO["isbn"], BIBO["isbn10"], BIBO["isbn13"],
                      DC["date"], DC["extent"], OBP["dimensions"],
                      OBP["edition"], OBP["lccall"], OBP["nlmcall"],
                      OBP["nbn"], OBP["physicalDetail"], RDFS["seeAlso"]):
        manif += self.rewrite(marc, manif, predicate)
    proc.result(manif)
    self.context.add(manif)
    return manif
def subjects(self, marc):
    """Split each dc:subject of the MARC record into its own graph.

    Returns a list of graphs: anonymous rdf:value graphs for literal
    subjects, re-rooted per-record graphs for foaf:Person subjects, and
    graphs keyed on the subject URI for everything else.
    """
    result = []
    i = 0
    for s, p, o in marc.triples((marc.identifier, DC["subject"], None)):
        if isinstance(o, Literal):
            # plain-text subject: wrap it in an anonymous graph
            subject = Graph()
            subject.add((subject.identifier, RDF["value"], o))
            result.append(subject)
        elif marc.exists((o, RDF["type"], FOAF["Person"])):
            # person subject: give it its own named graph, re-rooted
            # from the record's bnode onto a stable URI
            proc = self.process()
            proc.use(marc.identifier)
            identifier = URIRef(marc.identifier + "/subject/%d" % i)
            subject = Graph(identifier=identifier)
            subject += marc.bnc((o, None, None)).replace(
                (o, None, None), (identifier, None, None))
            proc.result(subject)
            self.context.add(subject)
            # BUG FIX: person subjects were never appended, so callers
            # (e.g. work()) could not link them and their Person check
            # against the returned list never matched
            result.append(subject)
            i += 1
        else:
            subject = Graph(identifier=o)
            subject += marc.bnc((o, None, None))
            result.append(subject)
    return result
def rdf_data():
    """Yield RDF graphs describing the CC schema, lens, and each license.

    Generator: first the Creative Commons schema graph, then the packaged
    lens graph, then one graph per license reported by the service.
    """
    s = LicensesService2()
    g = Graph(identifier=CC[""])
    g.parse("http://creativecommons.org/schema.rdf")
    yield g
    fp = pkg_resources.resource_stream("licenses",
                                       os.path.join("n3", "license.n3"))
    g = Graph(identifier=LICENSES["lens"])
    g.parse(fp, format="n3")
    fp.close()
    yield g
    for ld in s.get_licenses():
        ident = LICENSES[ld["id"]]
        g = Graph(identifier=ident)
        l = License(ident, graph=g)
        l.label = Literal(ld["title"])
        l.prefLabel = Literal(ld["title"])
        l.notation = Literal(ld["id"])
        l.lens = LICENSES.lens
        if ld.get("url"):
            url = URIRef(ld["url"])
            sa = Graph()
            # best-effort fetch of the license page as RDF/XML and RDFa;
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit
            try:
                sa.parse(url)
            except Exception:
                pass
            try:
                sa.parse(url, format="rdfa")
            except Exception:
                pass
            # strip presentational noise picked up from the page
            sa.remove((url, XHV.icon, None))
            sa.remove((url, XHV.alternate, None))
            sa.remove((url, XHV.stylesheet, None))
            for ll in sa.distinct_objects(url, XHV.license):
                l.license = ll
            sa.remove((url, XHV.license, None))
            # re-root anything said about the page onto our identifier
            closure = sa.bnc((url, None, None))
            if closure:
                for _s, p, o in closure:
                    g.add((ident, p, o))
                l.sameAs = url
            else:
                l.seeAlso = URIRef(ld["url"])
        yield g
def rdf_data():
    """Yield RDF graphs describing the CC schema, lens, and each license.

    Generator: first the Creative Commons schema graph, then the packaged
    lens graph, then one graph per license reported by the service.
    """
    s = LicensesService2()
    g = Graph(identifier=CC[""])
    g.parse("http://creativecommons.org/schema.rdf")
    yield g
    fp = pkg_resources.resource_stream("licenses",
                                       os.path.join("n3", "license.n3"))
    g = Graph(identifier=LICENSES["lens"])
    g.parse(fp, format="n3")
    fp.close()
    yield g
    for ld in s.get_licenses():
        ident = LICENSES[ld["id"]]
        g = Graph(identifier=ident)
        l = License(ident, graph=g)
        l.label = Literal(ld["title"])
        l.prefLabel = Literal(ld["title"])
        l.notation = Literal(ld["id"])
        l.lens = LICENSES.lens
        if ld.get("url"):
            url = URIRef(ld["url"])
            sa = Graph()
            # best-effort fetch of the license page as RDF/XML and RDFa;
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit
            try:
                sa.parse(url)
            except Exception:
                pass
            try:
                sa.parse(url, format="rdfa")
            except Exception:
                pass
            # strip presentational noise picked up from the page
            sa.remove((url, XHV.icon, None))
            sa.remove((url, XHV.alternate, None))
            sa.remove((url, XHV.stylesheet, None))
            for ll in sa.distinct_objects(url, XHV.license):
                l.license = ll
            sa.remove((url, XHV.license, None))
            # re-root anything said about the page onto our identifier
            closure = sa.bnc((url, None, None))
            if closure:
                for _s, p, o in closure:
                    g.add((ident, p, o))
                l.sameAs = url
            else:
                l.seeAlso = URIRef(ld["url"])
        yield g
def rdf(self, *av, **kw):
    """Convert this MARC record (a nested dict) into an RDF graph.

    Keys are ``prefix:term`` strings resolved through ``namespaces``;
    nested dicts become blank nodes.  ISBN/ISSN/LCCN values additionally
    get rdfs:seeAlso links to well-known lookup URIs.
    """
    g = Graph(*av, **kw)
    g.add((g.identifier, RDF["type"], OBP["MarcRecord"]))

    def merge(d, s):
        # recursively translate dict *d* into triples rooted at *s*
        for k, v in d.items():
            ns, term = k.split(":")
            p = namespaces[ns][term]
            for o in v:
                if isinstance(o, dict):
                    b = BNode()
                    g.add((s, p, b))
                    merge(o, b)
                else:
                    g.add((s, p, o))

    ident = g.identifier
    merge(self, ident)
    # BUG FIX: materialize each match set with list() before adding to g —
    # adding triples while iterating the same graph's triples() can raise
    # "changed size during iteration" with rdflib's in-memory store
    for s, p, o in list(g.triples((ident, BIBO["isbn"], None))):
        g.add((ident, RDFS["seeAlso"], URIRef("urn:isbn:%s" % o)))
        g.add((ident, RDFS["seeAlso"],
               URIRef("http://purl.org/NET/book/isbn/%s#book" % o)))
        g.add((ident, RDFS["seeAlso"],
               URIRef("http://www4.wiwiss.fu-berlin.de/bookmashup/books/%s"
                      % o)))
        # classify by length into the specific isbn10/isbn13 properties
        if len(o) == 10:
            g.add((ident, BIBO["isbn10"], o))
        elif len(o) == 13:
            g.add((ident, BIBO["isbn13"], o))
    for s, p, o in list(g.triples((ident, BIBO["issn"], None))):
        g.add((ident, RDFS["seeAlso"], URIRef("urn:issn:%s" % o)))
    for s, p, o in list(g.triples((ident, BIBO["lccn"], None))):
        g.add((ident, RDFS["seeAlso"],
               URIRef(u"http://lccn.loc.gov/" + o)))
    self.nbn(g)
    self.scn(g)
    self.lccall(g)
    self.lccopy(g)
    self.isPartOf(g)
    return g