def test_related_titles(self):
    # if two titles, one with a type and the other with an id, first should
    # be part of the second
    # sample from hughes-edwards1145
    two_titles = '''<c01 level="file" xmlns="%s">
  <did>
    <container type="box">1</container>
    <container type="folder">6</container>
    <unittitle><title type="poetry" render="doublequote">Mother to Son</title>, poem in <title source="oclc" authfilenumber="10870853">The People's Voice</title>, May 9, 1942 </unittitle>
  </did>
</c01>''' % EAD_NAMESPACE
    g = self._render_item_to_rdf(two_titles)

    # there should be a manuscript in the output
    ms_triples = list(g.triples((None, rdflib.RDF.type, self.BIBO.Manuscript)))
    self.assertTrue(
        ms_triples,
        'RDFa output should include an item with type bibo:Manuscript')

    # subject of the first matching triple is the manuscript (poem) node
    ms_node = ms_triples[0][0]
    self.assertEqual(
        u'Mother to Son', unicode(g.value(ms_node, self.DC.title)),
        'poem title should be set as dc:title')
    self.assertEqual(
        u'poetry', unicode(g.value(ms_node, self.SCHEMA_ORG.genre)),
        'genre should be set as poetry')

    book_uriref = rdflib.URIRef(title_rdf_identifier('oclc', '10870853'))
    self.assertIn(
        (ms_node, self.DC.isPartOf, book_uriref), g,
        'poem should be part of document with an id')
    # membership must be tested against the graph; a bare tuple is
    # always truthy and would make this assertion vacuous
    self.assertIn(
        (book_uriref, rdflib.RDF.type, self.BIBO.Document), g,
        'document with id should be a bibo:Document')
    # g.triples() returns a generator, which is truthy even when empty,
    # so materialize it before checking that at least one triple matched
    self.assertTrue(
        list(g.triples((None, self.SCHEMA_ORG.mentions, book_uriref))),
        'document with id should be mentioned by context (i.e. collection)')
def test_extra_titles(self):
    # more than two titles in a unittitle
    # sample from hughes-edwards1145
    multiple_titles = '''<c01 level="file" xmlns="%s">
  <did>
    <container type="box">1</container>
    <container type="folder">7</container>
    <unittitle><title type="essays" render="doublequote">The Need for Heroes</title>, essay in <title source="ISSN" authfilenumber=" 2169-1010">The Crisis</title>, June 1941 [includes pages in which poems <title type="poetry" render="doublequote">The Negro Speaks of Rivers</title> and <title type="poetry" render="doublequote">NAACP</title> appear] </unittitle>
  </did>
</c01>''' % EAD_NAMESPACE
    g = self._render_item_to_rdf(multiple_titles)

    # first two titles should work roughly as related title does above

    # there should be *three* manuscripts in the output
    ms_triples = list(g.triples((None, rdflib.RDF.type, self.BIBO.Manuscript)))
    self.assertEqual(
        3, len(ms_triples),
        'RDFa output should include three items with type bibo:Manuscript')

    # since rdf is unsorted, we have to find them by title
    key_for_title = {
        u'The Need for Heroes': 'Need',
        u'The Negro Speaks of Rivers': 'Negro',
        u'NAACP': 'NAACP',
    }
    ms_by_title = {}
    for subject, _, _ in ms_triples:
        key = key_for_title.get(unicode(g.value(subject, self.DC.title)))
        if key is not None:
            ms_by_title[key] = subject

    self.assertEqual(
        u'essays',
        unicode(g.value(ms_by_title['Need'], self.SCHEMA_ORG.genre)),
        'essay title genre should be set as essays')
    self.assertEqual(
        u'poetry',
        unicode(g.value(ms_by_title['Negro'], self.SCHEMA_ORG.genre)),
        'poem title genre should be set as poetry')
    self.assertEqual(
        u'poetry',
        unicode(g.value(ms_by_title['NAACP'], self.SCHEMA_ORG.genre)),
        'poem title genre should be set as poetry')

    book_uriref = rdflib.URIRef(title_rdf_identifier('issn', '2169-1010'))

    # things that should be consistent for each of the typed titles
    for ms_node in ms_by_title.values():
        # each item should be part of the book with title
        self.assertIn(
            (ms_node, self.DC.isPartOf, book_uriref), g,
            'item should be part of document with an id')
        # each item should be mentioned in context (i.e. collection);
        # g.triples() returns a generator, which is truthy even when
        # empty, so materialize it before asserting
        self.assertTrue(
            list(g.triples((None, self.SCHEMA_ORG.mentions, ms_node))),
            'item should be mentioned by context (i.e. collection)')

    # membership must be tested against the graph; a bare tuple is
    # always truthy and would make this assertion vacuous
    self.assertIn(
        (book_uriref, rdflib.RDF.type, self.BIBO.Periodical), g,
        'document with issn should be a bibo:Periodical')
    self.assertTrue(
        list(g.triples((None, self.SCHEMA_ORG.mentions, book_uriref))),
        'document with id should be mentioned by context (i.e. collection)')
def format_title(node, default_rel):
    """Display a title node as semantic information.

    Builds the RDFa markup that should wrap the text of an EAD ``title``
    element, based on the title's ``type``, ``source`` and
    ``authfilenumber`` attributes and on where the title occurs in the
    document.

    :param node: the title element; must provide ``get()`` for attribute
        access and ``xpath()`` accepting the module-level ``eadns``
        keyword arguments
    :param default_rel: relation in effect for the surrounding context.
        ``"schema:knows arch:correspondedWith"`` suppresses all output;
        ``"schema:mentions"`` wraps the title as a stand-alone mentioned
        item.
    :returns: tuple of ``(start, end)`` strings to output before and
        after the title text; both are empty strings when no RDFa should
        be generated for this title
    """
    title_type = node.get("type", None)
    title_source = node.get("source", None)
    title_authfileno = node.get("authfilenumber", None)
    # lower-case title source for consistency of checking (isbn/issn/oclc)
    if title_source is not None:
        title_source = title_source.strip().lower()
    # strip space from authfileno, to avoid any issues due to human error
    if title_authfileno is not None:
        title_authfileno = title_authfileno.strip()
    start, end = "", ""

    # special case we can't do anything with
    # if a title is inside the bioghist, we can assume it was created by the
    # originator, *however* there is no inverse relationship to specify
    # a title was created by a person.  We also can't assume
    # any relation to the collection.  So, skip these titles for now.
    if node.xpath("ancestor::e:bioghist", **eadns):
        return (start, end)

    # similar special case: if a title is inside a series scopecontent note
    # which is related to a series unittitle person (see note on
    # series_section_rdfa), do not generate any RDFa for that title
    if node.xpath("ancestor::e:scopecontent/preceding-sibling::e:did/e:unittitle[e:corpname or e:persname]", **eadns):
        return (start, end)

    # for now, ignore titles in correspondence series
    # (getting associated with the person inappropriately)
    if default_rel == "schema:knows arch:correspondedWith":
        return start, end

    # if isbn or issn is available, include it
    meta_tags = ""
    if title_source in ("isbn", "issn") and title_authfileno is not None:
        meta_tags += '<meta property="schema:%s" content="%s"/>' % (title_source, title_authfileno)

    # title attribute carries genre information
    if title_type is not None:
        meta_tags += '<meta property="schema:genre" content="%s"/>' % title_type

    # generate URI/URN for item when possible
    # TODO: abstract into reusable function
    resource_id = None
    if title_authfileno is not None:
        resource_id = title_rdf_identifier(title_source, title_authfileno)
    # resource attribute for inclusion
    resource = ' resource="%s"' % resource_id if resource_id else ""

    # if title is inside the scopecontent, it needs to be wrapped as a document
    # just use the generic "mentions" relation
    if node.xpath("ancestor::e:scopecontent", **eadns):
        # mark as a generic document or periodical and include whatever
        # meta tags are available
        itemtype = "bibo:Periodical" if title_source == "issn" else "bibo:Document"
        start = '<span rel="schema:mentions" typeof="%s"%s><span property="dc:title">' % (itemtype, resource)
        end = "</span>%s</span>" % meta_tags

    # if default rel is set to mention, assume we are patching in extra titles
    # after file item unittitle context
    elif default_rel == "schema:mentions":
        if title_source == "issn":
            itemtype = "bibo:Periodical"
        elif title_source in ("isbn", "oclc"):
            itemtype = "bibo:Document"
        else:
            # technically we should be checking if this is a printed
            # materials series...
            itemtype = "bibo:Manuscript"

        # if this title has a type but no authfilenumber and there is a
        # sibling title with an id, relate them
        if title_type is not None and title_authfileno is None:
            identified = node.xpath("parent::e:unittitle/e:title[@authfilenumber]", **eadns)
            if identified:
                # read both attributes from the *same* sibling title so the
                # identifier never combines one title's source with a
                # different title's authfilenumber
                rel_title = identified[0]
                rel_authfileno = rel_title.xpath("normalize-space(@authfilenumber)")
                rel_source = rel_title.xpath("normalize-space(@source)").lower()
                rel_id = title_rdf_identifier(rel_source, rel_authfileno)
                if rel_id is not None:
                    meta_tags += '<span property="dcterms:isPartOf" resource="%s"/>' % rel_id

        start = '<span rel="schema:mentions" typeof="%s"%s><span property="dc:title">' % (itemtype, resource)
        end = "</span>%s</span>" % meta_tags

    # Otherwise, only add semantic information if there is a title type OR
    # if title occurs in a file-level unittitle.
    # (in that case, we assume it is title of the item in the container)
    elif node.xpath('parent::e:unittitle and ancestor::e:*[@level="file"]', **eadns) or title_type is not None:
        # include meta tags after the title, since it should be in the
        # context of the item, which is the whole unittitle
        start, end = '<span property="dc:title">', "</span>%s" % meta_tags

        preceding_titles = node.xpath("count(preceding-sibling::e:title)", **eadns)

        # check for special case: multiple titles with an author
        # (persname tagged with a role, i.e. this is a Belfast Group sheet)
        multiple_with_author = (
            title_type is None
            and node.xpath("count(parent::e:unittitle/e:title)", **eadns) > 1
            and node.xpath("preceding-sibling::e:persname[@role]", **eadns)
        )

        # special case: no good way to relate more than two titles in a
        # unittitle, so just skip them when generating rdfa
        if preceding_titles >= 2 and not multiple_with_author:
            start, end = "", ""

        # if ISSN with preceding title, assume article in a periodical
        elif title_source == "issn" and preceding_titles == 1:
            # adapted from schema.org article example: http://schema.org/Article
            start = '<span property="dcterms:isPartOf" typeof="bibo:Periodical"%s><span property="dc:title">' % resource
            # include any meta tags (genre, issn) inside the periodical entity
            end = "</span>%s</span>" % meta_tags

        # otherwise, if current title has an id or no type and follows a
        # title with a type, assume generic part/whole relationship
        elif (title_authfileno is not None or title_type is None) and node.xpath(
            "count(./preceding-sibling::e:title[@type])", **eadns
        ) == 1:
            start = '<span property="dcterms:isPartOf" typeof="bibo:Document"%s><span property="dc:title">' % resource
            # include any meta tags (e.g. isbn) inside the document entity
            end = "</span>%s</span>" % meta_tags

        # if no type and there are multiple titles, AND there is a persname
        # tagged with a role (i.e. this is a Belfast Group sheet),
        # then use RDFa list notation to generate a sequence
        elif multiple_with_author:
            start = '<span inlist="inlist" property="dc:title">'

    return (start, end)