Ejemplo n.º 1
0
    def test_related_titles(self):
        # If a unittitle has two titles, one with a type and the other with
        # an id (source + authfilenumber), the typed title should be part of
        # the title with the id.
        # sample from hughes-edwards1145
        two_titles = '''<c01 level="file" xmlns="%s">
            <did>
                <container type="box">1</container>
                <container type="folder">6</container>
                <unittitle><title type="poetry" render="doublequote">Mother to Son</title>,
                        poem in
                        <title source="oclc" authfilenumber="10870853">The People's Voice</title>,
                        May 9, 1942
                </unittitle>
            </did>
        </c01>''' % EAD_NAMESPACE
        g = self._render_item_to_rdf(two_titles)

        # there should be a manuscript in the output
        ms_triples = list(g.triples((None, rdflib.RDF.type, self.BIBO.Manuscript)))
        self.assertTrue(ms_triples, 'RDFa output should include an item with type bibo:Manuscript')
        # subject of the first matching triple is the manuscript node
        ms_node = ms_triples[0][0]
        self.assertEqual(u'Mother to Son',
            unicode(g.value(ms_node, self.DC.title)),
            'poem title should be set as dc:title')
        self.assertEqual(u'poetry',
            unicode(g.value(ms_node, self.SCHEMA_ORG.genre)),
            'genre should be set as poetry')
        book_uriref = rdflib.URIRef(title_rdf_identifier('oclc', '10870853'))
        self.assertTrue((ms_node, self.DC.isPartOf, book_uriref) in g,
            'poem should be part of document with an id')
        # membership must be tested against the graph; a bare tuple is
        # always truthy and would make this assertion meaningless
        self.assertTrue((book_uriref, rdflib.RDF.type, self.BIBO.Document) in g,
            'document with id should be a bibo:Document')
        # g.triples() returns a generator, which is always truthy;
        # materialize it so an empty result actually fails the test
        self.assertTrue(list(g.triples((None, self.SCHEMA_ORG.mentions, book_uriref))),
            'document with id should be mentioned by context (i.e. collection)')
Ejemplo n.º 2
0
    def test_extra_titles(self):
        # more than two titles in a unittitle
        # sample from hughes-edwards1145
        extra_titles = '''<c01 level="file" xmlns="%s">
            <did>
                <container type="box">1</container>
                <container type="folder">7</container>
                <unittitle><title type="essays" render="doublequote">The Need for Heroes</title>,
                   essay in <title source="ISSN" authfilenumber=" 2169-1010">The Crisis</title>,
                   June 1941 [includes pages in which poems
                   <title type="poetry" render="doublequote">The Negro Speaks of Rivers</title>
                   and <title type="poetry" render="doublequote">NAACP</title> appear]
                </unittitle>
            </did>
        </c01>''' % EAD_NAMESPACE
        g = self._render_item_to_rdf(extra_titles)

        # first two titles should work roughly as related title does above

        # there should be *three* manuscripts in the output
        ms_triples = list(g.triples((None, rdflib.RDF.type, self.BIBO.Manuscript)))
        self.assertEqual(3, len(ms_triples),
            'RDFa output should include three items with type bibo:Manuscript')

        # since rdf is unsorted, we have to find them by title;
        # map each expected dc:title to the short key used below
        keys_by_title = {
            u'The Need for Heroes': 'Need',
            u'The Negro Speaks of Rivers': 'Negro',
            u'NAACP': 'NAACP',
        }
        ms_by_title = {}
        for subject, _predicate, _obj in ms_triples:
            key = keys_by_title.get(unicode(g.value(subject, self.DC.title)))
            if key is not None:
                ms_by_title[key] = subject
        # fail with a readable message rather than a KeyError further down
        self.assertEqual(sorted(keys_by_title.values()), sorted(ms_by_title),
            'each typed title should be found as a bibo:Manuscript')

        self.assertEqual(u'essays',
            unicode(g.value(ms_by_title['Need'], self.SCHEMA_ORG.genre)),
            'essay title genre should be set as essays')
        self.assertEqual(u'poetry',
            unicode(g.value(ms_by_title['Negro'], self.SCHEMA_ORG.genre)),
            'poem title genre should be set as poetry')
        self.assertEqual(u'poetry',
            unicode(g.value(ms_by_title['NAACP'], self.SCHEMA_ORG.genre)),
            'poem title genre should be set as poetry')
        # the fixture's source is upper-case and its authfilenumber has a
        # leading space; the expected identifier uses the cleaned values
        book_uriref = rdflib.URIRef(title_rdf_identifier('issn', '2169-1010'))

        # things that should be consistent for each of the typed titles
        for ms_node in ms_by_title.values():
            # each item should be part of the book with title
            self.assertTrue((ms_node, self.DC.isPartOf, book_uriref) in g,
                'item should be part of document with an id')
            # each item should be mentioned in context (i.e. collection);
            # g.triples() returns a generator (always truthy), so
            # materialize it to make an empty result fail
            self.assertTrue(list(g.triples((None, self.SCHEMA_ORG.mentions, ms_node))),
                'item should be mentioned by context (i.e. collection)')

        # membership must be tested against the graph; a bare tuple is
        # always truthy
        self.assertTrue((book_uriref, rdflib.RDF.type, self.BIBO.Periodical) in g,
            'document with issn should be a bibo:Periodical')
        self.assertTrue(list(g.triples((None, self.SCHEMA_ORG.mentions, book_uriref))),
            'document with id should be mentioned by context (i.e. collection)')
Ejemplo n.º 3
0
def format_title(node, default_rel):
    """Return the RDFa wrapper markup for displaying a title node.

    :param node: title element; must provide ``get()`` for attribute access
        and ``xpath()`` (called with the ``eadns`` namespace arguments)
    :param default_rel: RDFa relation in effect for the surrounding context;
        ``"schema:mentions"`` and ``"schema:knows arch:correspondedWith"``
        are handled specially
    :returns: tuple of ``(start, end)`` markup strings to output before and
        after the title text; both are empty strings when no RDFa should be
        generated for this title
    """
    title_type = node.get("type", None)
    title_source = node.get("source", None)
    title_authfileno = node.get("authfilenumber", None)
    # lower-case title source for consistency of checking (isbn/issn/oclc)
    if title_source is not None:
        title_source = title_source.strip().lower()
    # strip space from authfileno, to avoid any issues due to human error
    if title_authfileno is not None:
        title_authfileno = title_authfileno.strip()

    start, end = "", ""

    # special case we can't do anything with
    # if a title is inside the bioghist, we can assume it was created by the
    # originator, *however* there is no inverse relationship to specify
    # a title was created by a person.  We also can't assume
    # any relation to the collection.  So, skip these titles for now.
    if node.xpath("ancestor::e:bioghist", **eadns):
        return (start, end)
    # similar special case: if a title is inside a series scopecontent note
    # which is related to a series unittitle person (see note on series_section_rdfa),
    # do not generate any RDFa for that title
    if node.xpath("ancestor::e:scopecontent/preceding-sibling::e:did/e:unittitle[e:corpname or e:persname]", **eadns):
        return (start, end)

    # for now, ignore titles in correspondence series
    # (getting associated with the person inappropriately)
    if default_rel == "schema:knows arch:correspondedWith":
        return (start, end)

    # if isbn # or issn is available, include it
    # NOTE(review): attribute values are interpolated into markup without
    # escaping; confirm callers only pass trusted/clean EAD content
    meta_tags = ""
    if title_source in ["isbn", "issn"] and title_authfileno is not None:
        meta_tags += '<meta property="schema:%s" content="%s"/>' % (title_source, title_authfileno)
    # title attribute carries genre information
    if title_type is not None:
        meta_tags += '<meta property="schema:genre" content="%s"/>' % title_type

    # generate URI/URN for item when possible
    # TODO: abstract into reusable function
    resource_id = None
    if title_authfileno is not None:
        resource_id = title_rdf_identifier(title_source, title_authfileno)

    # resource attribute for inclusion
    resource = ' resource="%s"' % resource_id if resource_id else ""

    # if title is inside the scopecontent, it needs to be wrapped as a document
    # just use the generic "mentions" relation
    if node.xpath("ancestor::e:scopecontent", **eadns):
        # mark as a generic document or periodical and include whatever meta tags are available
        itemtype = "bibo:Periodical" if title_source == "issn" else "bibo:Document"
        start = '<span rel="schema:mentions" typeof="%s"%s><span property="dc:title">' % (itemtype, resource)
        end = "</span>%s</span>" % meta_tags

    # if default rel is set to mention, assume we are patching in extra titles
    # after file item unittitle context
    elif default_rel == "schema:mentions":
        if title_source == "issn":
            itemtype = "bibo:Periodical"
        elif title_source in ["isbn", "oclc"]:
            itemtype = "bibo:Document"
        else:
            # technically we should be checking if this is a printed materials series...
            itemtype = "bibo:Manuscript"

        # if this title has a type but no authfilenumber and there is a
        # sibling title with an id, relate them
        if (
            title_type is not None
            and title_authfileno is None
            and node.xpath("parent::e:unittitle/e:title[@authfilenumber]", **eadns)
        ):
            # Read both attributes from the *same* sibling: the first title
            # that carries an authfilenumber.  Selecting @source and
            # @authfilenumber independently could pair values taken from
            # two different titles.
            id_title = "parent::e:unittitle/e:title[@authfilenumber][1]"
            rel_authfileno = node.xpath("normalize-space(%s/@authfilenumber)" % id_title, **eadns).strip()
            rel_source = node.xpath("normalize-space(%s/@source)" % id_title, **eadns).lower()

            rel_id = title_rdf_identifier(rel_source, rel_authfileno)

            if rel_id is not None:
                meta_tags += '<span property="dcterms:isPartOf" resource="%s"/>' % rel_id

        start = '<span rel="schema:mentions" typeof="%s"%s><span property="dc:title">' % (itemtype, resource)
        end = "</span>%s</span>" % meta_tags

    # Otherwise, only add semantic information if there is a title type OR
    # if title occurs in a file-level unittitle.
    # (in that case, we assume it is title of the item in the container)
    elif node.xpath('parent::e:unittitle and ancestor::e:*[@level="file"]', **eadns) or title_type is not None:
        # include meta tags after the title, since it should be in the
        # context of the item, which is the whole unittitle
        start, end = '<span property="dc:title">', "</span>%s" % meta_tags

        # check for special case: no type, multiple titles, and an author
        # (persname tagged with a role, i.e. this is a Belfast Group sheet)
        multiple_with_author = (
            title_type is None
            and node.xpath("count(parent::e:unittitle/e:title)", **eadns) > 1
            and node.xpath("preceding-sibling::e:persname[@role]", **eadns)
        )
        # number of titles before this one in the same parent
        preceding_titles = node.xpath("count(preceding-sibling::e:title)", **eadns)

        # special case: no good way to relate more than two titles in a unittitle,
        # so just skip them when generating rdfa
        if preceding_titles >= 2 and not multiple_with_author:
            start, end = "", ""

        # if ISSN with preceding title, assume article in a periodical
        elif title_source == "issn" and preceding_titles == 1:
            # adapted from schema.org article example: http://schema.org/Article
            start = '<span property="dcterms:isPartOf" typeof="bibo:Periodical"%s><span property="dc:title">' % resource
            # include any meta tags (genre, issn) inside the periodical entity
            end = "</span>%s</span>" % meta_tags

        # otherwise, if current title has an id or no type and follows a title with a type,
        # assume generic part/whole relationship
        elif (title_authfileno is not None or title_type is None) and node.xpath(
            "count(./preceding-sibling::e:title[@type])", **eadns
        ) == 1:
            start = '<span property="dcterms:isPartOf" typeof="bibo:Document"%s><span property="dc:title">' % resource
            # include any meta tags (e.g. isbn) inside the document entity
            end = "</span>%s</span>" % meta_tags

        # Belfast Group sheet case (see multiple_with_author above):
        # use RDFa list notation to generate a sequence of titles;
        # the end tag set above is left unchanged
        elif multiple_with_author:
            start = '<span inlist="inlist" property="dc:title">'

    return (start, end)