Exemple #1
0
def prep_ead(ead, filename):
    """Prepare EAD xml for publication.  Currently does the following:

     - sets the eadid and ids on any series, subseries, and index elements based
       on filename and series unitid or index number.
     - removes any leading whitespace from list title fields (origination
       and unittitle) and from controlaccess terms
     - sets the eadid url and identifier to long- and short-form ARKs when
       they are missing or are not ARKs

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be prepared
    :param filename: filename of the EAD file (used as base eadid)
    :returns: the ``ead`` instance that was passed in, modified in place
    :rtype: :class:`~findingaids.fa.models.FindingAid`
    """

    # eadid should be document name without .xml extension; strip only a
    # trailing extension so that '.xml' elsewhere in the name is left alone
    eadid = os.path.basename(filename)
    if eadid.endswith('.xml'):
        eadid = eadid[:-len('.xml')]
    ead.eadid.value = eadid
    # set series ids
    if ead.dsc and ead.dsc.hasSeries():
        for i, series in enumerate(ead.dsc.c):
            set_series_ids(series, ead.eadid.value, i)
    # set index ids
    for i, index in enumerate(ead.archdesc.index):
        # generate index ids based on eadid and index number (starting at 1, not 0)
        index.id = "%s%sindex%s" % (ead.eadid.value, ID_DELIMITER, i+1)

    # remove any leading whitespace in list title fields
    # NOTE: only removing *leading* whitespace because these fields
    # can contain mixed content, and trailing whitespace here may be significant
    # - list title fields - origination nodes and unittitle
    for field in ead.node.xpath('e:archdesc/e:did/e:origination/node()|e:archdesc/e:did/e:unittitle',
                                namespaces={'e': EAD_NAMESPACE}):
        # the text of an lxml node is the text content *before* any child elements
        # in some finding aids, this could be blank, e.g.
        # <unittitle><title>Pitts v. Freeman</title> case files</unittitle>
        # note that this clean does NOT handle leading whitespace in a leading child element.
        # (xpath results without a text attribute are skipped by the hasattr check)
        if hasattr(field, 'text') and field.text is not None:
            field.text = unicode(field.text).lstrip()
    # - controlaccess fields (if any)
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for ca in ead.archdesc.controlaccess.controlaccess:
            for term in ca.terms:
                # Using node.text here because term.value is normalized
                # NOT forcing normalization on control access terms because
                # XML editor line-wrap settings would force
                # re-running the prep step every time a document is edited.
                if term.node.text:
                    term.value = term.node.text.lstrip()

    # check that ARK is set correctly (both long and short-form)
    # - if eadid url is not set or is not an ark, generate an ark
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        ead.eadid.url = generate_ark(ead)
    # - if eadid identifier is not set or not an ark, calculate short-form ark from eadid url
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        ark_parts = parse_ark(ead.eadid.url)
        ead.eadid.identifier = 'ark:/%(naan)s/%(noid)s' % ark_parts

    # NOTE: doesn't seem to be explicitly cleaning whitespace in unittitle,
    # but unit tests make it seem that it is getting cleaned ... (??)

    return ead
Exemple #2
0
    def handle(self, *pids, **options):
        """Update pid manager ARK targets for volume objects.

        For every existing volume whose Dublin Core identifier is an ARK
        that the pid manager knows about, the unqualified ARK is pointed at
        ``self.volume_url(obj)`` and the ``PDF`` qualifier at
        ``self.pdf_url(obj)``.  A one-line summary of processed, skipped and
        updated objects is written to stdout at normal verbosity or higher.

        :param pids: optional pids; when given only those objects are
            processed, otherwise all objects with the volume content model
        :param options: command options; ``dry_run`` (default False) skips
            the pid manager updates but still counts them, ``verbosity``
            (default ``self.v_normal``) controls output
        :raises CommandError: if the pid manager client cannot be created
        """

        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        # if pids are specified on command line, only process those objects
        if pids:
            objs = [repo.get_object(pid, type=Volume) for pid in pids]

        # otherwise, look for all volume objects in fedora
        else:
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if verbosity >= self.v_normal:
                    self.stdout.write(
                        '%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            # objects without an ARK identifier are counted as processed
            # but are neither skipped nor updated
            if not is_ark(obj.dc.content.identifier):
                continue

            noid = parse_ark(obj.dc.content.identifier)['noid']
            try:
                # the result is not used; the lookup only confirms that the
                # pid manager knows about this ARK
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbosity >= self.v_normal:
                    detail = ': Not Found' if '404: NOT FOUND' in str(err) else ''
                    self.stdout.write(
                        'Error retriving ARK information for %s%s'
                        % (obj.pid, detail))
                stats['skipped'] += 1
                continue

            # count as updated in dry run mode (would be updated)
            stats['updated'] += 1
            if not dry_run:
                # update unqualified ark to resolve to readux volume landing page
                pidman.update_ark_target(noid,
                                         target_uri=self.volume_url(obj),
                                         active=True)
                # we expected a qualified ARK target for the PDF; update whether
                # it currently exists or not
                pidman.update_ark_target(noid,
                                         'PDF',
                                         target_uri=self.pdf_url(obj),
                                         active=True)
                # FIXME: catch possible exceptions here?

        # output summary
        if verbosity >= self.v_normal:
            # first pass fills in the counts and leaves two %s placeholders
            # (plural suffix, dry-run wording) for the second pass
            msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            msg = msg % ('s' if stats['objs'] != 1 else '',
                         ' would have' if dry_run else '')
            self.stdout.write(msg)
    def test_is_ark(self):
        'Test is_ark method'
        arks = [
            # resolvable ark
            'http://pid.emory.edu/ark:/25593/1fx',
            # resolvable arks with qualifier
            'http://pid.emory.edu/ark:/25593/1fx/qual',
            'http://pid.emory.edu/ark:/25593/1fx/qual/1.23/foo-bar',
            # resolvable ark with base path in url
            'http://test.site.com/pidman/ark:/25593/1fx/qual',
            # short-form ark
            'ark:/25593/1fx',
            # short-form arks with qualifier
            'ark:/25593/1fx/qual',
            'ark:/25593/1fx/qual/1.23/foo-bar',
        ]
        for value in arks:
            # include the value in the message so a failure names its input
            self.assertTrue(is_ark(value),
                            '%s should be recognized as an ARK' % value)

        non_arks = [
            'http://pid.emory.edu/',
            'http://genes.is/noahs/ark',
            'doi:10.1000/182',
        ]
        for value in non_arks:
            self.assertFalse(is_ark(value),
                             '%s should not be recognized as an ARK' % value)
Exemple #4
0
def check_eadxml(ead):
    """Sanity checks specific to the EAD xml, independent of file or eXist.

    Checks the following:
     - series and index ids are present
     - eadid matches site URL regex
     - fields used for search/browse title match code expectations:
        - at most one top-level origination
        - no leading whitespace in list-title (origination or unittitle)
        - first letter matches the browse letter regex (for first-letter browse)
     - every did has either no containers or exactly 2
     - no leading whitespace in unittitle or controlaccess terms
     - eadid url and identifier are ARKs, and the url ends with the identifier

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be checked
    :returns: list of all errors found; each container error message is
        followed by a nested list with one entry per offending did
    :rtype: list
    """
    # NOTE: throughout, be sure to use unicode instead of string
    errors = []
    ns = {'e': EAD_NAMESPACE}

    # check that series ids are set
    if ead.dsc and ead.dsc.hasSeries():
        for series in ead.dsc.c:
            errors.extend(check_series_ids(series))

    # check that any index ids are set
    for index in ead.archdesc.index:
        if not index.id:
            errors.append("%(node)s id attribute is not set for %(label)s"
                % {'node': local_name(index.node), 'label': unicode(index.head)})

    # eadid matches appropriate site URL regex
    if not re.match('^%s$' % EADID_URL_REGEX, ead.eadid.value):   # entire eadid should match regex
        errors.append("eadid '%s' does not match site URL regular expression"
                      % ead.eadid.value)

    # multiple tests to ensure xml used for search/browse list-title matches what code expects
    # -- since list title is pulled from multiple places, give enough context so it can be found & corrected
    list_title_path = "%s/%s" % (local_name(ead.list_title.node.getparent()),
                                 local_name(ead.list_title.node))
    # - check for at most one top-level origination
    origination_count = ead.node.xpath('count(e:archdesc/e:did/e:origination)',
                                       namespaces=ns)
    if int(origination_count) > 1:
        errors.append("Site expects only one archdesc/did/origination; found %d"
                      % origination_count)

    # container list formatting (based on encoding practice) expects only 2 containers per did
    # NOTE(review): the per-did details are appended as a nested list, not
    # flattened into errors -- presumably for nested display; confirm
    # against how callers render the result before changing
    # - dids with more than 2 containers
    containers = ead.node.xpath('//e:did[count(e:container) > 2]',
                                namespaces=ns)
    if containers:
        errors.append("Site expects maximum of 2 containers per did; found %d did(s) with more than 2"
                      % len(containers))
        errors.append(['Line %d: %s' % (c.sourceline, tostring(c)) for c in containers])
    # - dids with only one container
    containers = ead.node.xpath('//e:did[count(e:container) = 1]',
                                namespaces=ns)
    if containers:
        errors.append("Site expects 2 containers per did; found %d did(s) with only 1"
                      % len(containers))
        errors.append(['Line %d: %s' % (c.sourceline, tostring(c)) for c in containers])

    # - no leading whitespace in list title
    title_node = ead.node.xpath("%s/text()" % ead.list_title_xpath,
                                namespaces=ns)
    if not title_node:
        # xpath matched nothing; report the title as empty below rather
        # than failing with an IndexError on title_node[0]
        title_text = None
    elif hasattr(title_node[0], 'text'):
        title_text = title_node[0].text
    else:
        # NOTE(review): this converts the whole xpath result list, not its
        # first item -- confirm whether that is intended
        title_text = unicode(title_node)

    if title_text is None:
        errors.append("List title seems to be empty")
    elif re.match(r'\s+', title_text):
        # using node.text because unicode() normalizes, which obscures whitespace problems
        # report with enough context that they can find the appropriate element to fix
        errors.append("Found leading whitespace in list title field (%s): '%s'" %
                      (list_title_path, ead.list_title.node.text))

    # - first letter of title matches regex -- only checked when the list
    #   title is non-empty and has no leading whitespace
    elif not re.match(TITLE_LETTERS, ead.first_letter):
        errors.append("First letter ('%s') of list title field %s does not match browse letter URL regex '%s'" %
                      (ead.first_letter, list_title_path, TITLE_LETTERS))

    # leading space in unit title (could be list title but might not be)
    # NOTE: title can contain and even start with subtags such as <title>
    # or <emph>, which is hard to account for with lxml or text() xpath.
    # Using format_ead to generate html that would be displayed, and then
    # stripping tags to check for any leading whitespace within a leading tag
    title = striptags(format_ead(ead.unittitle))
    if re.match(r'\s+', title):
        errors.append("Found leading whitespace in unittitle: '%s'" %
                      title)

    # leading whitespace in control access fields (if any)
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for ca in ead.archdesc.controlaccess.controlaccess:
            for term in ca.terms:
                # NOTE: using node text because term.value is now normalized
                if re.match(r'\s+', unicode(term.node.text)):
                    errors.append("Found leading whitespace in controlaccess term '%s' (%s)"
                                  % (term.node.text, local_name(term.node)))

    # eadid url should contain resolvable ARK
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        errors.append("eadid url is either not set or not an ARK. " +
            "To correct, run the prep process again.")

    # eadid identifier should contain short-form ARK
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        # the ". " separator keeps the two sentences from running together
        errors.append("eadid identifier is either not set or not an ARK. " +
            "To correct, run the prep process again.")

    # short- and long-form ARKs should match each other
    if ead.eadid.url is not None and ead.eadid.identifier is not None and \
        not ead.eadid.url.endswith(ead.eadid.identifier):
        errors.append("eadid url and identifier do not match: url '%s' should end with identifier '%s'"
                      % (ead.eadid.url, ead.eadid.identifier))

    return errors
Exemple #5
0
    def handle(self, *pids, **options):
        """Point pid manager ARK targets at the urls for volume objects.

        Each existing volume whose Dublin Core identifier is an ARK found in
        the pid manager gets its unqualified ARK set to
        ``self.volume_url(obj)`` and its ``PDF`` qualifier set to
        ``self.pdf_url(obj)``.  At normal verbosity or higher a summary of
        processed, skipped and updated objects is written to stdout.

        :param pids: optional pids; if any are given only those objects are
            processed, otherwise all objects with the volume content model
        :param options: command options; ``dry_run`` (default False) skips
            the pid manager updates but still counts them, ``verbosity``
            (default ``self.v_normal``) controls output
        :raises CommandError: if the pid manager client cannot be created
        """

        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        # if pids are specified on command line, only process those objects
        if pids:
            objs = [repo.get_object(pid, type=Volume) for pid in pids]

        # otherwise, look for all volume objects in fedora
        else:
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if verbosity >= self.v_normal:
                    self.stdout.write('%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            # volumes without an ARK identifier count as processed but are
            # neither skipped nor updated
            if not is_ark(obj.dc.content.identifier):
                continue

            noid = parse_ark(obj.dc.content.identifier)['noid']
            try:
                # return value is not needed; this only checks that the ARK
                # is present in the configured pid manager
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbosity >= self.v_normal:
                    suffix = ': Not Found' if '404: NOT FOUND' in str(err) else ''
                    self.stdout.write('Error retriving ARK information for %s%s' % (obj.pid, suffix))
                stats['skipped'] += 1
                continue

            stats['updated'] += 1   # count as updated in dry run mode (would be updated)
            if not dry_run:
                # update unqualified ark to resolve to readux volume landing page
                pidman.update_ark_target(noid,
                    target_uri=self.volume_url(obj),
                    active=True)
                # we expected a qualified ARK target for the PDF; update whether
                # it currently exists or not
                pidman.update_ark_target(noid, 'PDF',
                    target_uri=self.pdf_url(obj),
                    active=True)
                # FIXME: catch possible exceptions here?

        # output summary
        if verbosity >= self.v_normal:
            # counts are filled in first; the escaped %%s placeholders are
            # then filled with the plural suffix and the dry-run wording
            msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            msg = msg % ('s' if stats['objs'] != 1 else '', ' would have' if dry_run else '')
            self.stdout.write(msg)
def ark_pid(value):
    '''Template filter that reduces an ARK URI to just its pid (noid) portion.
    Gives back None when the value is not recognized as an ARK.'''
    # bail out early for anything that is not an ARK
    if not is_ark(value):
        return None
    return parse_ark(value)['noid']
    def test_is_ark(self):
        'Test is_ark method'
        expected_arks = (
            # resolvable ark
            'http://pid.emory.edu/ark:/25593/1fx',
            # resolvable arks with qualifier
            'http://pid.emory.edu/ark:/25593/1fx/qual',
            'http://pid.emory.edu/ark:/25593/1fx/qual/1.23/foo-bar',
            # resolvable ark with base path in url
            'http://test.site.com/pidman/ark:/25593/1fx/qual',
            # short-form ark
            'ark:/25593/1fx',
            # short-form arks with qualifier
            'ark:/25593/1fx/qual',
            'ark:/25593/1fx/qual/1.23/foo-bar',
        )
        for candidate in expected_arks:
            # the message names the input so a failure is easy to trace
            self.assertTrue(
                is_ark(candidate),
                '%s should be recognized as an ARK' % candidate)

        expected_non_arks = (
            'http://pid.emory.edu/',
            'http://genes.is/noahs/ark',
            'doi:10.1000/182',
        )
        for candidate in expected_non_arks:
            self.assertFalse(
                is_ark(candidate),
                '%s should not be recognized as an ARK' % candidate)