def prep_ead(ead, filename):
    """Prepare EAD xml for publication.

    Currently does the following:

     - sets the eadid and ids on any series, subseries, and index elements
       based on filename and series unitid or index number
     - removes any leading whitespace from the list title fields
       (origination nodes and unittitle) and from controlaccess terms
     - makes sure the eadid url and identifier are set to an ARK
       (long and short form, respectively)

    The ead instance is modified in place and also returned.

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be prepared
    :param filename: filename of the EAD file (used as base eadid)
    :rtype: :class:`~findingaids.fa.models.FindingAid`
    """
    # eadid should be document name without .xml extension.
    # Only a *trailing* extension is removed, so a name that happens to
    # contain '.xml' somewhere else is left intact.
    xml_ext = '.xml'
    base_name = os.path.basename(filename)
    if base_name.endswith(xml_ext):
        base_name = base_name[:-len(xml_ext)]
    ead.eadid.value = base_name

    # set series ids
    if ead.dsc and ead.dsc.hasSeries():
        for i, series in enumerate(ead.dsc.c):
            set_series_ids(series, ead.eadid.value, i)

    # set index ids
    for i, index in enumerate(ead.archdesc.index):
        # generate index ids based on eadid and index number (starting at 1, not 0)
        index.id = "%s%sindex%s" % (ead.eadid.value, ID_DELIMITER, i + 1)

    # remove any leading whitespace in list title fields
    # NOTE: only removing *leading* whitespace because these fields
    # can contain mixed content, and trailing whitespace here may be significant
    # - list title fields - origination nodes and unittitle
    list_title_fields = ead.node.xpath(
        'e:archdesc/e:did/e:origination/node()|e:archdesc/e:did/e:unittitle',
        namespaces={'e': EAD_NAMESPACE})
    for field in list_title_fields:
        # the text of an lxml node is the text content *before* any child elements
        # in some finding aids, this could be blank, e.g.
        # <unittitle><title>Pitts v. Freeman</title> case files</unittitle>
        # note that this clean does NOT handle leading whitespace in a leading child element.
        # (node() can also match plain text nodes, which have no text
        # attribute; the hasattr test skips those)
        if hasattr(field, 'text') and field.text is not None:
            field.text = unicode(field.text).lstrip()

    # - controlaccess fields (if any)
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for ca in ead.archdesc.controlaccess.controlaccess:
            for term in ca.terms:
                # Using node.text here because term.value is normalized
                # NOT forcing normalization on control access terms because
                # XML editor line-wrap settings would force
                # re-running the prep step every time a document is edited.
                if term.node.text:
                    term.value = term.node.text.lstrip()

    # check that ARK is set correctly (both long and short-form)
    # - if eadid url is not set or is not an ark, generate an ark
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        ead.eadid.url = generate_ark(ead)
    # - if eadid identifier is not set or not an ark, calculate short-form ark from eadid url
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        ark_parts = parse_ark(ead.eadid.url)
        ead.eadid.identifier = 'ark:/%(naan)s/%(noid)s' % ark_parts

    # NOTE: leading whitespace directly inside unittitle is cleaned by the
    # list title loop above, since its xpath includes archdesc/did/unittitle
    return ead
def handle(self, *pids, **options):
    """Update pid manager ARK targets for volume objects.

    For every existing volume whose ``dc.content.identifier`` is an ARK
    that can be retrieved from the configured pid manager, the
    unqualified ARK target is set to ``self.volume_url(obj)`` and the
    ``PDF``-qualified target is set to ``self.pdf_url(obj)``.  A one-line
    summary is written to stdout at normal verbosity or higher.

    :param pids: optional pids; when given, only those objects are
        processed, otherwise all objects with the volume content model
    :param options: command options; ``dry_run`` (default False) skips
        the pid manager updates but still counts them, ``verbosity``
        (default ``self.v_normal``) controls output
    :raises CommandError: if the pid manager client cannot be initialized
    """
    dry_run = options.get('dry_run', False)
    verbosity = int(options.get('verbosity', self.v_normal))

    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)

    # if pids are specified on command line, only process those objects
    if pids:
        objs = [repo.get_object(pid, type=Volume) for pid in pids]
    # otherwise, look for all volume objects in fedora
    else:
        objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                            type=Volume)

    stats = defaultdict(int)
    for obj in objs:
        if not obj.exists:
            if verbosity >= self.v_normal:
                self.stdout.write(
                    '%s does not exist or is not accessible' % obj.pid)
            stats['skipped'] += 1
            continue

        stats['objs'] += 1

        # objects without an ARK identifier are counted as processed but
        # are neither updated nor reported as skipped
        if not is_ark(obj.dc.content.identifier):
            continue

        noid = parse_ark(obj.dc.content.identifier)['noid']
        try:
            # only called to confirm the pid manager knows this ARK;
            # the returned information itself is not needed
            pidman.get_ark(noid)
        except Exception as err:
            # requested ARK is not in the configured pid manager
            # (this should ONLY happen in dev/QA)
            if verbosity >= self.v_normal:
                if '404: NOT FOUND' in str(err):
                    self.stdout.write(
                        'Error retriving ARK information for %s: Not Found' % obj.pid)
                else:
                    self.stdout.write(
                        'Error retriving ARK information for %s' % obj.pid)
            stats['skipped'] += 1
            continue

        # update unqualified ark to resolve to readux volume landing page
        if not dry_run:
            pidman.update_ark_target(noid,
                                     target_uri=self.volume_url(obj),
                                     active=True)

        # we expected a qualified ARK target for the PDF; update whether
        # it currently exists or not
        qual = 'PDF'
        # count as updated in dry run mode (would be updated)
        stats['updated'] += 1
        if not dry_run:
            pidman.update_ark_target(noid, qual,
                                     target_uri=self.pdf_url(obj),
                                     active=True)
        # FIXME: catch possible exceptions here?

    # output summary
    if verbosity >= self.v_normal:
        # two-pass formatting: counts are filled in from stats first; the
        # escaped %%s placeholders are then filled with the plural suffix
        # and the dry-run wording
        msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
        msg = msg % ('s' if stats['objs'] != 1 else '',
                     ' would have' if dry_run else '')
        self.stdout.write(msg)
def test_is_ark(self):
    'Test is_ark method'
    arks = [
        # resolvable ark
        'http://pid.emory.edu/ark:/25593/1fx',
        # resolvable arks with qualifier
        'http://pid.emory.edu/ark:/25593/1fx/qual',
        'http://pid.emory.edu/ark:/25593/1fx/qual/1.23/foo-bar',
        # resolvable ark with base path in url
        'http://test.site.com/pidman/ark:/25593/1fx/qual',
        # short-form ark
        'ark:/25593/1fx',
        # short-form arks with qualifier
        'ark:/25593/1fx/qual',
        'ark:/25593/1fx/qual/1.23/foo-bar',
    ]
    for value in arks:
        # include the value in the message so a failure identifies the case
        self.assertTrue(is_ark(value),
                        "'%s' should be recognized as an ARK" % value)

    non_arks = [
        'http://pid.emory.edu/',
        'http://genes.is/noahs/ark',
        'doi:10.1000/182',
    ]
    for value in non_arks:
        self.assertFalse(is_ark(value),
                         "'%s' should not be recognized as an ARK" % value)
def check_eadxml(ead):
    """Sanity checks specific to the EAD xml, independent of file or eXist.

    Checks the following:

     - series and index ids are present
     - eadid matches site URL regex
     - fields used for search/browse title match code expectations:

        - at most one top-level origination
        - no leading whitespace in list-title (origination or unittitle)
        - alphabetical first letter (for first-letter browse)

     - every did has either no containers or exactly 2
     - no leading whitespace in unittitle or controlaccess terms
     - eadid url and identifier are both ARKs, and agree with each other

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be checked
    :returns: list of all errors found; each container-count error is
        followed by a nested list with one detail string per offending did
    :rtype: list
    """
    # NOTE: throughout, be sure to use unicode instead of string
    ns = {'e': EAD_NAMESPACE}
    errors = []

    # check that series ids are set
    if ead.dsc and ead.dsc.hasSeries():
        for series in ead.dsc.c:
            errors.extend(check_series_ids(series))

    # check that any index ids are set
    for index in ead.archdesc.index:
        if not index.id:
            errors.append("%(node)s id attribute is not set for %(label)s"
                          % {'node': local_name(index.node),
                             'label': unicode(index.head)})

    # eadid matches appropriate site URL regex
    # (entire eadid should match regex)
    if not re.match('^%s$' % EADID_URL_REGEX, ead.eadid.value):
        errors.append("eadid '%s' does not match site URL regular expression"
                      % ead.eadid.value)

    # multiple tests to ensure xml used for search/browse list-title matches what code expects
    # -- since list title is pulled from multiple places, give enough context so it can be found & corrected
    list_title_path = "%s/%s" % (local_name(ead.list_title.node.getparent()),
                                 local_name(ead.list_title.node))

    # - check for at most one top-level origination
    origination_count = ead.node.xpath('count(e:archdesc/e:did/e:origination)',
                                       namespaces=ns)
    if int(origination_count) > 1:
        errors.append("Site expects only one archdesc/did/origination; found %d"
                      % origination_count)

    # container list formatting (based on encoding practice) expects only 2 containers per did
    # - dids with more than 2 containers
    containers = ead.node.xpath('//e:did[count(e:container) > 2]', namespaces=ns)
    if containers:
        errors.append("Site expects maximum of 2 containers per did; found %d did(s) with more than 2"
                      % len(containers))
        # detail lines are appended as a single nested list, not flattened
        # NOTE(review): shape kept as-is since callers may rely on it
        errors.append(['Line %d: %s' % (c.sourceline, tostring(c))
                       for c in containers])
    # - dids with only one container
    containers = ead.node.xpath('//e:did[count(e:container) = 1]', namespaces=ns)
    if containers:
        errors.append("Site expects 2 containers per did; found %d did(s) with only 1"
                      % len(containers))
        errors.append(['Line %d: %s' % (c.sourceline, tostring(c))
                       for c in containers])

    # - no leading whitespace in list title
    title_nodes = ead.node.xpath("%s/text()" % ead.list_title_xpath,
                                 namespaces=ns)
    if not title_nodes:
        # no text directly inside the list title element; report it as
        # empty instead of failing with an IndexError below
        # NOTE(review): a title made up solely of child elements also
        # lands here -- confirm that reporting it as empty is acceptable
        title_text = None
    else:
        first_text = title_nodes[0]
        if hasattr(first_text, 'text'):
            title_text = first_text.text
        elif getattr(first_text, 'is_tail', False):
            # first text node follows a child element (e.g.
            # <unittitle><title>Pitts v. Freeman</title> case files</unittitle>)
            # so there is no leading text directly in the list title;
            # whitespace inside a leading child is covered by the
            # unittitle check further down
            title_text = u''
        else:
            # convert the first text node only; converting the whole
            # result list yields a string starting with '[' that can
            # never match the whitespace test
            title_text = unicode(first_text)

    if title_text is None:
        errors.append("List title seems to be empty")
    elif re.match(r'\s+', title_text):
        # using node.text because unicode() normalizes, which obscures whitespace problems
        # report with enough context that they can find the appropriate element to fix
        errors.append("Found leading whitespace in list title field (%s): '%s'"
                      % (list_title_path, ead.list_title.node.text))
    # - first letter of title matches regex -- only check if whitespace test fails
    elif not re.match(TITLE_LETTERS, ead.first_letter):
        errors.append("First letter ('%s') of list title field %s does not match browse letter URL regex '%s'"
                      % (ead.first_letter, list_title_path, TITLE_LETTERS))

    # leading space in unit title (could be list title but might not be)
    # NOTE: title can contain and even start with subtags such as <title>
    # or <emph>, which is hard to account for with lxml or text() xpath.
    # Using format_ead to generate html that would be displayed, and then
    # stripping tags to check for any leading whitespace within a leading tag
    title = striptags(format_ead(ead.unittitle))
    if re.match(r'\s+', title):
        errors.append("Found leading whitespace in unittitle: '%s'" % title)

    # leading whitespace in control access fields (if any)
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for ca in ead.archdesc.controlaccess.controlaccess:
            for term in ca.terms:
                # NOTE: using node text because term.value is now normalized
                if re.match(r'\s+', unicode(term.node.text)):
                    errors.append("Found leading whitespace in controlaccess term '%s' (%s)"
                                  % (term.node.text, local_name(term.node)))

    # eadid url should contain resolvable ARK
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        errors.append("eadid url is either not set or not an ARK. " +
                      "To correct, run the prep process again.")

    # eadid identifier should contain short-form ARK
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        # sentence separator added; the two halves previously ran
        # together as "...not an ARKTo correct..."
        errors.append("eadid identifier is either not set or not an ARK. " +
                      "To correct, run the prep process again.")

    # short- and long-form ARKs should match each other
    if ead.eadid.url is not None and ead.eadid.identifier is not None \
            and not ead.eadid.url.endswith(ead.eadid.identifier):
        errors.append("eadid url and identifier do not match: url '%s' should end with identifier '%s'"
                      % (ead.eadid.url, ead.eadid.identifier))

    return errors
def handle(self, *pids, **options):
    """Update pid manager ARK targets for volume objects.

    For every existing volume whose ``dc.content.identifier`` is an ARK
    that can be retrieved from the configured pid manager, the
    unqualified ARK target is set to ``self.volume_url(obj)`` and the
    ``PDF``-qualified target is set to ``self.pdf_url(obj)``.  A one-line
    summary is written to stdout at normal verbosity or higher.

    :param pids: optional pids; when given, only those objects are
        processed, otherwise all objects with the volume content model
    :param options: command options; ``dry_run`` (default False) skips
        the pid manager updates but still counts them, ``verbosity``
        (default ``self.v_normal``) controls output
    :raises CommandError: if the pid manager client cannot be initialized
    """
    dry_run = options.get('dry_run', False)
    verbosity = int(options.get('verbosity', self.v_normal))

    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)

    # if pids are specified on command line, only process those objects
    if pids:
        objs = [repo.get_object(pid, type=Volume) for pid in pids]
    # otherwise, look for all volume objects in fedora
    else:
        objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                            type=Volume)

    stats = defaultdict(int)
    for obj in objs:
        if not obj.exists:
            if verbosity >= self.v_normal:
                self.stdout.write('%s does not exist or is not accessible' % obj.pid)
            stats['skipped'] += 1
            continue

        stats['objs'] += 1

        # objects without an ARK identifier are counted as processed but
        # are neither updated nor reported as skipped
        if not is_ark(obj.dc.content.identifier):
            continue

        noid = parse_ark(obj.dc.content.identifier)['noid']
        try:
            # only called to confirm the pid manager knows this ARK;
            # the returned information itself is not needed
            pidman.get_ark(noid)
        except Exception as err:
            # requested ARK is not in the configured pid manager
            # (this should ONLY happen in dev/QA)
            if verbosity >= self.v_normal:
                if '404: NOT FOUND' in str(err):
                    self.stdout.write('Error retriving ARK information for %s: Not Found' % obj.pid)
                else:
                    self.stdout.write('Error retriving ARK information for %s' % obj.pid)
            stats['skipped'] += 1
            continue

        # update unqualified ark to resolve to readux volume landing page
        if not dry_run:
            pidman.update_ark_target(noid,
                                     target_uri=self.volume_url(obj),
                                     active=True)

        # we expected a qualified ARK target for the PDF; update whether
        # it currently exists or not
        qual = 'PDF'
        # count as updated in dry run mode (would be updated)
        stats['updated'] += 1
        if not dry_run:
            pidman.update_ark_target(noid, qual,
                                     target_uri=self.pdf_url(obj),
                                     active=True)
        # FIXME: catch possible exceptions here?

    # output summary
    if verbosity >= self.v_normal:
        # two-pass formatting: counts are filled in from stats first; the
        # escaped %%s placeholders are then filled with the plural suffix
        # and the dry-run wording
        msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
        msg = msg % ('s' if stats['objs'] != 1 else '',
                     ' would have' if dry_run else '')
        self.stdout.write(msg)
def ark_pid(value):
    '''Template filter that reduces an ARK URI to just its pid (noid)
    portion.  Any value that is not recognized as an ARK yields None.'''
    if not is_ark(value):
        return None
    return parse_ark(value)['noid']
def test_is_ark(self):
    'Test is_ark method'
    arks = [
        # resolvable ark
        'http://pid.emory.edu/ark:/25593/1fx',
        # resolvable arks with qualifier
        'http://pid.emory.edu/ark:/25593/1fx/qual',
        'http://pid.emory.edu/ark:/25593/1fx/qual/1.23/foo-bar',
        # resolvable ark with base path in url
        'http://test.site.com/pidman/ark:/25593/1fx/qual',
        # short-form ark
        'ark:/25593/1fx',
        # short-form arks with qualifier
        'ark:/25593/1fx/qual',
        'ark:/25593/1fx/qual/1.23/foo-bar',
    ]
    for value in arks:
        # include the value in the message so a failure identifies the case
        self.assertTrue(is_ark(value),
                        "'%s' should be recognized as an ARK" % value)

    non_arks = [
        'http://pid.emory.edu/',
        'http://genes.is/noahs/ark',
        'doi:10.1000/182',
    ]
    for value in non_arks:
        self.assertFalse(is_ark(value),
                         "'%s' should not be recognized as an ARK" % value)