def prepared_ead(request, archive, filename, mode):
    """Display information about changes made by preparing an EAD file for
    publication.  If no changes are made, user will be redirected to main
    admin page with a message to that effect.

    In **summary** mode, displays a brief, color-coded summary of changes
    between original and prepped version of the file.  In **diff** mode,
    displays a full, side-by-side diff generated by :class:`difflib.HtmlDiff`.
    (Note: because it is very large, the full diff is *not* embedded in the
    site template, and is intended to be opened in a new window.)

    :param archive: slug of the :class:`Archive` the file belongs to; an
        unknown slug results in a 404
    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    :param mode: one of **diff** or **summary**
    """
    # determine full path based on archive / svn
    arch = get_object_or_404(Archive, slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)

    # defaults so the template context is always defined, whatever path
    # is taken below (e.g. an unexpected mode value)
    changes = []
    errors = []

    # TODO: expire cache if file has changed since prepped eadxml was cached
    prep_ead = prepared_eadxml(request, arch.slug, filename)

    if prep_ead.status_code == 200:
        orig_ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
        # store as serialized by xml object, so xml output will be the same
        original_xml = orig_ead.serializeDocument()

        prep_xml = prep_ead.content
        ead = load_xmlobject_from_string(prep_xml, FindingAid)  # validate?

        if mode == 'diff':
            diff = difflib.HtmlDiff(8, 80)  # set columns to wrap at 80 characters
            # generate a html table with line-by-line comparison
            # (meant to be called in a new window)
            changes = diff.make_file(original_xml.split('\n'), prep_xml.split('\n'))
            return HttpResponse(changes)

        elif mode == 'summary':
            # prepared EAD should pass sanity checks required for publication
            errors = utils.check_eadxml(ead)
            changes = list(difflib.unified_diff(original_xml.split('\n'),
                                                prep_xml.split('\n')))
            if not changes:
                messages.info(request,
                              'No changes made to <b>%s</b>; EAD is already prepared.'
                              % filename)
                # redirect to main admin page with code 303 (See Other)
                return HttpResponseSeeOtherRedirect(reverse('fa-admin:index'))

    elif prep_ead.status_code == 500:
        # something went wrong with generating prep xml; could be one of:
        #  - non-well-formed xml (failed to load original document at all)
        #  - error generating an ARK for the document
        errors = [prep_ead.content]

    else:
        # this shouldn't happen; not 200 or 500 == something went dreadfully wrong
        errors = ['Something went wrong trying to load the specified document.',
                  prep_ead.content]  # pass along the output in case it is useful?

    return render(request, 'fa_admin/prepared.html', {
        'filename': filename,
        'changes': changes,
        'errors': errors,
        'xml_status': prep_ead.status_code,
        'archive': arch})
def preview(request, archive):
    """On POST, load the requested EAD file into the eXist *preview*
    collection and redirect to the preview page for that document.

    The document is only loaded when it passes the pre-publication check;
    otherwise the response produced by that check is returned.  When the load
    fails, the publish-errors page is rendered with the collected errors.

    NOTE(review): requests other than POST fall through and return ``None``,
    exactly as before; confirm whether callers rely on a non-POST response.

    :param archive: slug of the :class:`Archive` the file belongs to
    """
    if request.method == 'POST':
        archive = get_object_or_404(Archive, slug=archive)
        filename = request.POST['filename']

        errors = []
        # keep the exception (if any) in a name that is always bound, so the
        # error page can be rendered when the load fails without raising
        err = None
        success = False

        try:
            # only load to exist if document passes publication check
            ok, response, dbpath, fullpath = _prepublication_check(
                request, filename, archive, mode='preview')
            if ok is not True:
                return response

            db = ExistDB()
            # load the document to the *preview* collection in eXist with the
            # same filename
            preview_dbpath = settings.EXISTDB_PREVIEW_COLLECTION + "/" + filename
            # close the file handle as soon as the load is done
            with open(fullpath, 'r') as eadfile:
                success = db.load(eadfile, preview_dbpath, overwrite=True)
        except ExistDBException as e:
            err = e
            success = False
            errors.append(e.message())

        if success:
            # load the file as a FindingAid object so we can generate the preview url
            ead = load_xmlobject_from_file(fullpath, FindingAid)
            messages.success(request, 'Successfully loaded <b>%s</b> for preview.' % filename)
            # redirect to document preview page with code 303 (See Other)
            return HttpResponseSeeOtherRedirect(
                reverse('fa-admin:preview:findingaid', kwargs={'id': ead.eadid}))

        # no exception but no success means the load itself failed;
        # make sure the error page has something to display
        if not errors:
            errors.append('Failed to load the document to the preview collection')

        return render(request, 'fa_admin/publish-errors.html',
                      {'errors': errors, 'filename': filename,
                       'mode': 'preview', 'exception': err})
def test_load_from_file_with_classname(self):
    """Test using shortcut to initialize a named XmlObject class from a file"""

    class TestObject(xmlmap.XmlObject):
        pass

    loaded = xmlmap.load_xmlobject_from_file(self.FILE.name, TestObject)
    # the shortcut should hand back an instance of the requested subclass
    is_test_object = isinstance(loaded, TestObject)
    self.assertTrue(is_test_object)
def test_load_from_file_with_validation(self):
    """Loading with ``validate=True`` raises for invalid or doctype-less
    files and returns an XmlObject for a valid one."""
    load = xmlmap.load_xmlobject_from_file

    # first fixture: has doctype, but not valid
    # second fixture: no doctype
    for fixture in (self.INVALID, self.FILE):
        self.assertRaises(Exception, load, fixture.name, validate=True)

    # doctype, valid
    valid_obj = load(self.VALID.name, validate=True)
    self.assertTrue(isinstance(valid_obj, xmlmap.XmlObject))
def handle(self, *args, **options):
    """Set the unitid identifier attribute on EAD documents.

    Processes the files named in ``args``; when none are given, updates the
    subversion working copy of every defined Archive and processes all
    ``*.xml`` files found there.  For each document the collection number is
    extracted from the archdesc unitid with ``self.unitid_regex`` and stored
    as the unitid identifier; the file is rewritten only when the serialized
    XML actually changed.  A summary of updated / unchanged / errored
    documents is printed at the end.
    """
    verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    v_normal = 1

    if verbosity > v_normal:
        print("Preparing documents from all defined Archives")

    updated = 0
    unchanged = 0
    errored = 0

    if args:
        files = args
    else:
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))  # apparently can't handle unicode
            files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

    for ead_path in files:
        try:
            ead = load_xmlobject_from_file(ead_path, FindingAid)
            # keep the original serialization to detect whether anything changed
            orig_xml = ead.serializeDocument(pretty=True)

            unitid = unicode(ead.archdesc.unitid)
            match = self.unitid_regex.search(unitid)
            if not match:
                raise Exception('Could not determine collection number for %s - %s' %
                                (ead_path, unitid))

            collection_num = match.group('number')
            if verbosity > v_normal:
                print("Identifier for %s is %s (%s)" % (ead_path, collection_num, unitid))
            ead.archdesc.unitid.identifier = collection_num

            if orig_xml == ead.serializeDocument(pretty=True):
                if verbosity > v_normal:
                    print("No changes made to %s" % ead_path)
                unchanged += 1
            else:
                with open(ead_path, 'w') as f:
                    ead.serializeDocument(f, pretty=True)
                if verbosity > v_normal:
                    print("Updated %s" % ead_path)
                updated += 1
        except XMLSyntaxError:
            # xml is not well-formed
            print("Error: failed to load %s (document not well-formed XML?)" % ead_path)
            errored += 1
        except Exception as e:
            # catch any other exceptions so one bad file does not stop the run
            print("Error: failed to set identifier for %s : %s" % (ead_path, e))
            errored += 1

    # summary of what was done
    print("%d document%s updated" % (updated, 's' if updated != 1 else ''))
    print("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
    print("%d document%s with errors" % (errored, 's' if errored != 1 else ''))
def setUp(self):
    """Swap in the mock pid client, remember the pid settings, and load the
    valid and invalid EAD fixtures."""
    # temporarily replace pid client with mock for testing
    self._django_pid_client = utils.DjangoPidmanRestClient
    utils.DjangoPidmanRestClient = MockDjangoPidmanClient

    # save pid config settings to restore in teardown
    pid_setting_names = ('PIDMAN_HOST', 'PIDMAN_USER',
                         'PIDMAN_PASSWORD', 'PIDMAN_DOMAIN')
    self._pid_config = dict((name, getattr(settings, name))
                            for name in pid_setting_names)

    # initialize valid and invalid ead fixtures
    fixture_dir = os.path.join(settings.BASE_DIR, 'fa_admin', 'fixtures')

    self.valid_eadfile = os.path.join(fixture_dir, 'hartsfield558.xml')
    self.valid_ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)

    self.invalid_eadfile = os.path.join(fixture_dir, 'hartsfield558_invalid.xml')
    self.invalid_ead = load_xmlobject_from_file(self.invalid_eadfile, FindingAid)
def preview(request, archive):
    """Load a document for preview (POST) or list previewed documents.

    On POST, the requested EAD file is loaded into the eXist *preview*
    collection, provided it passes the pre-publication check (otherwise the
    response from that check is returned).  On success the user is redirected
    to the preview page for the document; on failure the publish-errors page
    is rendered with the collected errors.

    For any other request method, a list of finding aids in the preview
    collection is rendered.

    :param archive: slug of the :class:`Archive` the file belongs to
    """
    if request.method == 'POST':
        archive = get_object_or_404(Archive, slug=archive)
        filename = request.POST['filename']

        errors = []
        # exception (if any) is kept in its own name so it stays available
        # after the except clause has finished
        err = None
        success = False

        try:
            # only load to exist if document passes publication check
            ok, response, dbpath, fullpath = _prepublication_check(
                request, filename, archive, mode='preview')
            if ok is not True:
                return response

            db = ExistDB()
            # load the document to the *preview* collection in eXist with the
            # same filename
            preview_dbpath = settings.EXISTDB_PREVIEW_COLLECTION + "/" + filename
            # close the file handle as soon as the load is done
            with open(fullpath, 'r') as eadfile:
                success = db.load(eadfile, preview_dbpath)
        except ExistDBException as e:
            err = e
            success = False
            errors.append(e.message())

        if success:
            # load the file as a FindingAid object so we can generate the preview url
            ead = load_xmlobject_from_file(fullpath, FindingAid)
            messages.success(request, 'Successfully loaded <b>%s</b> for preview.' % filename)
            # redirect to document preview page with code 303 (See Other)
            return HttpResponseSeeOtherRedirect(
                reverse('fa-admin:preview:findingaid', kwargs={'id': ead.eadid}))

        # no exception but no success means the load failed;
        # *probably* due to insufficient permissions
        if not errors:
            errors.append('Failed to load the document to the preview collection')

        return render(request, 'fa_admin/publish-errors.html',
                      {'errors': errors, 'filename': filename,
                       'mode': 'preview', 'exception': err})

    # NOTE: preview list is not used anymore; functionality is handled
    # by main admin view; if we revisit preview list, to be more usable it
    # should be filterable by archive
    else:
        fa = get_findingaid(preview=True, only=['eadid', 'list_title', 'last_modified'],
                            order_by='last_modified')
        return render(request, 'fa_admin/preview_list.html',
                      {'findingaids': fa,
                       # 'querytime': [fa.queryTime()]
                       })
def test_run(self):
    """Run the command over every file in the temp directory and check the
    updated identifier, the summary output and the error reporting."""
    # process all files
    with patch('findingaids.fa.models.Archive.svn_local_path', self.tmpdir):
        self.command.run_command('-v', '2')
    output = self.command.output

    # check that correct unitid identifier was set
    hartsfield = load_xmlobject_from_file(self.files['hartsfield558.xml'], FindingAid)
    self.assertEqual(558, hartsfield.archdesc.unitid.identifier)

    # summary lines
    for expected_line in ('2 documents updated', '1 document with errors'):
        self.assertTrue(expected_line in output)

    # badly-formed xml - should be reported
    error_report = re.search(r'^Error.*badlyformed.xml.*not well-formed.*$',
                             output, re.MULTILINE)
    self.assertTrue(error_report,
                    'unitid_identifier reports error for non well-formed xml')

    # files with errors should not be modified
    size_before = self.file_sizes['badlyformed.xml']
    size_after = os.path.getsize(self.files['badlyformed.xml'])
    self.assertEqual(size_before, size_after,
                     'file with errors not modified by unitid_identifier script')
def prepared_eadxml(request, archive, filename):
    """Prepare an EAD file from an archive's subversion directory.

    Per the original documentation: on GET, serves out a prepared version of
    the EAD file, with a response header set so the user should be prompted
    to download the xml under the original filename; on POST, commits the
    prepared version to the subversion directory of the specified archive,
    with a log message indicating the user who requested the commit.

    Steps taken to prepare a document are documented in
    :meth:`~findingaids.fa_admin.utils.prep_ead`.

    The portion shown here looks up a cached prepared document by filename
    and, when nothing is cached, loads and preps the file and caches the
    result; load and prep failures are returned as 500 responses.

    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    """
    # cached copy (if any) is keyed on the bare filename
    prepped_xml = cache.get(filename)

    # find relative to svn path if associated with an archive
    arch = get_object_or_404(Archive, slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)

    if prepped_xml is None:
        try:
            findingaid = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
        except XMLSyntaxError as load_err:
            # xml is not well-formed : return 500 with error message
            return HttpResponseServerError("Could not load document: %s" % load_err)

        # flash messages that appear on the screen for the user; the messages
        # themselves are generated in utils.py
        with message_logging(request, 'findingaids.fa_admin.utils', logging.INFO):
            try:
                findingaid = utils.prep_ead(findingaid, filename)
                prepped_xml = findingaid.serializeDocument()
                cache.set(filename, prepped_xml)
            except Exception as prep_err:
                # any exception on prep is most likely ark generation
                return HttpResponseServerError('Failed to prep the document: ' + str(prep_err))
def prepared_eadxml(request, archive, filename):
    """Prepare an EAD file from an archive's subversion directory.

    Per the original documentation: on GET, serves out a prepared version of
    the EAD file, with a response header set so the user should be prompted
    to download the xml under the original filename; on POST, commits the
    prepared version to the subversion directory of the specified archive,
    with a log message indicating the user who requested the commit.

    Steps taken to prepare a document are documented in
    :meth:`~findingaids.fa_admin.utils.prep_ead`.

    The portion shown here resolves the archive, loads the document, and
    returns a 500 response when the XML is not well-formed.

    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    """
    # find relative to svn path if associated with an archive
    arch = get_object_or_404(Archive, slug=archive)
    svn_dir = arch.svn_local_path
    fullpath = os.path.join(svn_dir, filename)

    try:
        ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
    except XMLSyntaxError as load_err:
        # xml is not well-formed : return 500 with error message
        return HttpResponseServerError("Could not load document: %s" % load_err)
def test_load_from_file(self):
    """Test using shortcut to initialize XmlObject from a file"""
    loaded = xmlmap.load_xmlobject_from_file(self.FILE.name)
    # without an explicit class the shortcut returns a plain XmlObject
    is_xmlobject = isinstance(loaded, xmlmap.XmlObject)
    self.assertTrue(is_xmlobject)
def test_prep_ead(self):
    """Exercise utils.prep_ead against fixtures with series/subseries and
    index, without series, without series unitids, with messy whitespace,
    and with a unittitle that starts with a <title>."""
    fa_fixtures = os.path.join(settings.BASE_DIR, 'fa', 'tests', 'fixtures')
    instance_msg = "prep_ead should return an instance of FindingAid"

    # valid fixtures is an ead with series/subseries, and index
    # - clear out fixture ark url to trigger generating a new one (simulated)
    del self.valid_ead.eadid.url
    del self.valid_ead.eadid.identifier
    prepped = utils.prep_ead(self.valid_ead, self.valid_eadfile)
    self.assertTrue(isinstance(prepped, FindingAid), instance_msg)
    self.assertEqual(u'hartsfield558', prepped.eadid.value)
    self.assertEqual(u'hartsfield558_series1', prepped.dsc.c[0].id)
    self.assertEqual(u'hartsfield558_subseries6.1', prepped.dsc.c[5].c[0].id)
    self.assertEqual(u'hartsfield558_index1', prepped.archdesc.index[0].id)
    # ark should be generated and stored in eadid url
    self.assertEqual(MockDjangoPidmanClient.test_ark, prepped.eadid.url)
    # short-form ark should be stored in identifier attribute
    self.assertTrue(MockDjangoPidmanClient.test_ark.endswith(prepped.eadid.identifier))

    # ead with no series
    pitts_path = os.path.join(fa_fixtures, 'pittsfreeman1036.xml')
    prepped = utils.prep_ead(load_xmlobject_from_file(pitts_path, FindingAid),
                             pitts_path)
    self.assertTrue(isinstance(prepped, FindingAid), instance_msg)
    self.assertEqual(u'pittsfreeman1036', prepped.eadid.value)

    # series with no unitid
    raoul_path = os.path.join(fa_fixtures, 'raoul548.xml')
    prepped = utils.prep_ead(load_xmlobject_from_file(raoul_path, FindingAid),
                             raoul_path)
    self.assertEqual(u'raoul548_series3', prepped.dsc.c[2].id)

    # whitespace cleanup
    prepped = utils.prep_ead(self.invalid_ead, self.invalid_eadfile)
    # - no leading whitespace in list title
    # ead.archdesc.origination is getting normalized, so can't be used for testing
    origination = prepped.node.xpath('//e:origination/e:persname',
                                     namespaces={'e': EAD_NAMESPACE})
    self.assertEqual(u'Hartsfield, William Berry.', origination[0].text)
    # test the node text directly (does not include unitdate)
    self.assertEqual(u'William Berry Hartsfield papers, ', prepped.unittitle.node.text)

    # cleaned-up controlaccess terms
    terms = prepped.archdesc.controlaccess.controlaccess
    self.assertEqual(u'Gone with the wind (Motion picture)', terms[0].title[0].value)
    self.assertEqual(u'Allen, Ivan.', terms[1].person_name[0].value)
    self.assertEqual(u'Mines and mineral resources--Georgia.', terms[3].subject[1].value)
    # unicode characters
    self.assertEqual(u'Motion pictures--Georgia. \u2026', terms[3].subject[2].value)
    self.assertEqual(u'Motion pictures.',
                     prepped.archdesc.controlaccess.controlaccess[-1].genre_form[0].value)

    # remaining errors after clean-up:
    # 1 - duplicate origination
    # 2 - > 2 containers in a did (summary error and list of problem dids)
    # 2 - 1 container in a did (summary error and list of problem dids)
    # = 5
    remaining = utils.check_eadxml(prepped)
    self.assertEqual(5, len(remaining),
                     "only 3 errors (duplicate origination, 3 containers in a did, 1 container in a did) should be left in invalid test fixture after cleaning")

    # special case - unittitle begins with a <title>
    title_path = os.path.join(fa_fixtures, 'pittsfreeman1036.xml')
    prepped = utils.prep_ead(load_xmlobject_from_file(title_path, FindingAid),
                             title_path)
    self.assertFalse(unicode(prepped.list_title).startswith('None'),
                     'cleaned unittitle with leading <title> should not start with "None"')
def test_check_eadxml(self):
    """Check that utils.check_eadxml reports each expected problem in the
    invalid fixture, and stops reporting ARK problems once they are fixed."""
    # messages that are checked more than once below
    url_not_ark = ("eadid url is either not set or not an ARK. "
                   "To correct, run the prep process again.")
    identifier_not_ark = ("eadid identifier is either not set or not an ARK"
                          "To correct, run the prep process again.")
    ark_mismatch = "eadid url and identifier do not match: url '%s' should end with identifier '%s'"

    # use invalid ead fixture to check error detection
    ead = self.invalid_ead
    ead.eadid.value = 'foo#~@/'   # set invalid eadid for this test only
    # invalid fixture has several errors
    errors = utils.check_eadxml(ead)
    self.assertNotEqual(0, len(errors))

    expected_errors = [
        # - series/subseries ids missing, index id missing
        ("series c01 id attribute is not set for Series 1: Personal papers, 1918-1986",
         'c01 missing id error reported'),
        ("subseries c02 id attribute is not set for Subseries 6.1: Minerals and mining files, 1929-1970",
         'c02 missing id error reported'),
        ("index id attribute is not set for Index of Selected Correspondents",
         'index missing id error reported'),
        # - origination count error
        ("Site expects only one archdesc/did/origination; found 2",
         'multiple origination error reported'),
        # - whitespace in list title
        ("Found leading whitespace in list title field (origination/persname): "
         "' Hartsfield, William Berry.'",
         'leading whitespace in origination reported'),
        # - eadid regex
        ("eadid '%s' does not match site URL regular expression" % ead.eadid.value,
         'eadid regex error reported'),
        # ARK in url and identifier not set or invalid
        (url_not_ark, 'eadid ark not in url'),
        (identifier_not_ark, 'eadid ark not in identifier'),
    ]
    for message, failure_note in expected_errors:
        self.assertTrue(message in errors, failure_note)

    # valid ARKs in url and identifier but do not match
    ark1 = "http://testpid.library.emory.edu/ark:/25593/1234"
    ark1_short = "ark:/25593/1234"
    ark2_short = "ark:/25593/567"
    ead.eadid.url = ark1
    ead.eadid.identifier = ark2_short
    errors = utils.check_eadxml(ead)
    self.assertTrue(url_not_ark not in errors, 'valid eadid ark set in url')
    self.assertTrue(identifier_not_ark not in errors, 'valid eadid ark set in identifier')
    self.assertTrue(ark_mismatch % (ark1, ark2_short) in errors,
                    'eadid url and identifier do not march')

    # Change url and identifier to match
    ead.eadid.url = ark1
    ead.eadid.identifier = ark1_short
    errors = utils.check_eadxml(ead)
    self.assertTrue(ark_mismatch % (ark1, ark1_short) not in errors,
                    'eadid url and identifier march')

    # - list title first letter regex
    # simulate non-whitespace, non-alpha first letter in list title
    ead.list_title.node.text = "1234"  # list title is not normally settable; overriding for test
    errors = utils.check_eadxml(ead)
    first_letter_error = "First letter ('1') of list title field origination/persname does not match browse letter URL regex '%s'" % TITLE_LETTERS
    self.assertTrue(first_letter_error in errors,
                    'title first letter regex error reported')

    # empty/unset list title field
    ead.list_title.node.text = None
    errors = utils.check_eadxml(ead)
    self.assertTrue("List title seems to be empty" in errors)

    still_expected = [
        # - whitespace in control access terms
        ("Found leading whitespace in controlaccess term ' Gone with the wind (Motion picture)' (title)",
         'controlaccess title leading whitespace reported'),
        ("Found leading whitespace in controlaccess term ' \t Selznick, David O., 1902-1965.' (persname)",
         'controlaccess name leading whitespace reported'),
        ("Found leading whitespace in controlaccess term ' \t Mines and mineral resources--Georgia.' (subject)",
         'controlaccess subject leading whitespace reported'),
        ("Found leading whitespace in controlaccess term ' Motion pictures.' (genreform)",
         'controlaccess genre leading whitespace reported'),
        # - did with > 2 containers
        ('Site expects maximum of 2 containers per did; found 1 did(s) with more than 2',
         'did with more than 2 containers reported'),
        # - did with only 1 container
        ('Site expects 2 containers per did; found 1 did(s) with only 1',
         'did with only 1 container reported'),
    ]
    for message, failure_note in still_expected:
        self.assertTrue(message in errors, failure_note)

    # make sure we handle quirky document with a <title> at the beginning of the <unittitle>
    eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests', 'fixtures', 'pittsfreeman1036.xml')
    ead_nested_title = load_xmlobject_from_file(eadfile, FindingAid)
    errors = utils.check_eadxml(ead_nested_title)
    no_list_title_errors = all('list title' not in err for err in errors)
    self.assertTrue(no_list_title_errors,
                    'nested <title> in <unittitle> should not generate a list title whitespace error')
def test_check_ead(self):
    """Check utils.check_ead against valid and invalid fixtures, including
    the eadid uniqueness check in eXist and the unittitle leading-whitespace
    check."""
    # check valid EAD - no errors  -- good fixture, should pass all tests
    dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(0, len(errors))

    # should cause several errors - not schema valid, eadid, series/subseries
    # ids missing, index id missing
    errors = utils.check_ead(self.invalid_eadfile, dbpath)
    self.assertNotEqual(0, len(errors))
    # validation error message
    self.assert_("attribute 'invalid': The attribute 'invalid' is not allowed" in errors[0])
    # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
    # duplicate validation errors. work around it for now.
    # (errors seem to be aggregating instead of clearing out....)
    # length guard keeps this from raising IndexError on a single-item list
    while len(errors) > 1 and errors[0] == errors[1]:
        errors.pop(0)
    self.assert_("Line 2" in errors[0], "validation error includes line number")   # validation error message
    self.assert_("eadid 'hartsfield558.xml' does not match expected value" in errors[1])
    self.assert_("series c01 id attribute is not set for Series 1" in errors[2])
    self.assert_("subseries c02 id attribute is not set for Subseries 6.1" in errors[3])
    self.assert_("index id attribute is not set for Index of Selected Correspondents" in errors[4])

    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(0, len(errors))

    # eadid uniqueness check in eXist
    with open(self.valid_eadfile) as eadfile:
        self.db.load(eadfile, dbpath)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    # same eadid, but present in the file that will be updated - no errors
    self.assertEqual(0, len(errors))

    # upload same file to a different path - non-unique eadid error
    with open(self.valid_eadfile) as eadfile:
        self.db.load(eadfile, settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml')
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database already contains 2 instances of eadid" in errors[0])

    # remove version with correct path to test single conflicting eadid
    self.db.removeDocument(dbpath)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database contains eadid 'hartsfield558' in a different document" in errors[0])

    # leading whitespace in unit title
    with tempfile.NamedTemporaryFile(prefix='findingaids-ead-', suffix='xml',
                                     delete=False) as tmpfile:
        # modify fixture to introduce leading whitespace
        ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)
        # check expects eadid to match filename
        ead.eadid.value = os.path.basename(tmpfile.name)
        # add whitespace at beginning of title
        ead.unittitle.text = "\n %s" % ead.unittitle.text
        ead.serializeDocument(tmpfile)
    # leaving the with block closes the file and flushes its content;
    # always remove the temp file, even if the check raises
    try:
        errors = utils.check_ead(tmpfile.name, dbpath)
    finally:
        os.remove(tmpfile.name)
    # should have 1 error for leading whitespace
    self.assertEqual(1, len(errors))
    self.assert_(errors[0].startswith('Found leading whitespace in unittitle'))

    # pomerantz unit title starts with an <emph> tag; test that
    # this doesn't trip up check for leading whitespace in title
    dbpath = settings.EXISTDB_TEST_COLLECTION + '/pomerantz890.xml'
    pomerantz_eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
                                     'fixtures', 'pomerantz890.xml')
    errors = utils.check_ead(pomerantz_eadfile, dbpath)
    # fixture contains subjects with leading whitespace, which is fine
    # we just care that the unittitle check passes
    self.assert_('Found leading whitespace in unittitle:' not in errors[0])
def handle(self, *args, **options):
    """Load EAD documents into the configured eXist collection.

    Processes the files named in ``args``; when none are given, updates the
    subversion working copy of every defined Archive and processes all
    ``*.xml`` files found there.  Each file must pass ``check_ead`` before it
    is loaded.  After a successful load a cached-PDF reload task is queued
    for the document unless ``skip_pdf_reload`` is set.  With ``pdf_only``
    set, no documents are loaded.

    :raises CommandError: when ``pdf_only`` and ``skip_pdf_reload`` are both
        set, or when the EXISTDB_ROOT_COLLECTION setting is missing or empty
    """
    verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    v_normal = 1
    v_all = 2

    if options['pdf_only'] and options['skip_pdf_reload']:
        raise CommandError("Options -s and -p are not compatible")

    # check for required settings
    if not getattr(settings, 'EXISTDB_ROOT_COLLECTION', None):
        raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

    if args:
        files = args
    else:
        # Note: copied from prep_ead manage command; move somewhere common?
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))  # apparently can't handle unicode
            files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

    if verbosity == v_all:
        print('Documents will be loaded to configured eXist collection: %s'
              % settings.EXISTDB_ROOT_COLLECTION)
        if options['skip_pdf_reload']:
            print("** Skipping PDFs cache reload")

    db = ExistDB()

    loaded = 0
    errored = 0
    # NOTE(review): pdf_tasks and start_time are not read anywhere in this
    # function as shown; kept in case later reporting relies on them
    pdf_tasks = {}
    start_time = datetime.now()

    if not options['pdf_only']:
        # unless PDF reload only has been specified, load files
        for ead_path in files:
            try:
                # full path location where file will be loaded in exist db collection
                dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(ead_path)
                errors = check_ead(ead_path, dbpath)
                if errors:
                    # report errors, don't load
                    errored += 1
                    print("Error: %s does not pass publication checks; not loading to eXist."
                          % ead_path)
                    if verbosity >= v_normal:
                        print(" Errors found:")
                        for err in errors:
                            print(" %s" % err)
                else:
                    with open(ead_path, 'r') as eadfile:
                        success = db.load(eadfile, dbpath, overwrite=True)

                    if success:
                        loaded += 1
                        if verbosity >= v_normal:
                            print("Loaded %s" % ead_path)
                        # load the file as a FindingAid object to get the eadid for PDF reload
                        ead = load_xmlobject_from_file(ead_path, FindingAid)

                        # trigger PDF regeneration in the cache and store task result
                        # - unless user has requested PDF reload be skipped
                        if not options['skip_pdf_reload']:
                            pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value)
                            # NOTE: unlike the web admin publish, this does not
                            # generate TaskResult db records; task outcomes will be
                            # checked & reported before the script finishes
                    else:
                        errored += 1
                        print("Error: failed to load %s to eXist" % ead_path)
            except ExistDBException as e:
                print("Error: failed to load %s to eXist" % ead_path)
                print(e.message())
                errored += 1

        # output a summary of what was done
        print("%d document%s loaded" % (loaded, 's' if loaded != 1 else ''))
        print("%d document%s with errors" % (errored, 's' if errored != 1 else ''))
def handle(self, *args, **options):
    """Convert plain-text digitized item ids in EAD unittitles to <dao> tags.

    Processes the files named in ``args``; when none are given, updates the
    subversion working copy of every defined Archive and processes all
    ``*.xml`` files found there.  For each file-level item without existing
    dao tags whose unittitle appears to reference digitized content, the ids
    are parsed, looked up in Solr, removed from the unittitle text, and one
    <dao> is appended per id.  Files are rewritten only when the serialized
    XML changed and ``dryrun`` is not set.  With ``commit`` set, each
    archive's working copy is committed afterwards.

    :raises CommandError: when the KEEP_SOLR_SERVER_URL setting is missing
        or empty
    """
    verbosity = int(options.get('verbosity', self.v_normal))
    svn_commit = options.get('commit', False)
    dry_run = options.get('dryrun', False)

    # check for required settings
    if not getattr(settings, 'KEEP_SOLR_SERVER_URL', None):
        raise CommandError("KEEP_SOLR_SERVER_URL setting is required for this script")

    solr = solr_interface()

    if verbosity > self.v_normal:
        print("Preparing documents from all defined Archives")
        if dry_run:
            print("Running in dry-run mode; no changes will be made")

    updated = 0
    unchanged = 0
    errored = 0

    if args:
        files = args
    else:
        # Note: copied from prep_ead manage command; move somewhere common?
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))  # apparently can't handle unicode
            files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

    for ead_path in files:
        file_items = 0
        daos = 0
        try:
            if verbosity >= self.v_normal and len(files) > 1:
                self.stdout.write('\nProcessing %s' % os.path.basename(ead_path))

            ead = load_xmlobject_from_file(ead_path, FindingAid)
            orig_xml = ead.serializeDocument()  # keep to check if changed

            for c in self.ead_file_items(ead):
                # if item already contains any dao tags, skip it
                # (no further processing needed)
                if c.did.dao_list:
                    continue

                match = self.has_digitized_content(unicode(c.did.unittitle))
                if match:
                    file_items += 1
                    try:
                        id_list = self.id_list(match.groupdict()['ids'])
                    except Exception as e:
                        self.stdout.write('Error parsing ids from "%s" : %s' %
                                          (unicode(c.did.unittitle), e))
                        continue

                    # if no ids were found even though title seemed to have
                    # digitized content, error and skip to next
                    if not id_list:
                        self.stdout.write('Appears to have digitized content, but no ids found in "%s"' %
                                          (unicode(c.did.unittitle)))
                        continue

                    # dictionary for any Keep info corresponding to these ids
                    id_info = {}

                    # look up each id in the Keep
                    for i in id_list:
                        q = solr.query(solr.Q(dm1_id="%s" % i) | solr.Q(pid="emory:%s" % i)) \
                                .field_limit(['ark_uri', 'pid'])
                        # only a single, unambiguous match is used
                        if q.count() == 1:
                            id_info[i] = q[0]

                    # remove the plain-text digitized ids from unittitle content
                    # (handle as unicode to preserve any special characters)
                    # NOTE: because unittitle could contain nested tags (dates,
                    # titles, names, etc), iterate through the text nodes and
                    # remove the digitized note wherever it occurs
                    # - use lxml smart strings to update based on parent nodes
                    text_nodes = c.did.unittitle.node.xpath('text()')
                    for txt in text_nodes:
                        updated_txt = re.sub(self.digitized_ids, u'', txt)
                        if txt.is_text:
                            txt.getparent().text = updated_txt
                        else:
                            txt.getparent().tail = updated_txt

                    # ensure document has xlink namespace declared at the top
                    # or else it will be repeated for each dao

                    for i in id_list:
                        info = id_info.get(i, None)

                        # append a new dao for each id; audience will always be internal
                        dao_opts = {'audience': 'internal'}
                        href = None

                        if info:
                            # in some cases in production, a record is found but no
                            # ark_uri is indexed in solr (indicates ark_uri not in MODS)
                            try:
                                href = info['ark_uri']
                            except KeyError:
                                self.stdout.write('Warning: Keep record was found for %s but no ARK URI is indexed'
                                                  % i)

                        # if no record was found, *should* be a digital masters id
                        if href is None:
                            # if id already starts with dm, don't duplicate the prefix
                            if i.startswith('dm'):
                                dao_opts['id'] = i
                            # if it's a digit, add dm prefix
                            elif i.isdigit():
                                dao_opts['id'] = 'dm%s' % i
                            # otherwise, warn and add the id in pid notation
                            else:
                                # only warn if we didn't already warn about info without ark uri
                                if not info:
                                    self.stdout.write('Warning: non-digital masters id %s not found in the Keep'
                                                      % i)
                                # generate an ark anyway, since pids don't make valid ids
                                href = 'http://pid.emory.edu/ark:/25593/%s' % i

                        c.did.dao_list.append(eadmap.DigitalArchivalObject(**dao_opts))
                        if href is not None:
                            c.did.dao_list[-1].href = href
                        # clean up any extra namespaces (exist-db ns)
                        cleanup_namespaces(c.did.dao_list[-1].node)

                        daos += 1

            # NOTE: could use pretty=True, but not used elsewhere in fa_admin,
            # so leaving off for consistency
            if orig_xml == ead.serializeDocument():
                if verbosity > self.v_normal:
                    self.stdout.write("No changes made to %s" % ead_path)
                unchanged += 1
            else:
                # in dry run, don't actually change the file
                if not dry_run:
                    with open(ead_path, 'w') as f:
                        ead.serializeDocument(f)
                if verbosity >= self.v_normal:
                    self.stdout.write("Updated %s; found %d item%s with digitized content, added %d <dao>%s"
                                      % (ead_path, file_items, 's' if file_items != 1 else '',
                                         daos, 's' if daos != 1 else ''))
                updated += 1
        except XMLSyntaxError:
            # xml is not well-formed
            self.stdout.write("Error: failed to load %s (document not well-formed XML?)"
                              % ead_path)
            errored += 1
        # NOTE: other exceptions are deliberately left to propagate

    # TODO: might be nice to also report total number of daos added

    # summary of what was done
    self.stdout.write("\n%d document%s updated" % (updated, 's' if updated != 1 else ''))
    self.stdout.write("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
    self.stdout.write("%d document%s with errors" % (errored, 's' if errored != 1 else ''))

    if svn_commit:
        svn = svn_client()

        # seems to be the only way to set a commit log message via client
        def get_log_message(arg):
            # argument looks something like this:
            # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
            # ignoring since we will only use this function for a single commit
            return 'converted digitized item ids to <dao> tags'

        svn.log_msg_func = get_log_message

        for archive in Archive.objects.all():
            # commit any changes made in this archive's working copy
            svn.commit(str(archive.svn_local_path))
def handle(self, *args, **options):
    """Prepare EAD documents for publication and save the result in place.

    Processes the files named in ``args``; when none are given, updates the
    subversion working copy of every defined Archive and processes all
    ``*.xml`` files found there.  Each document is run through
    ``utils.prep_ead`` and then checked with ``utils.check_ead``; documents
    that fail the check are not saved, and documents whose serialized XML is
    unchanged are left alone.  A summary of updated / unchanged / errored
    documents is printed at the end.

    :raises CommandError: when the EXISTDB_ROOT_COLLECTION setting is
        missing or empty
    """
    verbosity = int(options['verbosity'])
    self._setup_logging(verbosity)

    # check for required settings
    if not getattr(settings, 'EXISTDB_ROOT_COLLECTION', None):
        raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

    if verbosity == self.v_all:
        print("Preparing documents from all defined Archives")

    updated = 0
    unchanged = 0
    errored = 0

    if args:
        files = args
    else:
        files = set()
        svn = svn_client()
        for archive in Archive.objects.all():
            # update to make sure we have latest version of everything
            svn.update(str(archive.svn_local_path))  # apparently can't handle unicode
            files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

    for ead_path in files:
        try:
            ead = load_xmlobject_from_file(ead_path, FindingAid)
            # keep the original serialization to detect whether anything changed
            orig_xml = ead.serializeDocument(pretty=True)
            ead = utils.prep_ead(ead, ead_path)

            # sanity check before saving
            dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(ead_path)
            errors = utils.check_ead(ead_path, dbpath, xml=ead.serializeDocument())
            if errors:
                errored += 1
                print("Prepared EAD for %s does not pass sanity checks, not saving." % ead_path)
                if verbosity >= self.v_normal:
                    print("Errors found:")
                    for err in errors:
                        # some errors include a list of error instances - display nicely
                        if isinstance(err, list):
                            for suberr in err:
                                print(" %s" % suberr)
                        else:
                            print(" %s" % err)
            elif orig_xml == ead.serializeDocument(pretty=True):
                if verbosity >= self.v_normal:
                    print("No changes made to %s" % ead_path)
                unchanged += 1
            else:
                with open(ead_path, 'w') as f:
                    ead.serializeDocument(f, pretty=True)
                if verbosity >= self.v_normal:
                    print("Updated %s" % ead_path)
                updated += 1
        except XMLSyntaxError:
            # xml is not well-formed
            print("Error: failed to load %s (document not well-formed XML?)" % ead_path)
            errored += 1
        except Exception as e:
            # catch any other exceptions so one bad file does not stop the run
            print("Error: failed to prep %s : %s" % (ead_path, e))
            errored += 1

    # summary of what was done
    print("%d document%s updated" % (updated, 's' if updated != 1 else ''))
    print("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
    print("%d document%s with errors" % (errored, 's' if errored != 1 else ''))