def test_check_ead(self):
    # check valid EAD - no errors -- good fixture, should pass all tests
    dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(0, len(errors))

    # should cause several errors - not schema valid, eadid,
    # series/subseries ids missing, index id missing
    errors = utils.check_ead(self.invalid_eadfile, dbpath)
    self.assertNotEqual(0, len(errors))
    # validation error message
    self.assert_("attribute 'invalid': The attribute 'invalid' is not allowed"
                 in errors[0])
    # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
    # duplicate validation errors. work around it for now.
    # (errors seem to be aggregating instead of clearing out....)
    while errors[0] == errors[1]:
        errors.pop(0)
    # validation error message
    self.assert_("Line 2" in errors[0],
                 "validation error includes line number")
    self.assert_("eadid 'hartsfield558.xml' does not match expected value"
                 in errors[1])
    self.assert_("series c01 id attribute is not set for Series 1"
                 in errors[2])
    self.assert_("subseries c02 id attribute is not set for Subseries 6.1"
                 in errors[3])
    self.assert_("index id attribute is not set for Index of Selected Correspondents"
                 in errors[4])

    # eadid uniqueness check in eXist
    self.db.load(open(self.valid_eadfile), dbpath, True)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    # same eadid, but present in the file that will be updated - no errors
    self.assertEqual(0, len(errors))

    # upload same file to a different path - non-unique eadid error
    self.db.load(open(self.valid_eadfile),
                 settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml',
                 True)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database already contains 2 instances of eadid" in errors[0])

    # remove version with correct path to test single conflicting eadid
    self.db.removeDocument(dbpath)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database contains eadid 'hartsfield558' in a different document"
                 in errors[0])
def _prepublication_check(request, filename, archive, mode='publish', xml=None):
    """
    Pre-publication check logic common to :meth:`publish` and :meth:`preview`.

    Generates a full path to the file in the configured EAD source directory,
    and the expected published location in eXist, and then runs
    :meth:`~findingaids.fa_admin.utils.check_ead` to check the xml for errors.

    If there are errors, will generate an error response that can be displayed.

    :param request: request object passed into the view (for generating error response)
    :param filename: base filename of the ead file to be checked
    :param archive: :class:`~findingaids.fa.models.Archive`, used to locate
        the file on disk
    :param mode: optional mode, for display on error page (defaults to publish)
    :rtype: list
    :returns: list of the following:
        - boolean ok (if True, all checks passed)
        - HttpResponse response error response to display, if there were errors
        - dbpath - full path to publication location in configured eXist db
        - fullpath - full path to the file in the configured source directory
    """
    # full path to the local file
    fullpath = os.path.join(archive.svn_local_path, filename)
    # full path in exist db collection
    dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + filename

    errors = utils.check_ead(fullpath, dbpath, xml)
    # all checks passed only when no errors were reported
    ok = not errors
    response = None
    if errors:
        # build the error page so the caller can return it directly
        response = render(request, 'fa_admin/publish-errors.html',
                          {'errors': errors, 'filename': filename,
                           'mode': mode})
    return [ok, response, dbpath, fullpath]
def handle(self, *args, **options): verbosity = int(options['verbosity']) # 1 = normal, 0 = minimal, 2 = all v_normal = 1 v_all = 2 if options['pdf_only'] and options['skip_pdf_reload']: raise CommandError("Options -s and -p are not compatible") # check for required settings if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION: raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing") return if len(args): files = args else: # Note: copied from prep_ead manage command; move somewhere common? files = set() svn = svn_client() for archive in Archive.objects.all(): # update to make sure we have latest version of everything svn.update(str(archive.svn_local_path)) # apparently can't handle unicode files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))) if verbosity == v_all: print 'Documents will be loaded to configured eXist collection: %s' \ % settings.EXISTDB_ROOT_COLLECTION if options['skip_pdf_reload']: print "** Skipping PDFs cache reload" db = ExistDB() loaded = 0 errored = 0 pdf_tasks = {} start_time = datetime.now() if not options['pdf_only']: # unless PDF reload only has been specified, load files for file in files: try: # full path location where file will be loaded in exist db collection dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file) errors = check_ead(file, dbpath) if errors: # report errors, don't load errored += 1 print "Error: %s does not pass publication checks; not loading to eXist." 
% file if verbosity >= v_normal: print " Errors found:" for err in errors: print " %s" % err else: with open(file, 'r') as eadfile: success = db.load(eadfile, dbpath, overwrite=True) if success: loaded += 1 if verbosity >= v_normal: print "Loaded %s" % file # load the file as a FindingAid object to get the eadid for PDF reload ead = load_xmlobject_from_file(file, FindingAid) # trigger PDF regeneration in the cache and store task result # - unless user has requested PDF reload be skipped if not options['skip_pdf_reload']: pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value) # NOTE: unlike the web admin publish, this does not # generate TaskResult db records; task outcomes will be # checked & reported before the script finishes else: errored += 1 print "Error: failed to load %s to eXist" % file except ExistDBException, e: print "Error: failed to load %s to eXist" % file print e.message() errored += 1 # output a summary of what was done print "%d document%s loaded" % (loaded, 's' if loaded != 1 else '') print "%d document%s with errors" % (errored, 's' if errored != 1 else '')
def test_check_ead(self):
    # check valid EAD - no errors -- good fixture, should pass all tests
    dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(0, len(errors))

    # should cause several errors - not schema valid, eadid,
    # series/subseries ids missing, index id missing
    errors = utils.check_ead(self.invalid_eadfile, dbpath)
    self.assertNotEqual(0, len(errors))
    # validation error message
    self.assert_("attribute 'invalid': The attribute 'invalid' is not allowed"
                 in errors[0])
    # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
    # duplicate validation errors. work around it for now.
    # (errors seem to be aggregating instead of clearing out....)
    while errors[0] == errors[1]:
        errors.pop(0)
    # validation error message
    self.assert_("Line 2" in errors[0],
                 "validation error includes line number")
    self.assert_("eadid 'hartsfield558.xml' does not match expected value"
                 in errors[1])
    self.assert_("series c01 id attribute is not set for Series 1"
                 in errors[2])
    self.assert_("subseries c02 id attribute is not set for Subseries 6.1"
                 in errors[3])
    self.assert_("index id attribute is not set for Index of Selected Correspondents"
                 in errors[4])

    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(0, len(errors))

    # eadid uniqueness check in eXist
    self.db.load(open(self.valid_eadfile), dbpath)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    # same eadid, but present in the file that will be updated - no errors
    self.assertEqual(0, len(errors))

    # upload same file to a different path - non-unique eadid error
    self.db.load(open(self.valid_eadfile),
                 settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml')
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database already contains 2 instances of eadid" in errors[0])

    # remove version with correct path to test single conflicting eadid
    self.db.removeDocument(dbpath)
    errors = utils.check_ead(self.valid_eadfile, dbpath)
    self.assertEqual(1, len(errors))
    self.assert_("Database contains eadid 'hartsfield558' in a different document"
                 in errors[0])

    # leading whitespace in unit title
    with tempfile.NamedTemporaryFile(prefix='findingaids-ead-', suffix='xml',
                                     delete=False) as tmpfile:
        # modify fixture to introduce leading whitespace
        ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)
        # check expects eadid to match filename
        ead.eadid.value = os.path.basename(tmpfile.name)
        # add whiespace at beginning of title
        ead.unittitle.text = "\n %s" % ead.unittitle.text
        ead.serializeDocument(tmpfile)
        # close to flush content
        tmpfile.close()

    errors = utils.check_ead(tmpfile.name, dbpath)
    os.remove(tmpfile.name)
    # should have 1 error for leading whitespace
    self.assertEqual(1, len(errors))
    self.assert_(errors[0].startswith('Found leading whitespace in unittitle'))

    # pomerantz unit title starts with an <emph> tag; test that
    # this doesn't trip up check for leading whitespace in title
    dbpath = settings.EXISTDB_TEST_COLLECTION + '/pomerantz890.xml'
    pomerantz_eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
                                     'fixtures', 'pomerantz890.xml')
    errors = utils.check_ead(pomerantz_eadfile, dbpath)
    # fixture contains subjects with leading whitespace, which is fine
    # we just care that the unittitle check passes
    self.assert_('Found leading whitespace in unittitle:' not in errors[0])
def handle(self, *args, **options): verbosity = int(options['verbosity']) self._setup_logging(verbosity) # check for required settings if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION: raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing") return if verbosity == self.v_all: print "Preparing documents from all defined Archives" updated = 0 unchanged = 0 errored = 0 if len(args): files = args else: files = set() svn = svn_client() for archive in Archive.objects.all(): # update to make sure we have latest version of everything svn.update(str(archive.svn_local_path)) # apparently can't handle unicode files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))) for file in files: try: ead = load_xmlobject_from_file(file, FindingAid) orig_xml = ead.serializeDocument(pretty=True) ead = utils.prep_ead(ead, file) # sanity check before saving dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file) errors = utils.check_ead(file, dbpath, xml=ead.serializeDocument()) if errors: errored += 1 print "Prepared EAD for %s does not pass sanity checks, not saving." % file if verbosity >= self.v_normal: print "Errors found:" for err in errors: # some errors include a list of error instances - display nicely if isinstance(err, list): for suberr in err: print " %s" % suberr else: print " %s" % err elif orig_xml == ead.serializeDocument(pretty=True): if verbosity >= self.v_normal: print "No changes made to %s" % file unchanged += 1 else: with open(file, 'w') as f: ead.serializeDocument(f, pretty=True) if verbosity >= self.v_normal: print "Updated %s" % file updated += 1 except XMLSyntaxError, e: # xml is not well-formed print "Error: failed to load %s (document not well-formed XML?)" \ % file errored += 1 except Exception, e: # catch any other exceptions print "Error: failed to prep %s : %s" % (file, e) errored += 1