Example #1
0
    def test_check_ead(self):
        """Exercise ``utils.check_ead`` with valid and invalid EAD fixtures.

        Checks that a valid fixture yields no errors, that an invalid
        fixture yields the expected messages in order, and that eadid
        uniqueness conflicts in the eXist test collection are reported.
        """
        # check valid EAD - no errors  -- good fixture, should pass all tests
        dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(0, len(errors))

        # should cause several errors - not schema valid, eadid,
        # series/subseries ids missing, index id missing
        errors = utils.check_ead(self.invalid_eadfile, dbpath)
        self.assertNotEqual(0, len(errors))
        self.assertTrue("attribute 'invalid': The attribute 'invalid' is not allowed"
                        in errors[0])   # validation error message

        # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
        # duplicate validation errors. work around it for now.
        # (errors seem to be aggregating instead of clearing out....)
        # The length guard avoids an IndexError when only one entry remains.
        while len(errors) > 1 and errors[0] == errors[1]:
            errors.pop(0)
        self.assertTrue("Line 2" in errors[0],
                        "validation error includes line number")   # validation error message
        self.assertTrue("eadid 'hartsfield558.xml' does not match expected value" in errors[1])
        self.assertTrue("series c01 id attribute is not set for Series 1" in errors[2])
        self.assertTrue("subseries c02 id attribute is not set for Subseries 6.1" in errors[3])
        self.assertTrue("index id attribute is not set for Index of Selected Correspondents" in errors[4])

        # eadid uniqueness check in eXist
        # (third positional argument to load is presumably the overwrite flag)
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile, dbpath, True)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        # same eadid, but present in the file that will be updated - no errors
        self.assertEqual(0, len(errors))

        # upload same file to a different path - non-unique eadid error
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile,
                         settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml',
                         True)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database already contains 2 instances of eadid" in errors[0])

        # remove version with correct path to test single conflicting eadid
        self.db.removeDocument(dbpath)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database contains eadid 'hartsfield558' in a different document" in errors[0])
Example #2
0
def _prepublication_check(request, filename, archive, mode='publish', xml=None):
    """
    Shared pre-publication check used by :meth:`publish` and :meth:`preview`.

    Builds the full path to the file inside the archive's local source
    directory and the location it would be published to in eXist, then
    hands both to :meth:`~findingaids.fa_admin.utils.check_ead` to look
    for problems in the xml.

    When problems are found, an error response suitable for display is
    rendered from the ``fa_admin/publish-errors.html`` template.

    :param request: request object passed into the view (needed to render
        the error response)
    :param filename: base filename of the ead file to be checked
    :param archive: :class:`~findingaids.fa.models.Archive`, used to locate
        the file on disk
    :param mode: optional mode, for display on error page (defaults to publish)
    :param xml: optional value passed straight through to ``check_ead`` as
        its third argument (presumably already-serialized xml content to
        check instead of reading the file -- confirm against ``check_ead``)

    :rtype: list
    :returns: list of the following:
      - boolean ok (if True, all checks passed)
      - HttpResponse response error response to display, if there were
        errors; otherwise None
      - dbpath - full path to publication location in configured eXist db
      - fullpath - full path to the file in the configured source directory
    """
    # where the file lives locally, and where it would live in eXist
    fullpath = os.path.join(archive.svn_local_path, filename)
    dbpath = "/".join([settings.EXISTDB_ROOT_COLLECTION, filename])

    problems = utils.check_ead(fullpath, dbpath, xml)
    if not problems:
        # nothing to report: no error response is needed
        return [True, None, dbpath, fullpath]

    template_context = {
        'errors': problems,
        'filename': filename,
        'mode': mode,
    }
    error_response = render(request, 'fa_admin/publish-errors.html',
                            template_context)
    return [False, error_response, dbpath, fullpath]
Example #3
0
    def handle(self, *args, **options):
        """Check EAD files and load the ones that pass into eXist.

        Files to process are taken from ``args`` when any are given.
        Otherwise the subversion working copy of every Archive is updated
        and all ``*.xml`` files found there are used.  Each file is run
        through ``check_ead``; files with errors are reported and skipped,
        the rest are loaded to ``settings.EXISTDB_ROOT_COLLECTION``
        (overwriting any existing document).  Unless ``skip_pdf_reload`` is
        set, a ``reload_cached_pdf`` task is queued for every loaded document.

        :param args: optional paths of the files to load
        :param options: command options; this block reads ``verbosity``,
            ``pdf_only`` and ``skip_pdf_reload``
        :raises CommandError: if ``pdf_only`` and ``skip_pdf_reload`` are both
            set, or if ``EXISTDB_ROOT_COLLECTION`` is missing or empty
        """
        verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        v_normal = 1
        v_all = 2

        if options['pdf_only'] and options['skip_pdf_reload']:
            raise CommandError("Options -s and -p are not compatible")

        # check for required settings (missing and empty are both rejected)
        if not getattr(settings, 'EXISTDB_ROOT_COLLECTION', None):
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

        if args:
            files = args
        else:
            # Note: copied from prep_ead manage command; move somewhere common?
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        # NOTE: print is always called with a single parenthesised argument,
        # which produces identical output as a Python 2 print statement and
        # is also valid as a Python 3 function call.
        if verbosity == v_all:
            print('Documents will be loaded to configured eXist collection: %s'
                  % settings.EXISTDB_ROOT_COLLECTION)
            if options['skip_pdf_reload']:
                print("** Skipping PDFs cache reload")

        db = ExistDB()

        loaded = 0
        errored = 0
        # eadid -> queued PDF reload task result.
        # NOTE(review): populated below but not read in the visible code;
        # presumably checked further on -- confirm.
        pdf_tasks = {}

        # NOTE(review): not read in the visible code; presumably used for
        # timing/reporting further on -- confirm.
        start_time = datetime.now()

        # unless PDF reload only has been specified, load files
        if not options['pdf_only']:

            for file in files:
                try:
                    # full path location where file will be loaded in exist db collection
                    dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                    errors = check_ead(file, dbpath)
                    if errors:
                        # report errors, don't load
                        errored += 1
                        print("Error: %s does not pass publication checks; not loading to eXist." % file)
                        if verbosity >= v_normal:
                            print("  Errors found:")
                            for err in errors:
                                print("    %s" % err)
                    else:
                        with open(file, 'r') as eadfile:
                            success = db.load(eadfile, dbpath, overwrite=True)

                        if success:
                            loaded += 1
                            if verbosity >= v_normal:
                                print("Loaded %s" % file)
                            # load the file as a FindingAid object to get the eadid for PDF reload
                            ead = load_xmlobject_from_file(file, FindingAid)

                            # trigger PDF regeneration in the cache and store task result
                            # - unless user has requested PDF reload be skipped
                            if not options['skip_pdf_reload']:
                                pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value)
                                # NOTE: unlike the web admin publish, this does not
                                # generate TaskResult db records; task outcomes will be
                                # checked & reported before the script finishes
                        else:
                            errored += 1
                            print("Error: failed to load %s to eXist" % file)
                except ExistDBException as e:
                    print("Error: failed to load %s to eXist" % file)
                    print(e.message())
                    errored += 1

            # output a summary of what was done
            print("%d document%s loaded" % (loaded, 's' if loaded != 1 else ''))
            print("%d document%s with errors" % (errored, 's' if errored != 1 else ''))
Example #4
0
    def test_check_ead(self):
        """Exercise ``utils.check_ead`` with valid and invalid EAD fixtures.

        Checks that a valid fixture yields no errors, that an invalid
        fixture yields the expected messages in order, that eadid
        uniqueness conflicts in the eXist test collection are reported,
        and that leading whitespace in the unit title is detected without
        being tripped up by a title that starts with an ``<emph>`` tag.
        """
        # check valid EAD - no errors  -- good fixture, should pass all tests
        dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(0, len(errors))

        # should cause several errors - not schema valid, eadid,
        # series/subseries ids missing, index id missing
        errors = utils.check_ead(self.invalid_eadfile, dbpath)
        self.assertNotEqual(0, len(errors))
        self.assertTrue("attribute 'invalid': The attribute 'invalid' is not allowed"
                        in errors[0])   # validation error message

        # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
        # duplicate validation errors. work around it for now.
        # (errors seem to be aggregating instead of clearing out....)
        # The length guard avoids an IndexError when only one entry remains.
        while len(errors) > 1 and errors[0] == errors[1]:
            errors.pop(0)
        self.assertTrue("Line 2" in errors[0],
                        "validation error includes line number")   # validation error message
        self.assertTrue("eadid 'hartsfield558.xml' does not match expected value" in errors[1])
        self.assertTrue("series c01 id attribute is not set for Series 1" in errors[2])
        self.assertTrue("subseries c02 id attribute is not set for Subseries 6.1" in errors[3])
        self.assertTrue("index id attribute is not set for Index of Selected Correspondents" in errors[4])

        # valid fixture is checked again after the invalid one
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(0, len(errors))

        # eadid uniqueness check in eXist
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile, dbpath)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        # same eadid, but present in the file that will be updated - no errors
        self.assertEqual(0, len(errors))

        # upload same file to a different path - non-unique eadid error
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile,
                         settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml')
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database already contains 2 instances of eadid" in errors[0])

        # remove version with correct path to test single conflicting eadid
        self.db.removeDocument(dbpath)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database contains eadid 'hartsfield558' in a different document" in errors[0])

        # leading whitespace in unit title
        # NOTE(review): suffix is 'xml' without a dot; left unchanged because
        # the eadid is set to the temp file's basename and the eadid check may
        # depend on that name not ending in '.xml' -- confirm before changing.
        with tempfile.NamedTemporaryFile(prefix='findingaids-ead-',
                                         suffix='xml', delete=False) as tmpfile:
            # modify fixture to introduce leading whitespace
            ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)
            # check expects eadid to match filename
            ead.eadid.value = os.path.basename(tmpfile.name)
            # add whitespace at beginning of title
            ead.unittitle.text = "\n  %s" % ead.unittitle.text
            ead.serializeDocument(tmpfile)
        # leaving the with block closes the file, flushing its content;
        # delete=False keeps it on disk until it is removed below

        try:
            errors = utils.check_ead(tmpfile.name, dbpath)
        finally:
            # always clean up the temp file, even if the check raises
            os.remove(tmpfile.name)

        # should have 1 error for leading whitespace
        self.assertEqual(1, len(errors))
        self.assertTrue(errors[0].startswith('Found leading whitespace in unittitle'))

        # pomerantz unit title starts with an <emph> tag; test that
        # this doesn't trip up check for leading whitespace in title
        dbpath = settings.EXISTDB_TEST_COLLECTION + '/pomerantz890.xml'
        pomerantz_eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
                                         'fixtures', 'pomerantz890.xml')
        errors = utils.check_ead(pomerantz_eadfile, dbpath)
        # fixture contains subjects with leading whitespace, which is fine
        # we just care that the unittitle check passes
        self.assertTrue('Found leading whitespace in unittitle:' not in
                        errors[0])
Example #5
0
    def handle(self, *args, **options):
        """Prepare EAD files in place, saving only those that pass sanity checks.

        Files to process are taken from ``args`` when any are given.
        Otherwise the subversion working copy of every Archive is updated
        and all ``*.xml`` files found there are used.  Each file is loaded
        as a FindingAid, run through ``utils.prep_ead`` and then checked
        with ``utils.check_ead``.  A file is rewritten only when the prepared
        document passes the checks and differs from the original.

        :param args: optional paths of the files to prepare
        :param options: command options; this block reads ``verbosity``
        :raises CommandError: if ``EXISTDB_ROOT_COLLECTION`` is missing or
            empty
        """
        verbosity = int(options['verbosity'])

        self._setup_logging(verbosity)

        # check for required settings (missing and empty are both rejected)
        if not getattr(settings, 'EXISTDB_ROOT_COLLECTION', None):
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

        # NOTE: print is always called with a single parenthesised argument,
        # which produces identical output as a Python 2 print statement and
        # is also valid as a Python 3 function call.
        if verbosity == self.v_all:
            print("Preparing documents from all defined Archives")

        # NOTE(review): these counters are updated below but not reported in
        # the visible code; presumably summarized further on -- confirm.
        updated = 0
        unchanged = 0
        errored = 0

        if args:
            files = args
        else:
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        for file in files:
            try:
                ead = load_xmlobject_from_file(file, FindingAid)
                orig_xml = ead.serializeDocument(pretty=True)
                ead = utils.prep_ead(ead, file)
                # sanity check before saving
                dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                errors = utils.check_ead(file, dbpath, xml=ead.serializeDocument())
                if errors:
                    errored += 1
                    print("Prepared EAD for %s does not pass sanity checks, not saving." % file)
                    if verbosity >= self.v_normal:
                        print("Errors found:")
                        for err in errors:
                            # some errors include a list of error instances - display nicely
                            if isinstance(err, list):
                                for suberr in err:
                                    print("    %s" % suberr)
                            else:
                                print("  %s" % err)
                    continue

                # compare pretty-printed output before and after prep to
                # decide whether the file on disk needs to be rewritten
                prepared_xml = ead.serializeDocument(pretty=True)
                if orig_xml == prepared_xml:
                    if verbosity >= self.v_normal:
                        print("No changes made to %s" % file)
                    unchanged += 1
                else:
                    with open(file, 'w') as f:
                        ead.serializeDocument(f, pretty=True)
                    if verbosity >= self.v_normal:
                        print("Updated %s" % file)
                    updated += 1
            except XMLSyntaxError:
                # xml is not well-formed
                print("Error: failed to load %s (document not well-formed XML?)"
                      % file)
                errored += 1
            except Exception as e:
                # deliberate catch-all so one bad file does not stop the run
                print("Error: failed to prep %s : %s" % (file, e))
                errored += 1