Esempio n. 1
0
def prepared_ead(request, archive, filename, mode):
    """Display information about changes made by preparing an EAD file for
    publication.  If no changes are made, user will be redirected to main admin
    page with a message to that effect.

    In **summary** mode, displays a brief, color-coded summary of changes between
    original and prepped version of the file.  In **diff** mode, displays a full,
    side-by-side diff generated by :class:`difflib.HtmlDiff`.  (Note: because it
    is very large, the full diff is *not* embedded in the site template, and is
    intended to be opened in a new window.)

    :param request: the current HTTP request
    :param archive: slug of the :class:`Archive` the file belongs to
    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    :param mode: one of **diff** or **summary**; any other value renders the
        template with empty change and error lists

    """

    # determine full path based on archive / svn
    # NOTE: an unknown slug raises Archive.DoesNotExist here;
    # get_object_or_404(Archive, slug=archive) would return a 404 instead
    arch = Archive.objects.get(slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)
    # bind both lists up front so that every path reaching the final render
    # (including an unrecognized mode) has them defined
    changes = []
    errors = []

    # TODO: expire cache if file has changed since prepped eadxml was cached
    prep_ead = prepared_eadxml(request, arch.slug, filename)

    if prep_ead.status_code == 200:
        orig_ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
        # store as serialized by xml object, so xml output will be the same
        original_xml = orig_ead.serializeDocument()

        prep_xml = prep_ead.content
        ead = load_xmlobject_from_string(prep_xml, FindingAid)  # validate?

        # both modes compare the same line-by-line views of the documents
        original_lines = original_xml.split('\n')
        prepped_lines = prep_xml.split('\n')

        if mode == 'diff':
            diff = difflib.HtmlDiff(8, 80)  # set columns to wrap at 80 characters
            # generate a html table with line-by-line comparison (meant to be called in a new window)
            changes = diff.make_file(original_lines, prepped_lines)
            return HttpResponse(changes)
        elif mode == 'summary':
            # prepared EAD should pass sanity checks required for publication
            errors = utils.check_eadxml(ead)
            changes = list(difflib.unified_diff(original_lines, prepped_lines))
            if not changes:
                messages.info(request, 'No changes made to <b>%s</b>; EAD is already prepared.' % filename)
                # redirect to main admin page with code 303 (See Other)
                return HttpResponseSeeOtherRedirect(reverse('fa-admin:index'))
    elif prep_ead.status_code == 500:
        # something went wrong with generating prep xml; could be one of:
        # - non-well-formed xml (failed to load original document at all)
        # - error generating an ARK for the document
        errors = [prep_ead.content]
    else:
        # this shouldn't happen; not 200 or 500 == something went dreadfully wrong
        errors = ['Something went wrong trying to load the specified document.',
                  prep_ead.content]     # pass along the output in case it is useful?

    return render(request, 'fa_admin/prepared.html', {
        'filename': filename,
        'changes': changes, 'errors': errors,
        'xml_status': prep_ead.status_code,
        'archive': arch})
Esempio n. 2
0
def preview(request, archive):
    """Load an EAD document to the eXist preview collection.

    On POST, runs the pre-publication check for the file named in
    ``request.POST['filename']``; if the check does not pass, the response
    it produced is returned.  Otherwise the document is loaded to the
    preview collection and the user is redirected (303) to the preview page
    for that finding aid.  If the load fails, the publish-errors template is
    rendered with the collected error messages.

    NOTE(review): non-POST requests fall through and return ``None``.

    :param request: the current HTTP request
    :param archive: slug of the :class:`Archive` the file belongs to
    """
    if request.method == 'POST':

        archive = get_object_or_404(Archive, slug=archive)
        filename = request.POST['filename']

        errors = []
        # stays None unless eXist raises, so the error template can always
        # be rendered
        exception = None

        try:
            # only load to exist if document passes publication check
            ok, response, dbpath, fullpath = _prepublication_check(request, filename,
                archive, mode='preview')
            if ok is not True:
                return response

            db = ExistDB()
            # load the document to the *preview* collection in eXist with the same filename
            preview_dbpath = settings.EXISTDB_PREVIEW_COLLECTION + "/" + filename
            # close the source file as soon as the load completes
            with open(fullpath, 'r') as eadfile:
                success = db.load(eadfile, preview_dbpath, overwrite=True)
        except ExistDBException as err:
            success = False
            exception = err
            errors.append(err.message())

        if success:
            # load the file as a FindingAid object so we can generate the preview url
            ead = load_xmlobject_from_file(fullpath, FindingAid)
            messages.success(request, 'Successfully loaded <b>%s</b> for preview.' % filename)
            # redirect to document preview page with code 303 (See Other)
            return HttpResponseSeeOtherRedirect(reverse('fa-admin:preview:findingaid', kwargs={'id': ead.eadid}))
        else:
            # no exception but no success means the load itself failed;
            # make sure the user still gets an explanation
            if not errors:
                errors.append('Failed to load the document to the preview collection')

            return render(request, 'fa_admin/publish-errors.html',
                    {'errors': errors, 'filename': filename, 'mode': 'preview', 'exception': exception})
Esempio n. 3
0
    def test_load_from_file_with_classname(self):
        """Test using shortcut to initialize named XmlObject class from a file"""
        class TestObject(xmlmap.XmlObject):
            pass

        obj = xmlmap.load_xmlobject_from_file(self.FILE.name, TestObject)
        # assertTrue instead of the deprecated assert_ alias
        self.assertTrue(isinstance(obj, TestObject))
Esempio n. 4
0
 def test_load_from_file_with_validation(self):
     """Test loading an XmlObject from a file with validation turned on"""
     # has doctype, but not valid
     self.assertRaises(Exception, xmlmap.load_xmlobject_from_file, self.INVALID.name, validate=True)
     # no doctype
     self.assertRaises(Exception, xmlmap.load_xmlobject_from_file, self.FILE.name, validate=True)
     # doctype, valid
     obj = xmlmap.load_xmlobject_from_file(self.VALID.name, validate=True)
     # assertTrue instead of the deprecated assert_ alias
     self.assertTrue(isinstance(obj, xmlmap.XmlObject))
Esempio n. 5
0
 def test_load_from_file_with_validation(self):
     """Test loading an XmlObject from a file with validation turned on"""
     # has doctype, but not valid
     self.assertRaises(Exception, xmlmap.load_xmlobject_from_file, self.INVALID.name, validate=True)
     # no doctype
     self.assertRaises(Exception, xmlmap.load_xmlobject_from_file, self.FILE.name, validate=True)
     # doctype, valid
     obj = xmlmap.load_xmlobject_from_file(self.VALID.name, validate=True)
     # assertTrue instead of the deprecated assert_ alias
     self.assertTrue(isinstance(obj, xmlmap.XmlObject))
Esempio n. 6
0
    def test_load_from_file_with_classname(self):
        """Test using shortcut to initialize named XmlObject class from a file"""

        class TestObject(xmlmap.XmlObject):
            pass

        obj = xmlmap.load_xmlobject_from_file(self.FILE.name, TestObject)
        # assertTrue instead of the deprecated assert_ alias
        self.assertTrue(isinstance(obj, TestObject))
    def handle(self, *args, **options):
        """Set the ``identifier`` on the archdesc unitid of EAD documents.

        Documents to process are the paths given as positional arguments;
        when none are given, every ``*.xml`` file in the ``svn_local_path``
        of each :class:`Archive` is used, after an svn update of that path.
        For each document the collection number is extracted from the unitid
        text with ``self.unitid_regex`` and stored as the unitid identifier.
        A file is rewritten only when its serialized XML actually changed.
        Documents that cannot be loaded or processed are reported and
        counted as errors; processing continues with the next file.
        """
        verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        v_normal = 1

        # single-argument print(...) behaves the same with or without
        # print_function, so these stay valid on Python 2 and 3
        if verbosity > v_normal:
            print("Preparing documents from all defined Archives")

        updated = 0
        unchanged = 0
        errored = 0

        if args:
            files = args
        else:
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

        for path in files:
            try:
                ead = load_xmlobject_from_file(path, FindingAid)
                # snapshot before modification, to detect whether anything changed
                orig_xml = ead.serializeDocument(pretty=True)
                unitid = unicode(ead.archdesc.unitid)

                match = self.unitid_regex.search(unitid)
                if not match:
                    raise Exception('Could not determine collection number for %s - %s' %
                                    (path, unitid))

                collection_num = match.group('number')
                if verbosity > v_normal:
                    print("Identifier for %s is %s (%s)" % (path, collection_num, unitid))
                ead.archdesc.unitid.identifier = collection_num

                if orig_xml == ead.serializeDocument(pretty=True):
                    if verbosity > v_normal:
                        print("No changes made to %s" % path)
                    unchanged += 1
                else:
                    with open(path, 'w') as f:
                        ead.serializeDocument(f, pretty=True)
                    if verbosity > v_normal:
                        print("Updated %s" % path)
                    updated += 1
            except XMLSyntaxError:
                # xml is not well-formed
                print("Error: failed to load %s (document not well-formed XML?)"
                      % path)
                errored += 1
            except Exception as err:
                # catch any other exceptions so one bad file does not stop the run
                print("Error: failed to set identifier for %s : %s" % (path, err))
                errored += 1
Esempio n. 8
0
    def setUp(self):
        """Install the mock pid client, snapshot pid settings, load EAD fixtures."""
        # keep a handle on the real pid client, then put the mock in its place
        self._django_pid_client = utils.DjangoPidmanRestClient
        utils.DjangoPidmanRestClient = MockDjangoPidmanClient

        # snapshot the current pid configuration so it can be put back later
        # (the restoring teardown is not part of this excerpt)
        pid_setting_names = ('PIDMAN_HOST', 'PIDMAN_USER',
                             'PIDMAN_PASSWORD', 'PIDMAN_DOMAIN')
        self._pid_config = dict((name, getattr(settings, name))
                                for name in pid_setting_names)

        # both fixtures live side by side in the fa_admin fixture directory
        fixture_dir = os.path.join(settings.BASE_DIR, 'fa_admin', 'fixtures')

        # valid ead fixture
        self.valid_eadfile = os.path.join(fixture_dir, 'hartsfield558.xml')
        self.valid_ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)

        # invalid ead fixture
        self.invalid_eadfile = os.path.join(fixture_dir, 'hartsfield558_invalid.xml')
        self.invalid_ead = load_xmlobject_from_file(self.invalid_eadfile, FindingAid)
Esempio n. 9
0
def preview(request, archive):
    """Load an EAD document to the eXist preview collection, or list previews.

    On POST, runs the pre-publication check for the file named in
    ``request.POST['filename']``; if the check does not pass, the response
    it produced is returned.  Otherwise the document is loaded to the
    preview collection and the user is redirected (303) to the preview page
    for that finding aid.  If the load fails, the publish-errors template is
    rendered with the collected error messages.

    On any other method, renders the list of finding aids currently in
    preview.

    :param request: the current HTTP request
    :param archive: slug of the :class:`Archive` the file belongs to
    """
    if request.method == 'POST':

        archive = get_object_or_404(Archive, slug=archive)
        filename = request.POST['filename']

        errors = []
        # kept under its own name: on Python 3 the ``as`` target of an except
        # clause is unbound again when the clause ends
        exception = None

        try:
            # only load to exist if document passes publication check
            ok, response, dbpath, fullpath = _prepublication_check(request, filename,
                archive, mode='preview')
            if ok is not True:
                return response

            db = ExistDB()
            # load the document to the *preview* collection in eXist with the same filename
            preview_dbpath = settings.EXISTDB_PREVIEW_COLLECTION + "/" + filename
            # close the source file as soon as the load completes
            with open(fullpath, 'r') as eadfile:
                success = db.load(eadfile, preview_dbpath)
        except ExistDBException as err:
            success = False
            exception = err
            errors.append(err.message())

        if success:
            # load the file as a FindingAid object so we can generate the preview url
            ead = load_xmlobject_from_file(fullpath, FindingAid)
            messages.success(request, 'Successfully loaded <b>%s</b> for preview.' % filename)
            # redirect to document preview page with code 303 (See Other)
            return HttpResponseSeeOtherRedirect(reverse('fa-admin:preview:findingaid', kwargs={'id': ead.eadid}))
        else:
            # no exception but no success means the load failed;
            # *probably* due to insufficient permissions.  Any falsy result
            # (not only a literal False) gets the fallback message.
            if not errors:
                errors.append('Failed to load the document to the preview collection')

            return render(request, 'fa_admin/publish-errors.html',
                    {'errors': errors, 'filename': filename, 'mode': 'preview', 'exception': exception})

    # NOTE: preview list is not used anymore; functionality is handled
    # by main admin view; if we revisit preview list, to be more usable it
    # should be filterable by archive
    else:
        fa = get_findingaid(preview=True, only=['eadid', 'list_title', 'last_modified'],
                            order_by='last_modified')
        return render(request, 'fa_admin/preview_list.html',
                {'findingaids': fa})
Esempio n. 10
0
    def test_run(self):
        """Run the command over all files and check output and side effects."""
        # process all files
        with patch('findingaids.fa.models.Archive.svn_local_path', self.tmpdir):
            self.command.run_command('-v', '2')
            output = self.command.output

        # check that correct unitid identifier was set
        ead = load_xmlobject_from_file(self.files['hartsfield558.xml'], FindingAid)
        self.assertEqual(558, ead.archdesc.unitid.identifier)
        # assertTrue instead of the deprecated assert_ alias
        self.assertTrue('2 documents updated' in output)
        self.assertTrue('1 document with errors' in output)

        # badly-formed xml - should be reported
        self.assertTrue(re.search(r'^Error.*badlyformed.xml.*not well-formed.*$', output, re.MULTILINE),
            'unitid_identifier reports error for non well-formed xml')

        # files with errors should not be modified
        self.assertEqual(self.file_sizes['badlyformed.xml'],
                        os.path.getsize(self.files['badlyformed.xml']),
                    'file with errors not modified by unitid_identifier script')
Esempio n. 11
0
def prepared_eadxml(request, archive, filename):
    """On GET, serves out a prepared version of the EAD file in the specified
    archive subversion directory. Response header is set so the user should
    be prompted to download the xml, with a filename matching that of
    the original document.

    On POST, commits the prepared version of the EAD file to the subversion
    directory of the specified archive, with a log message indicating the user
    who requested the commit.

    Steps taken to prepare a document are documented in
    :meth:`~findingaids.fa_admin.utils.prep_ead`.

    NOTE(review): the portion of this function shown here only loads, preps
    and caches the document, returning a 500 response on failure.

    :param request: the current HTTP request
    :param archive: slug of the :class:`Archive` the file belongs to
    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    """
    # prepped xml is cached under the bare filename
    # NOTE(review): the cache key does not include the archive — confirm that
    # filenames are unique across archives
    prepped_xml = cache.get(filename)
    # find relative to svn path if associated with an archive
    arch = get_object_or_404(Archive, slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)
    if prepped_xml is None:
        try:
            ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
        except XMLSyntaxError as e:
            # xml is not well-formed : return 500 with error message
            return HttpResponseServerError("Could not load document: %s" % e)

        # flash messages that appear on the screen for the user; the messages
        # themselves are generated in utils.py
        with message_logging(request, 'findingaids.fa_admin.utils', logging.INFO):
            try:
                ead = utils.prep_ead(ead, filename)
                prepped_xml = ead.serializeDocument()
                cache.set(filename, prepped_xml)
            except Exception as e:
                # any exception on prep is most likely ark generation
                return HttpResponseServerError('Failed to prep the document: ' + str(e))
Esempio n. 12
0
def prepared_eadxml(request, archive, filename):
    """On GET, serves out a prepared version of the EAD file in the specified
    archive subversion directory. Response header is set so the user should
    be prompted to download the xml, with a filename matching that of
    the original document.

    On POST, commits the prepared version of the EAD file to the subversion
    directory of the specified archive, with a log message indicating the user
    who requested the commit.

    Steps taken to prepare a document are documented in
    :meth:`~findingaids.fa_admin.utils.prep_ead`.

    NOTE(review): the portion of this function shown here only loads the
    document, returning a 500 response if it is not well-formed.

    :param request: the current HTTP request
    :param archive: slug of the :class:`Archive` the file belongs to
    :param filename: name of the file to prep; should be base filename only,
        document will be pulled from the configured source directory.
    """
    # find relative to svn path if associated with an archive
    arch = get_object_or_404(Archive, slug=archive)
    fullpath = os.path.join(arch.svn_local_path, filename)
    try:
        ead = load_xmlobject_from_file(fullpath, FindingAid)  # validate or not?
    except XMLSyntaxError as e:
        # xml is not well-formed : return 500 with error message
        return HttpResponseServerError("Could not load document: %s" % e)
Esempio n. 13
0
 def test_load_from_file(self):
     """Test using shortcut to initialize XmlObject from a file"""
     obj = xmlmap.load_xmlobject_from_file(self.FILE.name)
     # assertTrue instead of the deprecated assert_ alias
     self.assertTrue(isinstance(obj, xmlmap.XmlObject))
Esempio n. 14
0
 def test_load_from_file(self):
     """Test using shortcut to initialize XmlObject from a file"""
     obj = xmlmap.load_xmlobject_from_file(self.FILE.name)
     # assertTrue instead of the deprecated assert_ alias
     self.assertTrue(isinstance(obj, xmlmap.XmlObject))
Esempio n. 15
0
    def test_prep_ead(self):
        """Check ``utils.prep_ead`` against several EAD fixtures.

        Covers id and ARK generation on the valid fixture, documents without
        series or without series unitids, whitespace cleanup on the invalid
        fixture, and a unittitle that begins with a ``<title>``.
        """
        # valid fixtures is an ead with series/subseries, and index
        # - clear out fixture ark url to trigger generating a new one (simulated)
        del self.valid_ead.eadid.url
        del self.valid_ead.eadid.identifier
        ead = utils.prep_ead(self.valid_ead, self.valid_eadfile)
        self.assertTrue(isinstance(ead, FindingAid), "prep_ead should return an instance of FindingAid")
        self.assertEqual(u'hartsfield558', ead.eadid.value)
        self.assertEqual(u'hartsfield558_series1', ead.dsc.c[0].id)
        self.assertEqual(u'hartsfield558_subseries6.1', ead.dsc.c[5].c[0].id)
        self.assertEqual(u'hartsfield558_index1', ead.archdesc.index[0].id)
        # ark should be generated and stored in eadid url
        self.assertEqual(MockDjangoPidmanClient.test_ark, ead.eadid.url)
        # short-form ark should be stored in identifier attribute
        self.assertTrue(MockDjangoPidmanClient.test_ark.endswith(ead.eadid.identifier))

        # ead with no series
        eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
            'fixtures', 'pittsfreeman1036.xml')
        ead = load_xmlobject_from_file(eadfile, FindingAid)
        ead = utils.prep_ead(ead, eadfile)
        self.assertTrue(isinstance(ead, FindingAid), "prep_ead should return an instance of FindingAid")
        self.assertEqual(u'pittsfreeman1036', ead.eadid.value)

        # series with no unitid
        eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
            'fixtures', 'raoul548.xml')
        ead = load_xmlobject_from_file(eadfile, FindingAid)
        ead = utils.prep_ead(ead, eadfile)
        self.assertEqual(u'raoul548_series3', ead.dsc.c[2].id)

        # whitespace cleanup
        ead = utils.prep_ead(self.invalid_ead, self.invalid_eadfile)
        # - no leading whitespace in list title
        # ead.archdesc.origination is getting normalized, so can't be used for testing
        origination = ead.node.xpath('//e:origination/e:persname', namespaces={'e': EAD_NAMESPACE})
        self.assertEqual(u'Hartsfield, William Berry.', origination[0].text)
        # test the node text directly (does not include unitdate)
        self.assertEqual(u'William Berry Hartsfield papers, ', ead.unittitle.node.text)
        self.assertEqual(u'Gone with the wind (Motion picture)',
                        ead.archdesc.controlaccess.controlaccess[0].title[0].value)
        self.assertEqual(u'Allen, Ivan.',
                        ead.archdesc.controlaccess.controlaccess[1].person_name[0].value)
        self.assertEqual(u'Mines and mineral resources--Georgia.',
                        ead.archdesc.controlaccess.controlaccess[3].subject[1].value)
        # unicode characters
        self.assertEqual(u'Motion pictures--Georgia. \u2026',
                        ead.archdesc.controlaccess.controlaccess[3].subject[2].value)
        self.assertEqual(u'Motion pictures.',
                        ead.archdesc.controlaccess.controlaccess[-1].genre_form[0].value)
        # remaining errors after clean-up:
        # 1 - duplicate origination
        # 2 - > 2 containers in a did (summary error and list of problem dids)
        # 2 - 1 container in a did (summary error and list of problem dids)
        # = 5
        # (failure message states the same count the assertion checks)
        self.assertEqual(5, len(utils.check_eadxml(ead)),
            "only 5 errors (1 for duplicate origination, 2 for 3 containers in a did, 2 for 1 container in a did) should be left in invalid test fixture after cleaning")

        # special case - unittitle begins with a <title>
        eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
            'fixtures', 'pittsfreeman1036.xml')
        ead = load_xmlobject_from_file(eadfile, FindingAid)
        ead = utils.prep_ead(ead, eadfile)
        self.assertFalse(unicode(ead.list_title).startswith('None'),
            'cleaned unittitle with leading <title> should not start with "None"')
Esempio n. 16
0
    def test_check_eadxml(self):
        """Check that ``utils.check_eadxml`` reports the expected errors.

        Uses the invalid EAD fixture, modified in place during the test, to
        exercise missing ids, origination count, whitespace, eadid regex,
        ARK url/identifier, list title and container-count checks; then
        checks a fixture whose unittitle begins with a nested ``<title>``.
        """
        # use invalid ead fixture to check error detection
        ead = self.invalid_ead
        ead.eadid.value = 'foo#~@/'    # set invalid eadid for this test only

        # messages asserted more than once below
        url_not_ark = ("eadid url is either not set or not an ARK. " +
                       "To correct, run the prep process again.")
        # NOTE(review): unlike the url message, the two parts here are joined
        # with no ". " separator; kept exactly as it was — confirm it matches
        # the message produced by check_eadxml
        identifier_not_ark = ("eadid identifier is either not set or not an ARK" +
                              "To correct, run the prep process again.")
        ark_mismatch = "eadid url and identifier do not match: url '%s' should end with identifier '%s'"

        # invalid fixture has several errors
        errors = utils.check_eadxml(ead)
        self.assertNotEqual(0, len(errors))
        # - series/subseries ids missing, index id missing
        self.assertTrue("series c01 id attribute is not set for Series 1: Personal papers, 1918-1986"
                    in errors, 'c01 missing id error reported')
        self.assertTrue("subseries c02 id attribute is not set for Subseries 6.1: Minerals and mining files, 1929-1970"
                    in errors, 'c02 missing id error reported')
        self.assertTrue("index id attribute is not set for Index of Selected Correspondents"
                    in errors, 'index missing id error reported')
        # - origination count error
        self.assertTrue("Site expects only one archdesc/did/origination; found 2" in errors,
                    'multiple origination error reported')
        # - whitespace in list title
        self.assertTrue("Found leading whitespace in list title field (origination/persname): " +
                    "'  Hartsfield, William Berry.'" in errors, 'leading whitespace in origination reported')
        # - eadid regex
        self.assertTrue("eadid '%s' does not match site URL regular expression" % ead.eadid.value
                    in errors, 'eadid regex error reported')

        # ARK in url and identifier not set or invalid
        self.assertTrue(url_not_ark in errors, 'eadid ark not in url')
        self.assertTrue(identifier_not_ark in errors, 'eadid ark not in identifier')

        # valid ARKs in url and identifier but do not match
        ark1 = "http://testpid.library.emory.edu/ark:/25593/1234"
        ark1_short = "ark:/25593/1234"
        ark2_short = "ark:/25593/567"
        ead.eadid.url = ark1
        ead.eadid.identifier = ark2_short
        errors = utils.check_eadxml(ead)

        self.assertTrue(url_not_ark not in errors, 'valid eadid ark set in url')
        self.assertTrue(identifier_not_ark not in errors, 'valid eadid ark set in identifier')

        self.assertTrue(ark_mismatch % (ark1, ark2_short)
                    in errors, 'eadid url and identifier do not match')

        # Change url and identifier to match
        ead.eadid.url = ark1
        ead.eadid.identifier = ark1_short
        errors = utils.check_eadxml(ead)

        self.assertTrue(ark_mismatch % (ark1, ark1_short)
                    not in errors, 'eadid url and identifier match')

        # - list title first letter regex
        # simulate non-whitespace, non-alpha first letter in list title
        ead.list_title.node.text = "1234"  # list title is not normally settable; overriding for test
        errors = utils.check_eadxml(ead)
        self.assertTrue("First letter ('1') of list title field origination/persname does not match browse letter URL regex '%s'"
                     % TITLE_LETTERS in errors, 'title first letter regex error reported')

        # empty/unset list title field
        ead.list_title.node.text = None
        errors = utils.check_eadxml(ead)
        self.assertTrue("List title seems to be empty" in errors)

        # - whitespace in control access terms
        self.assertTrue("Found leading whitespace in controlaccess term ' Gone with the wind (Motion picture)' (title)"
                    in errors, 'controlaccess title leading whitespace reported')
        self.assertTrue("Found leading whitespace in controlaccess term '  \t   Selznick, David O., 1902-1965.' (persname)"
                    in errors, 'controlaccess name leading whitespace reported')
        self.assertTrue("Found leading whitespace in controlaccess term '  \t   Mines and mineral resources--Georgia.' (subject)"
                    in errors, 'controlaccess subject leading whitespace reported')
        self.assertTrue("Found leading whitespace in controlaccess term ' Motion pictures.' (genreform)"
                    in errors, 'controlaccess genre leading whitespace reported')

        # - did with > 2 containers
        self.assertTrue('Site expects maximum of 2 containers per did; found 1 did(s) with more than 2'
                    in errors, 'did with more than 2 containers reported')

        # - did with only 1 container
        self.assertTrue('Site expects 2 containers per did; found 1 did(s) with only 1'
                    in errors, 'did with only 1 container reported')

        # make sure we handle quirky document with a <title> at the beginning of the <unittitle>
        eadfile = os.path.join(settings.BASE_DIR, 'fa',
            'tests', 'fixtures', 'pittsfreeman1036.xml')
        ead_nested_title = load_xmlobject_from_file(eadfile, FindingAid)
        errors = utils.check_eadxml(ead_nested_title)
        self.assertTrue(all('list title' not in err for err in errors),
                     'nested <title> in <unittitle> should not generate a list title whitespace error')
Esempio n. 17
0
    def test_check_ead(self):
        """Check ``utils.check_ead`` on valid and invalid fixtures.

        Covers schema validation errors, eadid uniqueness against documents
        loaded in eXist, leading whitespace in the unittitle, and a unittitle
        that starts with an ``<emph>`` tag.
        """
        # check valid EAD - no errors  -- good fixture, should pass all tests
        dbpath = settings.EXISTDB_TEST_COLLECTION + '/hartsfield558.xml'
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(0, len(errors))

        # should cause several errors - not schema valid, eadid, series/subseries ids missing, index id missing
        errors = utils.check_ead(self.invalid_eadfile, dbpath)
        self.assertNotEqual(0, len(errors))
        self.assertTrue("attribute 'invalid': The attribute 'invalid' is not allowed"
                     in errors[0])   # validation error message

        # NOTE: somewhere between lxml 2.3.1 and 3.0.1 we started getting
        # duplicate validation errors. work around it for now.
        # (errors seem to be aggregating instead of clearing out....)
        while errors[0] == errors[1]:
            errors.pop(0)
        self.assertTrue("Line 2" in errors[0], "validation error includes line number")   # validation error message
        self.assertTrue("eadid 'hartsfield558.xml' does not match expected value" in errors[1])
        self.assertTrue("series c01 id attribute is not set for Series 1" in errors[2])
        self.assertTrue("subseries c02 id attribute is not set for Subseries 6.1" in errors[3])
        self.assertTrue("index id attribute is not set for Index of Selected Correspondents" in errors[4])

        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(0, len(errors))

        # eadid uniqueness check in eXist
        # (fixture file is closed as soon as the load completes)
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile, dbpath)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        # same eadid, but present in the file that will be updated - no errors
        self.assertEqual(0, len(errors))

        # upload same file to a different path - non-unique eadid error
        with open(self.valid_eadfile) as eadfile:
            self.db.load(eadfile, settings.EXISTDB_TEST_COLLECTION + '/hartsfield_other.xml')
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database already contains 2 instances of eadid" in errors[0])

        # remove version with correct path to test single conflicting eadid
        self.db.removeDocument(dbpath)
        errors = utils.check_ead(self.valid_eadfile, dbpath)
        self.assertEqual(1, len(errors))
        self.assertTrue("Database contains eadid 'hartsfield558' in a different document" in errors[0])

        # leading whitespace in unit title
        with tempfile.NamedTemporaryFile(prefix='findingaids-ead-',
                                         suffix='xml', delete=False) as tmpfile:
            # modify fixture to introduce leading whitespace
            ead = load_xmlobject_from_file(self.valid_eadfile, FindingAid)
            # check expects eadid to match filename
            ead.eadid.value = os.path.basename(tmpfile.name)
            # add whitespace at beginning of title
            ead.unittitle.text = "\n  %s" % ead.unittitle.text
            ead.serializeDocument(tmpfile)
            # close to flush content
            tmpfile.close()

            errors = utils.check_ead(tmpfile.name, dbpath)
            os.remove(tmpfile.name)

        # should have 1 error for leading whitespace
        self.assertEqual(1, len(errors))
        self.assertTrue(errors[0].startswith('Found leading whitespace in unittitle'))

        # pomerantz unit title starts with an <emph> tag; test that
        # this doesn't trip up check for leading whitespace in title
        dbpath = settings.EXISTDB_TEST_COLLECTION + '/pomerantz890.xml'
        pomerantz_eadfile = os.path.join(settings.BASE_DIR, 'fa', 'tests',
                                         'fixtures', 'pomerantz890.xml')
        errors = utils.check_ead(pomerantz_eadfile, dbpath)
        # fixture contains subjects with leading whitespace, which is fine
        # we just care that the unittitle check passes; look at every
        # reported error so the result does not depend on error ordering
        self.assertTrue(all('Found leading whitespace in unittitle:' not in err
                            for err in errors))
Esempio n. 18
0
    def handle(self, *args, **options):
        """Load EAD documents into the configured eXist collection.

        Files to load are taken from the positional arguments; when none are
        given, every archive's svn working copy is updated and all ``*.xml``
        files found there are used.  Each file is run through ``check_ead``
        and is only loaded when no errors are reported.  Unless
        ``skip_pdf_reload`` is set, a ``reload_cached_pdf`` task is queued for
        every document that loads successfully.  When ``pdf_only`` is set the
        load step (and the load summary) is skipped entirely.

        :param args: optional paths of the EAD files to load
        :param options: command options; reads ``verbosity``, ``pdf_only``
            and ``skip_pdf_reload``
        :raises CommandError: if ``pdf_only`` and ``skip_pdf_reload`` are both
            set, or if ``EXISTDB_ROOT_COLLECTION`` is missing or empty
        """
        # NOTE: every print below is written as print(<single expression>) so
        # it behaves identically as a Python 2 print statement and as the
        # print function.
        verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        v_normal = 1
        v_all = 2

        if options['pdf_only'] and options['skip_pdf_reload']:
            raise CommandError("Options -s and -p are not compatible")

        # check for required settings (a missing and an empty value are
        # treated the same way)
        if not getattr(settings, 'EXISTDB_ROOT_COLLECTION', None):
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")

        if args:
            files = args
        else:
            # Note: copied from prep_ead manage command; move somewhere common?
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

        if verbosity == v_all:
            print('Documents will be loaded to configured eXist collection: %s'
                  % settings.EXISTDB_ROOT_COLLECTION)
            if options['skip_pdf_reload']:
                print("** Skipping PDFs cache reload")

        db = ExistDB()

        loaded = 0
        errored = 0
        # async task results for queued PDF reloads, keyed on eadid
        # NOTE(review): pdf_tasks and start_time are recorded but never read
        # again within this method -- TODO confirm whether the task outcome /
        # timing report they were meant for still needs to be written.
        pdf_tasks = {}

        start_time = datetime.now()

        # unless PDF reload only has been specified, load files
        if not options['pdf_only']:

            for filename in files:
                try:
                    # full path location where file will be loaded in exist db collection
                    dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(filename)
                    errors = check_ead(filename, dbpath)
                    if errors:
                        # report errors, don't load
                        errored += 1
                        print("Error: %s does not pass publication checks; not loading to eXist." % filename)
                        if verbosity >= v_normal:
                            print("  Errors found:")
                            for err in errors:
                                print("    %s" % err)
                        continue

                    with open(filename, 'r') as eadfile:
                        success = db.load(eadfile, dbpath, overwrite=True)

                    if not success:
                        errored += 1
                        print("Error: failed to load %s to eXist" % filename)
                        continue

                    loaded += 1
                    if verbosity >= v_normal:
                        print("Loaded %s" % filename)
                    # load the file as a FindingAid object to get the eadid for PDF reload
                    ead = load_xmlobject_from_file(filename, FindingAid)

                    # trigger PDF regeneration in the cache and store task result
                    # - unless user has requested PDF reload be skipped
                    if not options['skip_pdf_reload']:
                        pdf_tasks[ead.eadid.value] = reload_cached_pdf.delay(ead.eadid.value)
                        # NOTE: unlike the web admin publish, this does not
                        # generate TaskResult db records
                except ExistDBException as e:
                    # a failure on one document must not stop the rest of the batch
                    print("Error: failed to load %s to eXist" % filename)
                    print(e.message())
                    errored += 1

            # output a summary of what was done
            print("%d document%s loaded" % (loaded, 's' if loaded != 1 else ''))
            print("%d document%s with errors" % (errored, 's' if errored != 1 else ''))
Esempio n. 19
0
    def handle(self, *args, **options):
        """Convert plain-text digitized item ids in EAD unittitles to ``<dao>`` tags.

        Files to process are taken from the positional arguments; when none
        are given, every archive's svn working copy is updated and all
        ``*.xml`` files found there are used.  Within each document, every
        item returned by :meth:`ead_file_items` that has no ``<dao>`` yet and
        whose unittitle matches :meth:`has_digitized_content` is handled: the
        ids are parsed, looked up in Solr, removed from the unittitle text and
        replaced by one ``<dao>`` (audience ``internal``) per id.  Documents
        that changed are written back to disk unless running in dry-run mode,
        and a summary of updated / unchanged / errored documents is written
        at the end.

        All messages go through ``self.stdout`` so that output is captured
        consistently when the command's stdout is redirected.

        :param args: optional paths of the EAD files to process
        :param options: command options; reads ``verbosity``, ``commit``
            and ``dryrun``
        :raises CommandError: if ``KEEP_SOLR_SERVER_URL`` is missing or empty
        """
        verbosity = int(options.get('verbosity', self.v_normal))
        svn_commit = options.get('commit', False)
        dry_run = options.get('dryrun', False)

        # check for required settings (a missing and an empty value are
        # treated the same way)
        if not getattr(settings, 'KEEP_SOLR_SERVER_URL', None):
            raise CommandError("KEEP_SOLR_SERVER_URL setting is required for this script")

        solr = solr_interface()

        if verbosity > self.v_normal:
            self.stdout.write("Preparing documents from all defined Archives")
            if dry_run:
                self.stdout.write("Running in dry-run mode; no changes will be made")

        updated = 0
        unchanged = 0
        errored = 0

        if args:
            files = args
        else:
            # Note: copied from prep_ead manage command; move somewhere common?
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(glob.iglob(os.path.join(archive.svn_local_path, '*.xml')))

        for filename in files:
            file_items = 0   # items in this file recognized as digitized
            daos = 0         # <dao> elements added to this file
            try:
                if verbosity >= self.v_normal and len(files) > 1:
                    self.stdout.write('\nProcessing %s' % os.path.basename(filename))

                ead = load_xmlobject_from_file(filename, FindingAid)
                orig_xml = ead.serializeDocument()  # keep to check if changed

                for c in self.ead_file_items(ead):
                    # if item already contains any dao tags, skip it (no further processing needed)
                    if c.did.dao_list:
                        continue

                    match = self.has_digitized_content(unicode(c.did.unittitle))
                    if not match:
                        continue

                    file_items += 1
                    try:
                        id_list = self.id_list(match.groupdict()['ids'])
                    except Exception as e:
                        self.stdout.write('Error parsing ids from "%s" : %s' %
                                          (unicode(c.did.unittitle), e))
                        continue

                    # if no ids were found even though title seemed to have digitized content,
                    # error and skip to next
                    if not id_list:
                        self.stdout.write('Appears to have digitized content, but no ids found in "%s"' %
                                          (unicode(c.did.unittitle)))
                        continue

                    # dictionary for any Keep info corresponding to these ids
                    id_info = {}

                    # look up each id in the Keep; only an unambiguous
                    # (single-result) match is recorded
                    for item_id in id_list:
                        q = solr.query(solr.Q(dm1_id="%s" % item_id) | solr.Q(pid="emory:%s" % item_id)) \
                                .field_limit(['ark_uri', 'pid'])
                        if q.count() == 1:
                            id_info[item_id] = q[0]

                    # remove the plain-text digitized ids from unittitle content
                    # (handle as unicode to preserve any special characters)
                    # NOTE: because unittitle could contain nested tags (dates,
                    # titles, names, etc), iterate through the text nodes and
                    # remove the digitized note wherever it occurs
                    # - use lxml smart strings to update based on parent nodes
                    text_nodes = c.did.unittitle.node.xpath('text()')
                    for txt in text_nodes:
                        updated_txt = re.sub(self.digitized_ids, u'', txt)
                        if txt.is_text:
                            txt.getparent().text = updated_txt
                        else:
                            txt.getparent().tail = updated_txt

                    # NOTE(review): the intent noted here was to ensure the
                    # document declares the xlink namespace at the top so it
                    # is not repeated for each dao; nothing in this method
                    # does that explicitly -- cleanup_namespaces is applied
                    # to each new dao below.  TODO confirm that is sufficient.

                    for item_id in id_list:
                        info = id_info.get(item_id)
                        # append a new dao for each id; audience will always be internal
                        dao_opts = {'audience': 'internal'}
                        href = None

                        if info:
                            # in some cases in production, a record is found but no
                            # ark_uri is indexed in solr (indicates ark_uri not in MODS)
                            try:
                                href = info['ark_uri']
                            except KeyError:
                                self.stdout.write('Warning: Keep record was found for %s but no ARK URI is indexed'
                                    % item_id)

                        # if no record was found, *should* be a digital masters id
                        if href is None:
                            # if id already starts with dm, don't duplicate the prefix
                            if item_id.startswith('dm'):
                                dao_opts['id'] = item_id
                            # if it's a digit, add dm prefix
                            elif item_id.isdigit():
                                dao_opts['id'] = 'dm%s' % item_id
                            # otherwise, warn and add the id in pid notation
                            else:
                                # only warn if we didn't already warn about info without ark uri
                                if not info:
                                    self.stdout.write('Warning: non-digital masters id %s not found in the Keep'
                                                       % item_id)
                                # generate an ark anyway, since pids don't make valid ids
                                href = 'http://pid.emory.edu/ark:/25593/%s' % item_id

                        c.did.dao_list.append(eadmap.DigitalArchivalObject(**dao_opts))
                        if href is not None:
                            c.did.dao_list[-1].href = href
                        # clean up any extra namespaces (exist-db ns)
                        cleanup_namespaces(c.did.dao_list[-1].node)

                        daos += 1

                # NOTE: could use pretty=True, but not used elsewhere in fa_admin,
                # so leaving off for consistency
                if orig_xml == ead.serializeDocument():
                    if verbosity > self.v_normal:
                        self.stdout.write("No changes made to %s" % filename)
                    unchanged += 1
                else:
                    # in dry run, don't actually change the file
                    if not dry_run:
                        with open(filename, 'w') as f:
                            ead.serializeDocument(f)
                    if verbosity >= self.v_normal:
                        self.stdout.write("Updated %s; found %d item%s with digitized content, added %d <dao>%s"
                            % (filename, file_items, 's' if file_items != 1 else '',
                               daos, 's' if daos != 1 else ''))
                    updated += 1

            except XMLSyntaxError:
                # xml is not well-formed
                self.stdout.write("Error: failed to load %s (document not well-formed XML?)"
                                  % filename)
                errored += 1
            # except Exception, e:
            #     # catch any other exceptions
            #     print "Error: failed to update %s : %s" % (file, e)
            #     errored += 1

        # TODO: might be nice to also report total number of daos added

        # summary of what was done
        self.stdout.write("\n%d document%s updated" % (updated, 's' if updated != 1 else ''))
        self.stdout.write("%d document%s unchanged" % (unchanged, 's' if unchanged != 1 else ''))
        self.stdout.write("%d document%s with errors" % (errored, 's' if errored != 1 else ''))

        if svn_commit:
            if dry_run:
                # a dry run announces that no changes will be made; committing
                # the working copies would break that promise (and could push
                # unrelated local modifications under this log message)
                self.stdout.write("Dry-run mode; skipping svn commit")
            else:
                svn = svn_client()
                # seems to be the only way to set a commit log message via client
                def get_log_message(arg):
                    # argument looks something like this:
                    # [('foo', 'https://svn.library.emory.edu/svn/dev_ead-eua/trunk/eua0081affirmationvietnam.xml', 6, None, 4)]
                    # ignoring since we will only use this function for a single commit
                    return 'converted digitized item ids to <dao> tags'

                svn.log_msg_func = get_log_message

                for archive in Archive.objects.all():
                    # commit whatever changed in this archive's working copy
                    svn.commit(str(archive.svn_local_path))
Esempio n. 20
0
    def handle(self, *args, **options):
        """Prepare EAD files for publication and save any that changed.

        Files to prepare are taken from the positional arguments; when none
        are given, every archive's svn working copy is updated and all
        ``*.xml`` files found there are used.  Each file is loaded as a
        ``FindingAid``, passed through ``utils.prep_ead`` and the prepared
        XML is checked with ``utils.check_ead``.  A document that fails the
        checks is reported and left untouched; a document whose pretty-printed
        serialization is unchanged is only counted; anything else is written
        back to its file.  Errors on one file are reported and counted without
        stopping the rest of the batch.

        :param args: optional paths of the EAD files to prepare
        :param options: command options; reads ``verbosity``
        :raises CommandError: if ``EXISTDB_ROOT_COLLECTION`` is missing or empty
        """
        verbosity = int(options['verbosity'])

        self._setup_logging(verbosity)

        # check for required settings
        if not hasattr(settings, 'EXISTDB_ROOT_COLLECTION') or not settings.EXISTDB_ROOT_COLLECTION:
            raise CommandError("EXISTDB_ROOT_COLLECTION setting is missing")
            return  # NOTE(review): unreachable -- the raise above always exits first


        if verbosity == self.v_all:
            print "Preparing documents from all defined Archives"

        # per-run tallies
        # NOTE(review): these counters are not reported in the code below --
        # presumably a summary follows elsewhere; TODO confirm
        updated = 0
        unchanged = 0
        errored = 0

        if len(args):
            # explicit file list given on the command line
            files = args
        else:
            # no files specified: gather every xml file from all archives
            files = set()
            svn = svn_client()
            for archive in Archive.objects.all():
                # update to make sure we have latest version of everything
                svn.update(str(archive.svn_local_path))   # apparently can't handle unicode
                files.update(set(glob.iglob(os.path.join(archive.svn_local_path, '*.xml'))))

        for file in files:
            try:
                ead = load_xmlobject_from_file(file, FindingAid)
                # keep the pretty-printed original so it can be compared with
                # the pretty-printed prepared version further down
                orig_xml = ead.serializeDocument(pretty=True)
                ead = utils.prep_ead(ead, file)
                # sanity check before saving
                dbpath = settings.EXISTDB_ROOT_COLLECTION + "/" + os.path.basename(file)
                # check the prepared xml (passed explicitly), not the file on disk
                errors = utils.check_ead(file, dbpath, xml=ead.serializeDocument())
                if errors:
                    errored += 1
                    print "Prepared EAD for %s does not pass sanity checks, not saving." % file
                    if verbosity >= self.v_normal:
                        print "Errors found:"
                        for err in errors:
                            # some errors include a list of error instances - display nicely
                            if isinstance(err, list):
                                for suberr in err:
                                    print "    %s" % suberr
                            else:
                                print "  %s" % err
                elif orig_xml == ead.serializeDocument(pretty=True):
                    # preparation made no difference; leave the file alone
                    if verbosity >= self.v_normal:
                        print "No changes made to %s" % file
                    unchanged += 1
                else:
                    # overwrite the source file with the prepared document
                    with open(file, 'w') as f:
                        ead.serializeDocument(f, pretty=True)
                    if verbosity >= self.v_normal:
                        print "Updated %s" % file
                    updated += 1
            except XMLSyntaxError, e:
                # xml is not well-formed
                print "Error: failed to load %s (document not well-formed XML?)" \
                            % file
                errored += 1
            except Exception, e:
                # catch any other exceptions
                print "Error: failed to prep %s : %s" % (file, e)
                errored += 1