Exemple #1
0
    def repair_ark(self, obj):
        """Restore a missing ARK on ``obj`` using the PID manager.

        Looks up the unqualified ARK target for ``obj.noid``.  When found,
        the ARK is recorded on the object: as a short-form ``ark`` and a
        full ``uri`` MODS identifier if the object has a ``mods``
        attribute, otherwise as a Dublin Core identifier.  Unless the
        ``dry_run`` option is set, the object is then saved.

        ``self.unrepaired_count`` is incremented when no target is found
        or when running in dry-run mode; ``self.repaired_count`` is
        incremented after a successful save.

        :param obj: object to repair; ``noid`` and ``pid`` are read, and
            either ``mods`` or ``dc`` content is updated
        """
        try:
            ark_target = self.pidman.get_ark_target(noid=obj.noid,
                                                    qualifier='')
        except Exception:
            # Any lookup failure means this object cannot be repaired.
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the run.
            self.unrepaired_count += 1
            self.log(level=WARNING,
                     message="Failed to find ARK target for %s" % (obj.pid))
            return

        access_uri = ark_target['access_uri']
        parsed_ark = parse_ark(access_uri)
        naan = parsed_ark['naan']
        noid = parsed_ark['noid']

        if hasattr(obj, 'mods'):
            # store both the short-form ark and the full resolvable uri
            obj.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=access_uri)
            ])
        else:
            # identifier_list is a list field: the uri must be appended
            # (calling the list, as was done previously, raises TypeError)
            obj.dc.content.identifier_list.append(access_uri)

        if self.options['dry_run']:
            # in-memory changes above are never saved in dry-run mode
            self.unrepaired_count += 1
            self.log(message='ARK target found for %s' % obj.pid)
            return

        # save the collection object w/ updated ark
        try:
            self.log(level=INFO, message="Attempting to save %s" % obj.pid)
            obj.save(logMessage='Fixing missing ARK')
            self.repaired_count += 1
        except DigitalObjectSaveFailure:
            self.log(message="An error occurred while saving %s" % (obj.pid))
    def repair_ark(self, obj):
        """Restore a missing ARK on ``obj`` using the PID manager.

        Looks up the unqualified ARK target for ``obj.noid``.  When found,
        the ARK is recorded on the object: as a short-form ``ark`` and a
        full ``uri`` MODS identifier if the object has a ``mods``
        attribute, otherwise as a Dublin Core identifier.  Unless the
        ``dry_run`` option is set, the object is then saved.

        ``self.unrepaired_count`` is incremented when no target is found
        or when running in dry-run mode; ``self.repaired_count`` is
        incremented after a successful save.

        :param obj: object to repair; ``noid`` and ``pid`` are read, and
            either ``mods`` or ``dc`` content is updated
        """
        try:
            ark_target = self.pidman.get_ark_target(noid=obj.noid, qualifier='')
        except Exception:
            # Any lookup failure means this object cannot be repaired.
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the run.
            self.unrepaired_count += 1
            self.log(level=WARNING, message="Failed to find ARK target for %s" % (obj.pid))
            return

        access_uri = ark_target['access_uri']
        parsed_ark = parse_ark(access_uri)
        naan = parsed_ark['naan']
        noid = parsed_ark['noid']

        if hasattr(obj, 'mods'):
            # store both the short-form ark and the full resolvable uri
            obj.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=access_uri)
            ])
        else:
            # identifier_list is a list field: the uri must be appended
            # (calling the list, as was done previously, raises TypeError)
            obj.dc.content.identifier_list.append(access_uri)

        if self.options['dry_run']:
            # in-memory changes above are never saved in dry-run mode
            self.unrepaired_count += 1
            self.log(message='ARK target found for %s' % obj.pid)
            return

        # save the collection object w/ updated ark
        try:
            self.log(level=INFO, message="Attempting to save %s" % obj.pid)
            obj.save(logMessage='Fixing missing ARK')
            self.repaired_count += 1
        except DigitalObjectSaveFailure:
            self.log(message="An error occurred while saving %s" % (obj.pid))
Exemple #3
0
def prep_ead(ead, filename):
    """Get an EAD document ready for publication.

    The following clean-up is applied, in order:

     - the eadid is set from the file name, and ids are set on series
       (via ``set_series_ids``) and on index elements
     - leading whitespace is stripped from origination / unittitle text
       and from controlaccess terms
     - the eadid url and identifier are populated with an ARK when they
       are missing or are not ARKs

    :param ead: :class:`~findingaids.fa.models.FindingAid` ead instance to be prepared
    :param filename: filename of the EAD file (used as base eadid)
    :rtype: :class:`~findingaids.fa.models.FindingAid`
    """

    # the document name, minus its .xml extension, becomes the eadid
    ead.eadid.value = os.path.basename(filename).replace('.xml', '')

    # series ids
    if ead.dsc and ead.dsc.hasSeries():
        for position, series in enumerate(ead.dsc.c):
            set_series_ids(series, ead.eadid.value, position)

    # index ids: eadid + delimiter + "index" + 1-based index number
    for number, index in enumerate(ead.archdesc.index, 1):
        index.id = "%s%sindex%s" % (ead.eadid.value, ID_DELIMITER, number)

    # Strip *leading* whitespace only from list title fields: they may hold
    # mixed content, so trailing whitespace could be significant.
    title_xpath = 'e:archdesc/e:did/e:origination/node()|e:archdesc/e:did/e:unittitle'
    for field in ead.node.xpath(title_xpath, namespaces={'e': EAD_NAMESPACE}):
        # An lxml node's text is only the content *before* any child
        # element, so it can legitimately be empty, e.g.
        # <unittitle><title>Pitts v. Freeman</title> case files</unittitle>
        # Leading whitespace inside a leading child element is NOT handled.
        if not hasattr(field, 'text') or field.text is None:
            continue
        field.text = unicode(field.text).lstrip()

    # controlaccess terms, when the document has any
    if ead.archdesc.controlaccess and ead.archdesc.controlaccess.controlaccess:
        for group in ead.archdesc.controlaccess.controlaccess:
            for term in group.terms:
                # Read from node.text because term.value is normalized.
                # Normalization is deliberately not forced on these terms:
                # XML editor line-wrap settings would otherwise require the
                # prep step to be re-run after every edit.
                raw_text = term.node.text
                if raw_text:
                    term.value = raw_text.lstrip()

    # ARK checks (long and short form)
    # - generate an ark when the eadid url is unset or not an ark
    if ead.eadid.url is None or not is_ark(ead.eadid.url):
        ead.eadid.url = generate_ark(ead)
    # - derive the short-form ark from the url when the identifier is
    #   unset or not an ark
    if ead.eadid.identifier is None or not is_ark(ead.eadid.identifier):
        ead.eadid.identifier = 'ark:/%(naan)s/%(noid)s' % parse_ark(ead.eadid.url)

    # NOTE: unittitle whitespace is not explicitly cleaned beyond the
    # leading-whitespace strip above, although unit tests suggest that it
    # ends up cleaned anyway (??)

    return ead
Exemple #4
0
def ark_noid(ark_uri):
    '''Display just the NOID (nice opaque identifier) for an ARK given
    a full ARK URI.

    Returns ``None`` when ``ark_uri`` cannot be parsed as an ARK.'''
    try:
        return parse_ark(ark_uri)['noid']
    except Exception:
        # Best-effort display helper: unparseable input yields no output.
        # Exception (rather than a bare except) keeps KeyboardInterrupt
        # and SystemExit from being silently swallowed.
        return None
def ark_noid(ark_uri):
    '''Display just the NOID (nice opaque identifier) for an ARK given
    a full ARK URI.

    Returns ``None`` when ``ark_uri`` cannot be parsed as an ARK.'''
    try:
        return parse_ark(ark_uri)['noid']
    except Exception:
        # Best-effort display helper: unparseable input yields no output.
        # Exception (rather than a bare except) keeps KeyboardInterrupt
        # and SystemExit from being silently swallowed.
        return None
Exemple #6
0
def ark_id(ark_uri):
    '''Display just the ark identifier (ark:/###/###) given a full
    ARK URI.

    Returns ``None`` when ``ark_uri`` cannot be parsed as an ARK.'''
    try:
        return 'ark:/%(naan)s/%(noid)s' % parse_ark(ark_uri)
    except Exception:
        # Best-effort display helper: unparseable input yields no output.
        # Exception (rather than a bare except) keeps KeyboardInterrupt
        # and SystemExit from being silently swallowed.
        return None
def ark_id(ark_uri):
    '''Display just the ark identifier (ark:/###/###) given a full
    ARK URI.

    Returns ``None`` when ``ark_uri`` cannot be parsed as an ARK.'''
    try:
        return 'ark:/%(naan)s/%(noid)s' % parse_ark(ark_uri)
    except Exception:
        # Best-effort display helper: unparseable input yields no output.
        # Exception (rather than a bare except) keeps KeyboardInterrupt
        # and SystemExit from being silently swallowed.
        return None
    def test_parse_ark(self):
        'Test parse_ark method'

        # component values used to assemble each flavour of valid ark
        ark_parts = {
            'nma': 'http://pid.emory.edu/',
            'naan': '25593',
            'noid': '1fx',
            'qual': 'qual/1.23/foo-bar.baz'
        }
        nma, qual = ark_parts['nma'], ark_parts['qual']

        # (ark template, expected nma, expected qualifier);
        # None means that portion is not present in the ark
        scenarios = (
            # unqualified resolvable ark
            ('%(nma)sark:/%(naan)s/%(noid)s', nma, None),
            # qualified resolvable ark
            ('%(nma)sark:/%(naan)s/%(noid)s/%(qual)s', nma, qual),
            # short-form ark
            ('ark:/%(naan)s/%(noid)s', None, None),
            # short-form ark with qualifier
            ('ark:/%(naan)s/%(noid)s/%(qual)s', None, qual),
        )
        for template, expected_nma, expected_qualifier in scenarios:
            parsed_ark = parse_ark(template % ark_parts)
            self.assertEqual(expected_nma, parsed_ark['nma'])
            self.assertEqual(ark_parts['naan'], parsed_ark['naan'])
            self.assertEqual(ark_parts['noid'], parsed_ark['noid'])
            self.assertEqual(expected_qualifier, parsed_ark['qualifier'])

        # anything that is not an ark parses to None
        self.assertEqual(None, parse_ark('doi:10.1000/182'),
                         'attempting to parse non-ark results in None')
    def test_parse_ark(self):
        'Test parse_ark method'

        # component values used to assemble each flavour of valid ark
        ark_parts = {
            'nma': 'http://pid.emory.edu/',
            'naan': '25593',
            'noid': '1fx',
            'qual': 'qual/1.23/foo-bar.baz'
        }
        nma, qual = ark_parts['nma'], ark_parts['qual']

        # (ark template, expected nma, expected qualifier);
        # None means that portion is not present in the ark
        scenarios = (
            # unqualified resolvable ark
            ('%(nma)sark:/%(naan)s/%(noid)s', nma, None),
            # qualified resolvable ark
            ('%(nma)sark:/%(naan)s/%(noid)s/%(qual)s', nma, qual),
            # short-form ark
            ('ark:/%(naan)s/%(noid)s', None, None),
            # short-form ark with qualifier
            ('ark:/%(naan)s/%(noid)s/%(qual)s', None, qual),
        )
        for template, expected_nma, expected_qualifier in scenarios:
            parsed_ark = parse_ark(template % ark_parts)
            self.assertEqual(expected_nma, parsed_ark['nma'])
            self.assertEqual(ark_parts['naan'], parsed_ark['naan'])
            self.assertEqual(ark_parts['noid'], parsed_ark['noid'])
            self.assertEqual(expected_qualifier, parsed_ark['qualifier'])

        # anything that is not an ark parses to None
        self.assertEqual(None, parse_ark('doi:10.1000/182'),
                         'attempting to parse non-ark results in None')
Exemple #10
0
    def get_default_pid(self):
        '''Mint the default pid for a new DigitalObject in the Keep.

        When a PID manager client is configured, a new ARK is created
        with the object's label as its name, the ARK is recorded in the
        MODS identifiers (if the object has ``mods``) or otherwise in
        the Dublin Core identifier list, and the returned pid combines
        the default pidspace with the ARK noid.  Without a PID manager
        the parent class implementation is used.'''

        if pidman is None:
            # no pid manager available: fall back to default pid behavior
            return super(DigitalObject, self).get_default_pid()

        # pidman needs a target url for the new pid: reverse the new-object
        # view using a placeholder pid built from the pid token
        placeholder_pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        view_path = reverse(self.NEW_OBJECT_VIEW,
                            kwargs={'pid': placeholder_pid})
        # reverse() encodes the PID_TOKEN and the ':', so unquote the url
        # (nothing else in it should need escaping); reverse() also only
        # gives a path, so absolutize to add scheme & server
        target = absolutize_url(urllib.unquote(view_path))

        # a name is optional for pidman but helps when managing pids
        label = self.label
        # request a new ark in the configured pidman domain
        ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=label)

        # pidman hands back the full, resolvable ark; split it into parts
        ark_parts = parse_ark(ark)
        naan = ark_parts['naan']  # name authority number
        noid = ark_parts['noid']  # nice opaque identifier

        if hasattr(self, 'mods'):
            # mods datastream: keep both short-form ark and full uri
            short_ark = mods.Identifier(type='ark',
                                        text='ark:/%s/%s' % (naan, noid))
            full_uri = mods.Identifier(type='uri', text=ark)
            self.mods.content.identifiers.extend([short_ark, full_uri])
        else:
            # no mods: the full uri ARK goes into dc:identifier
            self.dc.content.identifier_list.append(ark)

        # the pid is the noid within the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
Exemple #11
0
    def get_default_pid(self):
        '''Mint the default pid for a new DigitalObject in :mod:`readux`.

        When a PID manager client is configured, a new ARK is created
        whose target is the object's absolute url (computed with a
        placeholder pid), the full ARK uri is added to the Dublin Core
        identifier list, and the returned pid combines the default
        pidspace with the ARK noid.  Without a PID manager the parent
        class implementation is used.

        Note that ``self.pid`` is set to the placeholder pid as a side
        effect, and the module-level ``pidman`` client is replaced if the
        first ARK request fails with ``httplib.BadStatusLine``.'''
        global pidman

        if pidman is None:
            # no pid manager available: fall back to default pid behavior
            return super(DigitalObject, self).get_default_pid()

        # pidman needs a target url for the new pid.  Use the object's own
        # absolute url, which needs a pid to be set first.
        # NOTE: every value used in the url must already be set
        # (i.e., page objects must have volume pid configured)
        self.pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        object_url = self.get_absolute_url()

        # the url has the PID_TOKEN and the ':' encoded, so unquote it
        # (nothing else in it should need escaping); it is also only a
        # path, so absolutize to add scheme & server
        target = absolutize_url(urllib.unquote(object_url))

        # a name is optional for pidman but helps when managing pids
        label = self.label
        domain = settings.PIDMAN_DOMAIN
        try:
            ark = pidman.create_ark(domain, target, name=label)
        except httplib.BadStatusLine:
            # retry once with a freshly initialized client
            logger.warn('Error creating ARK; re-initializing pidman client and trying again')
            pidman = DjangoPidmanRestClient()
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=label)

        # pidman hands back the full, resolvable ark; only the noid
        # (nice opaque identifier) is needed here
        noid = parse_ark(ark)['noid']

        # record the full uri ARK in dc:identifier
        self.dc.content.identifier_list.append(ark)

        # the pid is the noid within the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
Exemple #12
0
    def get_default_pid(self):
        '''Pid logic that re-uses an existing, unused ARK when possible.

        If no unused pids are cached on ``self._unused_pid_result``, the
        PID manager is searched for pids whose target is
        ``UNUSED_PID_URL`` and any results are cached.  When an unused
        pid is available it is popped from the cache, its ARK name and
        default target are updated for this object, the ARK is stored in
        the MODS identifiers (if the object has ``mods``) and always in
        the Dublin Core identifier list, and the returned pid combines
        the default pidspace with the ARK noid.  When no unused pid is
        available the parent class implementation is used.'''
        # Create the client unconditionally: it is needed both for the
        # search and for the ark updates below.  Previously it was only
        # bound inside the search branch, so any call made while cached
        # unused pids remained failed with UnboundLocalError.
        pidman = DjangoPidmanRestClient()

        if not self._unused_pid_result:
            result = pidman.search_pids(target=UNUSED_PID_URL)
            # if any were found, use results
            if result and result['results_count']:
                self._unused_pid_result = result['results']

        if not self._unused_pid_result:
            # if we run out of pids to re-use, fall back to default behavior
            return super(PidReuseDigitalObject, self).get_default_pid()

        # we have unused pids: pop one off and use it
        pid_info = self._unused_pid_result.pop()
        ark = pid_info['targets'][0]['access_uri']
        parsed_ark = parse_ark(ark)
        naan = parsed_ark['naan']  # name authority number
        noid = parsed_ark['noid']  # nice opaque identifier

        # use noid as basis for new pid
        pid = '%s:%s' % (self.default_pidspace, noid)
        # calculate target to new object
        target = reverse(self.NEW_OBJECT_VIEW, kwargs={'pid': pid})
        # reverse() returns a full path - absolutize so we get scheme & server also
        target = absolutize_url(target)
        # update pid ark label from object
        pidman.update_ark(noid, name=self.label)
        # update default ark target for new object url
        pidman.update_ark_target(noid, target_uri=target, active=True)

        # if we have a mods datastream, store the ARK as mods:identifier
        if hasattr(self, 'mods'):
            # store full uri and short-form ark
            self.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=ark)
            ])

        # always add full uri ARK to dc:identifier
        self.dc.content.identifier_list.append(ark)

        # use the noid to construct a pid in the configured pidspace
        return pid
Exemple #13
0
    def get_default_pid(self):
        '''Mint the default pid for a new DigitalObject in openemory.

        When a PID manager client is configured, a new ARK is created
        with the object's label as its name, the full ARK uri is added
        to the Dublin Core identifier list, both the uri and the
        short-form ark are stored on the descMetadata content, and the
        returned pid combines the default pidspace with the ARK noid.
        Without a PID manager the parent class implementation is used.'''

        if pidman is None:
            # no pid manager available: fall back to default pid behavior
            return super(DigitalObject, self).get_default_pid()

        # pidman needs a target url for the new pid: reverse the
        # publication view using a placeholder pid built from the pid token
        placeholder_pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        view_path = reverse("publication:view",
                            kwargs={'pid': placeholder_pid})
        # reverse() encodes the PID_TOKEN; undo the encoding for just that part
        view_path = view_path.replace(self.ENCODED_PID_TOKEN, self.PID_TOKEN)
        # reverse() only gives a path - absolutize to add scheme & server
        target = absolutize_url(view_path)

        # a name is optional for pidman but helps when managing pids
        label = self.label
        # request a new ark in the configured pidman domain
        ark_uri = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=label)

        # pidman hands back the full, resolvable ark; split it into parts
        ark_parts = parse_ark(ark_uri)
        naan = ark_parts['naan']  # name authority number
        noid = ark_parts['noid']  # nice opaque identifier
        short_ark = "ark:/%s/%s" % (naan, noid)

        # record the full uri in dc:identifier, and both forms in descMetadata
        self.dc.content.identifier_list.append(ark_uri)
        desc = self.descMetadata.content
        desc.ark_uri = ark_uri
        desc.ark = short_ark

        # the pid is the noid within the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
Exemple #14
0
    def get_default_pid(self):
        '''Mint the default pid for a new DigitalObject in openemory.

        When a PID manager client is configured, a new ARK is created
        with the object's label as its name, the full ARK uri is added
        to the Dublin Core identifier list, both the uri and the
        short-form ark are stored on the descMetadata content, and the
        returned pid combines the default pidspace with the ARK noid.
        Without a PID manager the parent class implementation is used.'''

        if pidman is None:
            # no pid manager available: fall back to default pid behavior
            return super(DigitalObject, self).get_default_pid()

        # pidman needs a target url for the new pid: reverse the
        # publication view using a placeholder pid built from the pid token
        placeholder_pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        view_path = reverse("publication:view",
                            kwargs={'pid': placeholder_pid})
        # reverse() encodes the PID_TOKEN; undo the encoding for just that part
        view_path = view_path.replace(self.ENCODED_PID_TOKEN, self.PID_TOKEN)
        # reverse() only gives a path - absolutize to add scheme & server
        target = absolutize_url(view_path)

        # a name is optional for pidman but helps when managing pids
        label = self.label
        # request a new ark in the configured pidman domain
        ark_uri = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=label)

        # pidman hands back the full, resolvable ark; split it into parts
        ark_parts = parse_ark(ark_uri)
        naan = ark_parts['naan']  # name authority number
        noid = ark_parts['noid']  # nice opaque identifier
        short_ark = "ark:/%s/%s" % (naan, noid)

        # record the full uri in dc:identifier, and both forms in descMetadata
        self.dc.content.identifier_list.append(ark_uri)
        desc = self.descMetadata.content
        desc.ark_uri = ark_uri
        desc.ark = short_ark

        # the pid is the noid within the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
def upload_for_ht(job, count=1):
    """
    Task to upload files to Box in the background.

    For every KDip of ``job`` whose status is neither ``uploaded`` nor
    ``upload_fail``:

     - on the job's first upload attempt only: mint an ARK for the KDip
       if it has no pid, copy its TIFF / ALTO / OCR files and metadata
       files into the KDip process directory (recording a checksum for
       each), verify the recorded checksums, zip the directory, and
       remove the directory
     - try to upload the KDip, retrying up to 5 times on
       ``ConnectionError``; any other error fails the KDip immediately

    Afterwards ``job.upload_attempts`` is incremented and the job is
    either retried (recursive call), marked ``failed``, or marked
    ``being processed`` with a notification email listing the uploaded
    KDips.

    :param job: job whose KDips should be uploaded
    :param count: decremented and passed on to the recursive call; it is
        not otherwise read in this function
    """
    logger = logging.getLogger(__name__)
    kdip_dir = settings.KDIP_DIR

    for kdip in models.KDip.objects.filter(job__id=job.id).exclude(status='uploaded').exclude(status='upload_fail'):
        # Files are only staged (and PIDs minted) on the first attempt
        if job.upload_attempts == 0:
            # Only create a PID if it doesn't already have one
            if not kdip.pid:
                try:
                    pidman_client = DjangoPidmanRestClient()
                    pidman_domain = settings.PIDMAN_DOMAIN
                    pidman_policy = settings.PIDMAN_POLICY

                    ark = pidman_client.create_ark(domain='{}'.format(pidman_domain),
                                                   target_uri='http://myuri.org',
                                                   policy='{}'.format(pidman_policy),
                                                   name='{}'.format(kdip.kdip_id))

                    noid = parse_ark(ark)['noid']

                    kdip.pid = noid
                    kdip.save()

                    logger.info("Ark {} was created for {}".format(ark, kdip.kdip_id))
                except Exception as e:
                    trace = traceback.format_exc()
                    logger.error("Failed creating an ARK for %s: %s" %
                                 (kdip.kdip_id, e))
                    reason = "Box uplaod failed while making an ARK line 161 " + ' ' + trace
                    print 'ERROR: {}'.format(reason)
                    kdip_fail(job, kdip, reason)

            else:
                logger.info("{} already has pid {}".format(kdip.kdip_id, kdip.pid))

            if not os.path.exists(kdip.process_dir):
                os.makedirs(kdip.process_dir)

            # Gather everything and write the file's checksum to a file via the
            # `checksum` method. Then copy the file to the temp directory.
            # HT does not want sub directories in the package.
            tiffs = glob.glob('{}/{}/TIFF/*.tif'.format(kdip.path, kdip.kdip_id))
            for tiff in tiffs:
                checksumfile(tiff, kdip.process_dir)
                shutil.copy(tiff, kdip.process_dir)

            altos = glob.glob('{}/{}/ALTO/*.xml'.format(kdip.path, kdip.kdip_id))
            for alto in altos:
                checksumfile(alto, kdip.process_dir)
                shutil.copy(alto, kdip.process_dir)
                if 'alto' in alto:
                    # NOTE(review): this moves the *source* ALTO file into the
                    # process dir under a shortened name (page.ext), after it
                    # was already copied above - confirm this is intended.
                    filename = alto.split('/')
                    page, crap, ext = filename[-1].split('.')
                    shutil.move(alto, '{}/{}.{}'.format(kdip.process_dir, page, ext))

            ocrs = glob.glob('{}/{}/OCR/*.txt'.format(kdip.path, kdip.kdip_id))
            for ocr in ocrs:
                checksumfile(ocr, kdip.process_dir)
                shutil.copy(ocr, kdip.process_dir)

            checksumfile(kdip.meta_yml, kdip.process_dir)
            checksumfile(kdip.marc_xml, kdip.process_dir)
            checksumfile(kdip.mets_xml, kdip.process_dir)

            shutil.copy(kdip.meta_yml, kdip.process_dir)
            shutil.copy(kdip.marc_xml, kdip.process_dir)
            shutil.copy(kdip.mets_xml, kdip.process_dir)

            # After copying all the files to the tmp directory. We verify that
            # the checksum matches the one we made before the move. This is done
            # using the `verify()` method.
            with open('{}/checksum.md5'.format(kdip.process_dir)) as f:
                content = f.readlines()
                for line in content:
                    parts = line.split()
                    verify = checksumverify(parts[0], kdip.process_dir, parts[1])
                    if verify is not True:
                        logger.error('Checksum check failes for %s.' %
                                     kdip.process_dir)

            # Make the zip files; the with-block guarantees the archive is
            # closed even if adding files fails part way through
            with zipfile.ZipFile('{}.zip'.format(kdip.process_dir), 'w',
                                 zipfile.ZIP_DEFLATED, allowZip64=True) as zipf:
                # NOTE: changes the process working directory and does not
                # restore it
                os.chdir(kdip.process_dir)
                zipdir('.', zipf)

            # Delete the process directory to save space
            # but we keep the zip file
            shutil.rmtree(kdip.process_dir)

        attempts = 0

        while attempts < 5:

            try:
                # Don't upload if no pid
                if kdip.pid:
                    upload_file(job, kdip)
                else:
                    kdip_fail(job, kdip, '{} has no pid.'.format(kdip.kdip_id))
                break
            except ConnectionError:
                # transient failure: wait and retry, giving up after 5 tries
                trace = traceback.format_exc()
                attempts += 1
                sleep(5)
                reason = 'Connection Error, failed to upload {}.'.format(kdip.kdip_id)
                print 'ERROR: {}'.format(reason)
                kdip.status = 'retry'
                kdip.save()
                if attempts == 5:
                    kdip_fail(job, kdip, reason)
                else:
                    # the format string previously lacked a placeholder for
                    # the traceback, so it was silently dropped from the log
                    logger.error('{} failed to upload on attempt {} : {}'.format(
                        kdip.kdip_id, attempts, trace))

            # every other error is treated as fatal for this kdip:
            # attempts is forced to 5 so the retry loop ends
            except SysCallError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "SSL Error while uploading {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except TypeError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "TypeError in upload package for {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except MemoryError:
                # the traceback is not included in the reason here
                attempts = 5
                reason = "MemoryError for " + kdip.kdip_id
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except Exception as e:
                trace = traceback.format_exc()
                attempts = 5
                reason = "Unexpected error for {}: {}, {}".format(kdip.kdip_id, str(e), trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

    # Check to see if all the KDips uploaded.
    job.upload_attempts = job.upload_attempts + 1
    statuses = job.kdip_set.values_list('status', flat=True)
    if ('retry' in statuses) and (job.upload_attempts < 5):
        return upload_for_ht(job, count - 1)
    elif ('upload_fail' in statuses) and (job.upload_attempts == 5):
        job.status = 'failed'
        job.save()
    elif job.upload_attempts == 5:
        job.status = 'being processed'
        job.save()
        kdip_list = '\n'.join(job.kdip_set.filter(
            status='uploaded').values_list('kdip_id', flat=True))
        logger.info(kdip_list)
        send_to = settings.HATHITRUST_CONTACTS + settings.EMORY_MANAGERS
        send_from = settings.EMORY_CONTACT
        send_mail('New Volumes from Emory have been uploaded', 'The following volumes have been uploaded and are ready:\n\n{}'.format(kdip_list), send_from, send_to, fail_silently=False)
    else:
        return upload_for_ht(job, count - 1)
Exemple #16
0
    def get_new_pid(self, obj):
        # TODO: first, make sure object label is set appropriately before
        # minting new pid or updating an existing one

        # check to see if there are any unused pids in the rushdie collection
        # that can be re-assigned
        unused_pids = pidman.search_pids(
            domain_uri=settings.PIDMAN_RUSHDIE_DOMAIN,
            target=settings.PIDMAN_RUSHDIE_UNUSED_URI)

        total_found = unused_pids.get('results_count', 0)
        logger.debug('Found %d unused rushdie pids' % total_found)

        # if any unused pids were found, use the first one
        if total_found:
            next_pid = unused_pids['results'][0]
            noid = next_pid['pid']

            print 'Found %d unused rushdie pid%s, using %s' % \
                (total_found, 's' if total_found != 1 else '', noid)

            # update pid metadata to reflect the updated object
            # update the ark name to match the current object
            pidman.update_ark(noid=noid, name=obj.label)
            # update the ark target and ensure it is active

            # generate the keep url for this object, using the same logic
            # in keep.common.fedora for minting new pids
            pid = ':'.join([obj.default_pidspace, noid])
            target = reverse(obj.NEW_OBJECT_VIEW, kwargs={'pid': pid})
            # reverse() encodes the PID_TOKEN and the :, so just unquote the url
            # (shouldn't contain anything else that needs escaping)
            target = urllib.unquote(target)
            # absolutize the url to include configured keep domain
            target = absolutize_url(target)
            # update the existing pid with the new Keep url
            pidman.update_ark_target(noid=noid, target_uri=target, active=True)

            ark_uri = next_pid['targets'][0]['access_uri']
            parsed_ark = parse_ark(ark_uri)
            naan = parsed_ark['naan']  # name authority number
            # short form of ark identifier
            ark = 'ark:/%s/%s' % (naan, noid)

            # NOTE: adding to the old object metadata is semi useless,
            # since the old object will not be saved and the migration,
            # but it provides convenient access to ark and ark_uri

            # store the ark in the object metadata
            # (this logic duplicated from base get_default_pid method)
            # if we have a mods datastream, store the ARK as mods:identifier
            if hasattr(obj, 'mods'):
                # store full uri and short-form ark
                obj.mods.content.identifiers.extend([
                    mods.Identifier(type='ark', text=ark),
                    mods.Identifier(type='uri', text=ark_uri)
                ])
            else:
                # otherwise, add full uri ARK to dc:identifier
                obj.dc.content.identifier_list.append(ark_uri)

            # return the pid to be used
            return pid

        else:
            # TEST this: can we use default get next pid for arrangement
            # objects (including email)?
            return obj.get_default_pid()
Exemple #17
0
    def handle(self, *pids, **options):
        '''Update pid manager ARK targets for volume objects.

        Processes the volumes named by ``pids``, or every object with the
        volume content model when no pids are given.  For each existing
        volume whose ``dc:identifier`` is an ARK known to the pid manager,
        the unqualified ARK target is set to ``self.volume_url(obj)`` and
        the ``PDF``-qualified target to ``self.pdf_url(obj)``, both active.
        A one-line summary is written when verbosity is at least normal.

        :param pids: optional pids of the volume objects to process
        :param options: command options; ``dry_run`` (default False) counts
            what would be updated without calling the pid manager update
            method, ``verbosity`` (default ``self.v_normal``) controls
            how much is written to ``self.stdout``
        :raises CommandError: if the pid manager client can't be created
        '''
        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))
        verbose = verbosity >= self.v_normal

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        if pids:
            # pids specified on the command line: only process those objects
            objs = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            # otherwise, look for all volume objects in fedora
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if verbose:
                    self.stdout.write('%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            identifier = obj.dc.content.identifier
            if not is_ark(identifier):
                # nothing to update; counted as processed but not as
                # skipped or updated
                continue

            noid = parse_ark(identifier)['noid']
            try:
                # the result is not needed; the lookup only confirms that
                # the ARK exists in the configured pid manager
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbose:
                    error_msg = 'Error retriving ARK information for %s' % obj.pid
                    if '404: NOT FOUND' in str(err):
                        error_msg += ': Not Found'
                    self.stdout.write(error_msg)
                stats['skipped'] += 1
                continue

            # count as updated in dry run mode (would be updated)
            stats['updated'] += 1
            if not dry_run:
                # update unqualified ark to resolve to readux volume landing page
                pidman.update_ark_target(noid,
                                         target_uri=self.volume_url(obj),
                                         active=True)
                # we expected a qualified ARK target for the PDF; update
                # whether it currently exists or not
                pidman.update_ark_target(noid, 'PDF',
                                         target_uri=self.pdf_url(obj),
                                         active=True)
                # FIXME: catch possible exceptions here?

        # output summary
        if verbose:
            # two-pass formatting: the counts are filled in from stats
            # first, then the plural suffix and the dry-run wording
            msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            msg = msg % ('s' if stats['objs'] != 1 else '', ' would have' if dry_run else '')
            self.stdout.write(msg)
Exemple #18
0
    def handle(self, *pids, **options):
        '''Re-point volume ARKs in the pid manager at their current urls.

        Works on the volumes named by ``pids`` when any are given, and
        otherwise on every object found with the volume content model.
        Objects that do not exist, or whose ARK can't be retrieved from
        the pid manager, are counted as skipped.  For the rest, the
        unqualified ARK target is updated to ``self.volume_url(obj)`` and
        the ``PDF`` qualifier target to ``self.pdf_url(obj)``.

        :param pids: optional pids of the volume objects to process
        :param options: command options; with ``dry_run`` set, the pid
            manager update method is not called but objects are still
            counted as updated; ``verbosity`` (default ``self.v_normal``)
            controls whether messages and the summary are written
        :raises CommandError: if the pid manager client can't be created
        '''
        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))
        show_output = verbosity >= self.v_normal

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        # if pids are specified on command line, only process those objects;
        # otherwise, look for all volume objects in fedora
        if pids:
            objs = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if show_output:
                    self.stdout.write(
                        '%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            dc_identifier = obj.dc.content.identifier
            if not is_ark(dc_identifier):
                # no ARK on this object, so there is nothing to update
                continue

            noid = parse_ark(dc_identifier)['noid']
            try:
                # only checking that the pid manager knows this ARK; the
                # returned information itself is not used
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if show_output:
                    not_found = '404: NOT FOUND' in str(err)
                    self.stdout.write(
                        'Error retriving ARK information for %s%s' %
                        (obj.pid, ': Not Found' if not_found else ''))
                stats['skipped'] += 1
                continue

            # count as updated in dry run mode (would be updated)
            stats['updated'] += 1
            if dry_run:
                continue

            # update unqualified ark to resolve to readux volume landing page
            pidman.update_ark_target(noid,
                                     target_uri=self.volume_url(obj),
                                     active=True)
            # we expected a qualified ARK target for the PDF; update whether
            # it currently exists or not
            pidman.update_ark_target(noid,
                                     'PDF',
                                     target_uri=self.pdf_url(obj),
                                     active=True)
            # FIXME: catch possible exceptions here?

        # output summary
        if show_output:
            # first pass fills in the counts and leaves two %s slots for
            # the plural suffix and the dry-run wording
            msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            msg = msg % ('s' if stats['objs'] != 1 else '',
                         ' would have' if dry_run else '')
            self.stdout.write(msg)
Exemple #19
0
    def get_new_pid(self, obj):
        '''Return the pid to use for ``obj``, re-assigning an unused
        rushdie pid when the pid manager search finds one.

        When an unused pid is found, its ARK name is set to the object
        label, its target is pointed at the object's Keep url and made
        active, and the ARK is added to the object's metadata (MODS
        identifiers if the object has a ``mods`` attribute, otherwise
        ``dc:identifier``).  When none is found, the result of
        ``obj.get_default_pid()`` is returned instead.

        :param obj: object that needs a pid
        :returns: pid to be used for the object
        '''
        # TODO: first, make sure object label is set appropriately before
        # minting new pid or updating an existing one

        # look for unused pids in the rushdie collection that can be
        # re-assigned
        search_results = pidman.search_pids(
            domain_uri=settings.PIDMAN_RUSHDIE_DOMAIN,
            target=settings.PIDMAN_RUSHDIE_UNUSED_URI)

        total_found = search_results.get('results_count', 0)
        logger.debug('Found %d unused rushdie pids' % total_found)

        if not total_found:
            # TEST this: can we use default get next pid for arrangement
            # objects (including email)?
            return obj.get_default_pid()

        # unused pids were found; use the first one
        candidate = search_results['results'][0]
        noid = candidate['pid']

        plural = 's' if total_found != 1 else ''
        print('Found %d unused rushdie pid%s, using %s' % (total_found, plural, noid))

        # update pid metadata to reflect the updated object:
        # make the ark name match the current object
        pidman.update_ark(noid=noid, name=obj.label)

        # generate the keep url for this object, using the same logic
        # in keep.common.fedora for minting new pids
        pid = '%s:%s' % (obj.default_pidspace, noid)
        # reverse() encodes the PID_TOKEN and the :, so just unquote the url
        # (shouldn't contain anything else that needs escaping), then
        # absolutize it to include the configured keep domain
        keep_path = urllib.unquote(
            reverse(obj.NEW_OBJECT_VIEW, kwargs={'pid': pid}))
        keep_url = absolutize_url(keep_path)
        # point the existing pid at the new Keep url and ensure it is active
        pidman.update_ark_target(noid=noid, target_uri=keep_url, active=True)

        ark_uri = candidate['targets'][0]['access_uri']
        # short form of ark identifier, built from the name authority number
        ark = 'ark:/%s/%s' % (parse_ark(ark_uri)['naan'], noid)

        # NOTE: adding to the old object metadata is semi useless,
        # since the old object will not be saved in the migration,
        # but it provides convenient access to ark and ark_uri

        # store the ark in the object metadata
        # (this logic duplicated from base get_default_pid method)
        if hasattr(obj, 'mods'):
            # mods datastream available: store short-form ark and full uri
            # as mods:identifier
            short_form = mods.Identifier(type='ark', text=ark)
            full_uri = mods.Identifier(type='uri', text=ark_uri)
            obj.mods.content.identifiers.extend([short_form, full_uri])
        else:
            # otherwise, add full uri ARK to dc:identifier
            obj.dc.content.identifier_list.append(ark_uri)

        # return the pid to be used
        return pid
def ark_pid(value):
    '''Template filter to display just the pid (noid) portion of an ARK URI.

    :param value: string expected to contain an ARK URI
    :returns: the ``noid`` part of the parsed ARK, or None if the value
        passed in is not recognized as an ARK
    '''
    if not is_ark(value):
        return None
    return parse_ark(value)['noid']