Esempio n. 1
0
    def handle(self, *pids, **options):
        '''Re-point page ARKs in the PID manager at the current page urls.

        Searches the PID manager for ARKs using the old ``/books/pages/``
        url on the current site's domain as the target search term.  For
        each result, the last path segment of the first target uri is used
        as the repository pid of the page; when that page exists, the ARK
        target is updated to ``page.absolute_url``.  Counts of updated,
        missing and failed pages are kept in ``self.stats`` and a summary
        is written to stderr at the end.

        The loop stops early once ``self.interrupted`` is true (presumably
        set by ``self.interrupt_handler``, which is bound to SIGINT here).

        :raises CommandError: if the PID manager client cannot be created
        '''
        # react to Ctrl-C through the command's own handler
        signal.signal(signal.SIGINT, self.interrupt_handler)

        verbosity = int(options.get('verbosity', self.v_normal))

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        old_page_target = '%s/books/pages/' % Site.objects.get_current().domain
        search_args = {'type': 'ark', 'target': old_page_target, 'count': 10}
        # a small first request is enough to learn the total ...
        total = pidman.search_pids(**search_args)['results_count']
        # ... the real processing uses a larger page size
        search_args['count'] = 100
        if verbosity >= self.v_normal:
            print('Found %d total page ARKs with targets to be updated' % total)

        widgets = [Percentage(), ' (', Counter(), ')', Bar(), ETA()]
        pbar = ProgressBar(widgets=widgets, maxval=total).start()

        self.stats = defaultdict(int)
        self.processed = set()
        for ark in self.get_search_results(pidman, search_args):
            self.processed.add(ark['pid'])
            # the fedora pid is the last path segment of the old target uri
            target_uri = ark['targets'][0]['target_uri']
            _base, pid = target_uri.rstrip('/').rsplit('/', 1)
            try:
                page = repo.get_object(pid, type=Page)
                if page.exists:
                    # check if volume exists?
                    pidman.update_ark_target(ark['pid'],
                                             target_uri=page.absolute_url)
                    self.stats['updated'] += 1
                else:
                    # this should probably only happen in dev/qa
                    if verbosity > self.v_normal:
                        self.stderr.write('Page %s does not exist' % pid)
                    self.stats['notfound'] += 1
            except RequestFailed as rf:
                print('Error accessing %s: %s' % (pid, rf))
                self.stats['error'] += 1

            pbar.update(len(self.processed))
            if self.interrupted:
                break

        # leave the progress bar unfinished when the run was cut short
        if not self.interrupted:
            pbar.finish()

        summary = 'Updated %(updated)d, %(error)d error(s), %(notfound)d not found'
        self.stderr.write(summary % self.stats)
Esempio n. 2
0
def update_pid(kdip_pid, ht_url):
    """Point the ARK ``kdip_pid`` at the HathiTrust url ``ht_url``.

    Two PID manager updates are made with the same target uri: one
    without a qualifier and one with the ``HT`` qualifier.

    :param kdip_pid: noid of the ARK to update
    :param ht_url: HathiTrust url to use as the target uri
    """
    pidman = DjangoPidmanRestClient()
    shared = dict(type="ark", noid=kdip_pid, target_uri=ht_url)
    # Update the PID in pidman to the HathiTrust URL.
    pidman.update_target(**shared)
    # Add a new qualifier for HathiTrust.
    pidman.update_target(qualifier="HT", **shared)
Esempio n. 3
0
def generate_ark(ead):
    '''Generate an ARK for the specified EAD document.  ARK will be created
    with a default target of the url for the main page of the specified EAD
    document in this site.

    If the PID manager already has an ARK in the configured domain whose
    target is that url, the existing ARK is returned instead of creating a
    new one.

    :param ead: :class:`findingaids.fa.models.FindingAid` instance
    :returns: resolvable URL for generated ARK on success
    :raises Exception: if the PID manager client cannot be initialized,
        ``PIDMAN_DOMAIN`` is not configured, the PID manager request fails,
        or an existing ARK has no unqualified (default) target
    '''
    # catch init error and report simplified error to user
    try:
        pidclient = DjangoPidmanRestClient()
    except RuntimeError:
        raise Exception("Error initializing PID Manager client; please check site configuration.")

    # check that domain is set
    if not hasattr(settings, 'PIDMAN_DOMAIN'):
        raise Exception("Unable to generate ARK: PID manager domain is not configured.")

    # generate absolute url for ARK target
    ead_url = settings.SITE_BASE_URL.rstrip('/') + reverse(
        'fa:findingaid', kwargs={'id': ead.eadid.value})

    try:
        # search for an existing ARK first, in case one was already created for this ead
        # limit search by the configured domain; look for an ARK with the expected target url
        found = pidclient.search_pids(type='ark', target=ead_url,
                                      domain_uri=settings.PIDMAN_DOMAIN)
        # at least one match
        if found and found['results_count']:
            if found['results_count'] > 1:
                # uh-oh - this shouldn't happen; warn the user
                logger.warning("Found %d ARKs when searching for an existing ARK for %s",
                               found['results_count'], ead.eadid.value)

            # use existing pid
            pid = found['results'][0]
            # find the unqualified target and get the access uri - primary resolvable ark url
            # (if several targets are unqualified, the last one wins, as before)
            ark_url = None
            for t in pid['targets']:
                if not t.get('qualifier'):
                    ark_url = t['access_uri']

            # An existing ARK with only qualified targets previously blew up
            # with an UnboundLocalError; report the problem explicitly instead.
            if ark_url is None:
                raise Exception(
                    'Error generating ARK: existing ARK for %s has no unqualified target'
                    % ead.eadid.value)

            logger.info("Using existing ARK %s for %s", ark_url, ead.eadid.value)
            return ark_url

        # if no matches found, create a new ark and output a flash message to a user about it.
        ark_url = pidclient.create_ark(settings.PIDMAN_DOMAIN, ead_url,
                                       name=unicode(ead.unittitle))
        logger.info("Created a new ARK %s for %s", ark_url, ead.eadid.value)
        return ark_url

    # any error in the pidclient is raised as an HTTPError
    except HTTPError as err:
        raise Exception('Error generating ARK: %s' % err)
    def get_default_pid(self):
        '''Mint a new ARK and derive this object's pid from it.

        Creates an ARK in ``settings.PIDMAN_DOMAIN`` for the
        ``postcards:card`` pid target, labelled with ``self.label``, appends
        the full ARK to the Dublin Core identifier list, and returns a pid
        built from ``self.default_pidspace`` and the last path segment of
        the ARK.

        :returns: pid string of the form ``<pidspace>:<noid>``
        :raises CommandError: if the PID manager client cannot be created
        '''
        # try to configure a pidman client to get pids.
        try:
            pidman = DjangoPidmanRestClient()
        except Exception:
            # only real errors are translated; KeyboardInterrupt/SystemExit
            # are no longer swallowed as they were by a bare ``except:``
            raise CommandError("PIDMAN Not Configured. Plese check localsetting.py")

        target = get_pid_target('postcards:card')
        ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, self.label)
        # everything after the final slash is used as the noid
        noid = ark.rpartition('/')[2]
        pid = '%s:%s' % (self.default_pidspace, noid)
        self.dc.content.identifier_list.append(ark)  # Store local identifiers in DC
        return pid
Esempio n. 5
0
    def handle(self, *args, **options):
        '''Collect the objects to repair and run ``repair_ark`` on each.

        Every pid in ``args`` is loaded as a CollectionObject, or failing
        the content-model check, as an AudioObject; a pid matching neither
        is skipped.  Without any pids, all objects with the collection
        content model are processed.  A summary of repaired / unrepaired
        counts is logged at the end.
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        if args:
            # only the objects named on the command line
            objects = []
            for pid in args:
                try:
                    candidate = repo.get_object(pid=pid, type=CollectionObject)
                    if not candidate.has_requisite_content_models:
                        # not a collection; see whether it is an audio object
                        candidate = repo.get_object(pid=pid, type=AudioObject)
                        if not candidate.has_requisite_content_models:
                            continue
                    objects.append(candidate)
                except Exception:
                    self.log(message="Could not find Collection or Audio object for: %s" % pid)
        else:
            # all collections in the repository, limited to the
            # COLLECTION_CONTENT_MODEL and returned as Keep collection objects
            objects = repo.get_objects_with_cmodel(CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        totals = (self.repaired_count, self.unrepaired_count)
        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" % totals, no_label=True)
Esempio n. 6
0
    def get_default_pid(self):
        '''Default pid logic for DigitalObjects in :mod:`readux`.

        When the module-level ``pidman`` client is available, mint a new
        ARK via the PID manager with this object's absolute url as target,
        add the full ARK uri to the Dublin Core identifiers, and return a
        pid made of the site-configured pidspace and the ARK noid.  If
        creating the ARK fails with :class:`httplib.BadStatusLine`, the
        client is re-initialized and the request is tried once more.

        When ``pidman`` is ``None``, the parent class pid logic is used.
        '''
        global pidman

        # if pidmanager is not available, fall back to default pid behavior
        if pidman is None:
            return super(DigitalObject, self).get_default_pid()

        # pidman wants a target for the new pid; build it from the object's
        # absolute url using a placeholder pid.
        # NOTE: this requires that all values used in a url be set
        # (i.e., page objects must have volume pid configured)
        self.pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
        relative_url = self.get_absolute_url()
        # reverse() encodes the PID_TOKEN and the :, so just unquote the url
        # (shouldn't contain anything else that needs escaping), then
        # absolutize the full path so we get scheme & server also
        target = absolutize_url(urllib.unquote(relative_url))

        # pid name is not required, but helpful for managing pids
        pid_name = self.label
        # ask pidman for a new ark in the configured pidman domain
        try:
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=pid_name)
        except httplib.BadStatusLine:
            logger.warn('Error creating ARK; re-initializing pidman client and trying again')
            pidman = DjangoPidmanRestClient()
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, name=pid_name)

        # pidman returns the full, resolvable ark; the parsed form is a
        # dictionary with nma, naan, and noid (nice opaque identifier)
        noid = parse_ark(ark)['noid']

        # Add full uri ARK to dc:identifier
        self.dc.content.identifier_list.append(ark)

        # use the noid to construct a pid in the configured pidspace
        return '%s:%s' % (self.default_pidspace, noid)
Esempio n. 7
0
    def handle(self, *args, **options):
        '''Gather the objects whose ARKs need repair and repair each one.

        Pids given as ``args`` are tried first as CollectionObject and then
        as AudioObject (based on ``has_requisite_content_models``); with no
        pids, every object with the collection content model is used.  The
        repaired / unrepaired totals are logged when done.
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        if args:
            # restrict processing to the requested pids
            objects = []
            for pid in args:
                try:
                    found = repo.get_object(pid=pid, type=CollectionObject)
                    if not found.has_requisite_content_models:
                        # fall back to treating the pid as an audio object
                        found = repo.get_object(pid=pid, type=AudioObject)
                        if not found.has_requisite_content_models:
                            continue
                    objects.append(found)
                except Exception:
                    not_found = "Could not find Collection or Audio object for: %s"
                    self.log(message=not_found % pid)
        else:
            # get list of all collections from the repository, limited to the
            # COLLECTION_CONTENT_MODEL, as Keep specific collection objects
            objects = repo.get_objects_with_cmodel(
                CollectionObject.COLLECTION_CONTENT_MODEL,
                type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        summary = "\n\n%s ARKs repaired\n%s ARKs were not repaired"
        self.log(message=summary % (self.repaired_count, self.unrepaired_count),
                 no_label=True)
Esempio n. 8
0
    def test_constructor(self):
        'Test init from Django settings.'
        pidman = DjangoPidmanRestClient()

        # base url host is taken from the configured settings
        host = pidman.baseurl['host']
        host_msg = 'Client Base URL %s not expected value.' % pidman.baseurl
        self.assertEqual(host, 'testpidman.library.emory.edu', host_msg)

        # credentials are stored for passing to request
        username, password = pidman._auth
        user_msg = 'Client username %s not the expected value' % username
        self.assertEqual('testuser', username, user_msg)
        pass_msg = 'Client password %s is not expected value' % password
        self.assertEqual('testpass', password, pass_msg)
Esempio n. 9
0
    def get_default_pid(self):
        '''Return a pid for this object, based on a newly minted ARK when possible.

        With a working PID manager client, an ARK is created in
        ``settings.PIDMAN_DOMAIN`` for the ``postcards:card`` pid target
        (labelled with ``self.label``), stored in the Dublin Core
        identifiers, and its last path segment is combined with
        ``self.default_pidspace`` to form the pid.

        If the client cannot be created and ``settings.DEV_ENV`` is true,
        a warning is logged and the parent class pid logic is used.

        :raises CommandError: if the client cannot be created and
            ``settings.DEV_ENV`` is not set
        '''
        # try to configure a pidman client to get pids.
        try:
            pidman = DjangoPidmanRestClient()
        except Exception:
            # catch real errors only, so KeyboardInterrupt/SystemExit are
            # not swallowed the way a bare ``except:`` would
            if getattr(settings, 'DEV_ENV', False):
                logger.warning('Failed to configure PID manager client; default pid logic will be used')
                pidman = None
            else:
                raise CommandError("PID manager is not configured. Please check localsetting.py")

        if pidman:
            target = get_pid_target('postcards:card')
            ark = pidman.create_ark(settings.PIDMAN_DOMAIN, target, self.label)
            # everything after the final slash is used as the noid
            noid = ark.rpartition('/')[2]
            pid = '%s:%s' % (self.default_pidspace, noid)
            # Store local identifiers in DC
            self.dc.content.identifier_list.append(ark)
            return pid

        # if pidmanager is not available, fall back to default pid behavior
        return super(ImageObject, self).get_default_pid()
Esempio n. 10
0
    def get_pidman(self):
        """Create a Pidman client through the DjangoPidmanRestClient wrapper.

            The client is constructed without arguments; the credentials are
            pulled from the application settings.

            :return: a Pidman client to interact with the Pidman APIs
            :rtype: DjangoPidmanRestClient
            :raises CommandError: when client construction raises a
                CommandError; a notice is written to stderr first

        """
        try:
            client = DjangoPidmanRestClient()
        except CommandError as init_err:
            # NOTE(review): only CommandError is handled here - confirm that
            # this is what the client constructor raises on bad configuration
            error_msg = """
            Cannot initialize DjangoPidmanRestClient.
            Please check your configuration for more details.
            """
            sys.stderr.write(error_msg)
            raise CommandError(init_err)
        else:
            return client
Esempio n. 11
0
    def handle(self, batch_id=None, folder_path=None, verbosity=1, noact=False,
               max_ingest=None, skip_purge=False, purge_only=False, *args, **options):
        '''Replace the email objects of a processing batch from a Eudora folder.

        Validates the arguments, connects to the repository (optionally with
        the ``username`` / ``password`` given in ``options``), checks that
        the batch exists and that a PID manager client can be created, then
        purges the batch's old email arrangement objects (unless
        ``skip_purge``) and ingests email from ``folder_path`` (unless
        ``purge_only``).

        :param batch_id: pid of the processing batch (required)
        :param folder_path: base path of the Eudora folder (required)
        :param verbosity: output level; coerced to int
        :param noact: stored as ``self.noact`` for use by the other methods
        :param max_ingest: optional limit, stored as int on ``self.max_ingest``
        :param skip_purge: do not remove existing arrangement emails
        :param purge_only: do not ingest new email objects
        :raises CommandError: on missing/invalid arguments, unknown batch,
            or PID manager configuration problems
        '''
        # check batch object
        if batch_id is None:
            raise CommandError('Processing batch id is required')
        self.verbosity = int(verbosity)  # ensure we compare int to int
        if max_ingest is not None:
            self.max_ingest = int(max_ingest)

        # check folder path
        if folder_path is None:
            raise CommandError('Eudora folder base path is required')
        if not os.path.isdir(folder_path):
            raise CommandError('Eudora folder path "%s" is not a directory' % folder_path)
        self.noact = noact

        # check for any specified fedora credentials
        fedora_opts = {}
        for credential in ('username', 'password'):
            if credential in options:
                fedora_opts[credential] = options[credential]
        self.repo = Repository(**fedora_opts)
        batch = self.repo.get_object(batch_id, type=ProcessingBatch)
        if not batch.exists:
            raise CommandError('Processing batch %s not found' % batch_id)
        print('Looking for email messages in processing batch "%s"'
              % batch.label)

        # The client object is not used in this method; constructing it
        # presumably serves as an early check of the PID manager settings.
        try:
            DjangoPidmanRestClient()
        except Exception:
            # narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are not reported as configuration errors
            raise CommandError('Error initializing PID manager client; ' +
                               'please check settings.')

        self.stats = defaultdict(int)
        # purge old metadata email 'arrangement' objects that belong to this batch
        if not skip_purge:
            self.remove_arrangement_emails(batch)
        # ingest new objects for email mailboxes & messages
        if not purge_only:
            self.ingest_email(folder_path)
Esempio n. 12
0
    def get_default_pid(self):
        '''Return a pid that re-uses an unused ARK when one is available.

        Unused ARKs are looked up in the PID manager by searching for the
        ``UNUSED_PID_URL`` target; the results are cached on
        ``self._unused_pid_result`` and consumed one at a time.  For the
        ARK that is taken, the PID manager record is renamed to
        ``self.label`` and its default target is pointed at the
        ``NEW_OBJECT_VIEW`` url for the new pid.  The ARK is stored in
        MODS (when the object has a ``mods`` datastream) and always in
        Dublin Core.

        When no unused ARKs are left, the parent class pid logic is used.

        :returns: pid string of the form ``<pidspace>:<noid>``
        '''
        # Create the client unconditionally: it is needed for the updates
        # below even when unused pids are already cached from an earlier
        # call.  Previously it was only bound inside the search branch, so a
        # populated cache led to an UnboundLocalError at update_ark().
        pidman = DjangoPidmanRestClient()

        if not self._unused_pid_result:
            result = pidman.search_pids(target=UNUSED_PID_URL)
            # if any were found, use results
            if result and result['results_count']:
                self._unused_pid_result = result['results']

        # if we run out of pids to re-use, fall back to default behavior
        if not self._unused_pid_result:
            return super(PidReuseDigitalObject, self).get_default_pid()

        # pop one unused pid off the cached list and use it
        pid_info = self._unused_pid_result.pop()
        ark = pid_info['targets'][0]['access_uri']
        parsed_ark = parse_ark(ark)
        naan = parsed_ark['naan']  # name authority number
        noid = parsed_ark['noid']  # nice opaque identifier

        # use noid as basis for new pid
        pid = '%s:%s' % (self.default_pidspace, noid)
        # calculate target to new object
        target = reverse(self.NEW_OBJECT_VIEW, kwargs={'pid': pid})
        # reverse() returns a full path - absolutize so we get scheme & server also
        target = absolutize_url(target)
        # update pid ark label from object
        pidman.update_ark(noid, name=self.label)
        # update default ark target for new object url
        pidman.update_ark_target(noid, target_uri=target, active=True)

        # if we have a mods datastream, store the ARK as mods:identifier
        if hasattr(self, 'mods'):
            # store full uri and short-form ark
            self.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=ark)
            ])

        # always add full uri ARK to dc:identifier
        self.dc.content.identifier_list.append(ark)

        # the pid in the configured pidspace, built from the noid above
        return pid
Esempio n. 13
0
def upload_for_ht(job, count=1):
    """
    Task to package a job's KDips and upload them to Box in the background.

    For every KDip of ``job`` that is not yet ``uploaded`` or
    ``upload_fail``:

    * on the job's first upload attempt only, make sure the KDip has an ARK
      (creating one through the PID manager if needed), copy its TIFF, ALTO,
      OCR and metadata files plus checksums into ``kdip.process_dir``,
      verify the checksums, zip the directory and remove it;
    * try the upload, retrying up to 5 times on ``ConnectionError``; any
      other error fails the KDip immediately.

    Afterwards ``job.upload_attempts`` is incremented and, depending on the
    KDip statuses, the function either calls itself again, marks the job
    ``failed``, or marks it ``being processed`` and emails the list of
    uploaded volumes.

    :param job: the job whose KDips should be uploaded
    :param count: only passed along (decremented) on the recursive calls;
        it is not otherwise read here
    """
    logger = logging.getLogger(__name__)

    for kdip in models.KDip.objects.filter(job__id=job.id).exclude(status='uploaded').exclude(status='upload_fail'):
        # Packaging only happens on the job's first upload attempt
        if job.upload_attempts == 0:
            # Only create a PID if it doesn't already have one
            if not kdip.pid:
                try:
                    pidman_client = DjangoPidmanRestClient()
                    pidman_domain = settings.PIDMAN_DOMAIN
                    pidman_policy = settings.PIDMAN_POLICY

                    ark = pidman_client.create_ark(domain='{}'.format(pidman_domain),
                                                   target_uri='http://myuri.org',
                                                   policy='{}'.format(pidman_policy),
                                                   name='{}'.format(kdip.kdip_id))

                    noid = parse_ark(ark)['noid']

                    kdip.pid = noid
                    kdip.save()

                    logger.info("Ark {} was created for {}".format(ark, kdip.kdip_id))
                except Exception as e:
                    trace = traceback.format_exc()
                    logger.error("Failed creating an ARK for %s: %s",
                                 kdip.kdip_id, e)
                    reason = "Box uplaod failed while making an ARK line 161 " + ' ' + trace
                    print('ERROR: {}'.format(reason))
                    kdip_fail(job, kdip, reason)

            else:
                logger.info("{} already has pid {}".format(kdip.kdip_id, kdip.pid))

            if not os.path.exists(kdip.process_dir):
                os.makedirs(kdip.process_dir)

            # Gather everything and write each file's checksum to a file via
            # `checksumfile`. Then copy the file to the temp directory.
            # HT does not want sub directories in the package.
            tiffs = glob.glob('{}/{}/TIFF/*.tif'.format(kdip.path, kdip.kdip_id))
            for tiff in tiffs:
                checksumfile(tiff, kdip.process_dir)
                shutil.copy(tiff, kdip.process_dir)

            altos = glob.glob('{}/{}/ALTO/*.xml'.format(kdip.path, kdip.kdip_id))
            for alto in altos:
                checksumfile(alto, kdip.process_dir)
                shutil.copy(alto, kdip.process_dir)
                if 'alto' in alto:
                    # NOTE(review): this moves the *source* file into the
                    # process dir under a name without the middle part of a
                    # three-part file name - confirm this is intended
                    filename = alto.split('/')
                    page, crap, ext = filename[-1].split('.')
                    shutil.move(alto, '{}/{}.{}'.format(kdip.process_dir, page, ext))

            ocrs = glob.glob('{}/{}/OCR/*.txt'.format(kdip.path, kdip.kdip_id))
            for ocr in ocrs:
                checksumfile(ocr, kdip.process_dir)
                shutil.copy(ocr, kdip.process_dir)

            checksumfile(kdip.meta_yml, kdip.process_dir)
            checksumfile(kdip.marc_xml, kdip.process_dir)
            checksumfile(kdip.mets_xml, kdip.process_dir)

            shutil.copy(kdip.meta_yml, kdip.process_dir)
            shutil.copy(kdip.marc_xml, kdip.process_dir)
            shutil.copy(kdip.mets_xml, kdip.process_dir)

            # After copying all the files to the tmp directory, verify that
            # each checksum matches the one recorded before the copy.
            with open('{}/checksum.md5'.format(kdip.process_dir)) as f:
                content = f.readlines()
                for line in content:
                    parts = line.split()
                    verify = checksumverify(parts[0], kdip.process_dir, parts[1])
                    if verify is not True:
                        logger.error('Checksum check failes for %s.',
                                     kdip.process_dir)

            # Make the zip file; close it even if zipping fails part way
            zipf = zipfile.ZipFile('{}.zip'.format(kdip.process_dir), 'w', zipfile.ZIP_DEFLATED, allowZip64=True)
            try:
                os.chdir(kdip.process_dir)
                zipdir('.', zipf)
            finally:
                zipf.close()

            # Delete the process directory to save space
            # but we keep the zip file
            shutil.rmtree(kdip.process_dir)

        attempts = 0

        while attempts < 5:

            try:
                # Don't upload if no pid
                if kdip.pid:
                    upload_file(job, kdip)
                else:
                    kdip_fail(job, kdip, '{} has no pid.'.format(kdip.kdip_id))
                break
            except ConnectionError:
                trace = traceback.format_exc()
                attempts += 1
                sleep(5)
                reason = 'Connection Error, failed to upload {}.'.format(kdip.kdip_id)
                print('ERROR: {}'.format(reason))
                kdip.status = 'retry'
                kdip.save()
                if attempts == 5:
                    kdip_fail(job, kdip, reason)
                else:
                    # the traceback is now part of the message; it used to be
                    # passed to format() without a placeholder and was dropped
                    logger.error(
                        '{} failed to upload on attempt {} : {}'.format(kdip.kdip_id, attempts, trace))

            except SysCallError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "SSL Error while uploading {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except TypeError:
                trace = traceback.format_exc()
                attempts = 5
                reason = "TypeError in upload package for {}: {}".format(kdip.kdip_id, trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except MemoryError:
                attempts = 5
                reason = "MemoryError for " + kdip.kdip_id
                logger.error(reason)
                kdip_fail(job, kdip, reason)

            except Exception as e:
                trace = traceback.format_exc()
                attempts = 5
                reason = "Unexpected error for {}: {}, {}".format(kdip.kdip_id, str(e), trace)
                logger.error(reason)
                kdip_fail(job, kdip, reason)

    # Check to see if all the KDips uploaded.
    job.upload_attempts = job.upload_attempts + 1
    statuses = job.kdip_set.values_list('status', flat=True)
    if ('retry' in statuses) and (job.upload_attempts < 5):
        return upload_for_ht(job, count - 1)
    elif ('upload_fail' in statuses) and (job.upload_attempts == 5):
        job.status = 'failed'
        job.save()
    elif job.upload_attempts == 5:
        job.status = 'being processed'
        job.save()
        kdip_list = '\n'.join(job.kdip_set.filter(
            status='uploaded').values_list('kdip_id', flat=True))
        logger.info(kdip_list)
        send_to = settings.HATHITRUST_CONTACTS + settings.EMORY_MANAGERS
        send_from = settings.EMORY_CONTACT
        send_mail('New Volumes from Emory have been uploaded', 'The following volumes have been uploaded and are ready:\n\n{}'.format(kdip_list), send_from, send_to, fail_silently=False)
    else:
        return upload_for_ht(job, count - 1)
Esempio n. 14
0
class Command(BaseCommand):
    '''Repair missing ARKs for :class:`~keep.collection.models.CollectionObject` objects
    based on the correct ARK from PIDMAN.

    '''
    args = '[PID [PID...]]'
    help = '''Repair ARKs on Keep Collections or Audio objects.
    Optionally accepts a list of PIDs to be repaired.  If no pids are specified,
    will find all collection objects and attempt to repair them.'''

    option_list = BaseCommand.option_list + (make_option(
        '--dry-run',
        dest='dry_run',
        action='store_true',
        default=False,
        help='Report which ARKs would be repaired'), )

    def handle(self, *args, **options):
        '''Collect the objects to process and call :meth:`repair_ark` on each.

        Pids given as ``args`` are loaded as CollectionObject or, failing
        the content-model check, as AudioObject; with no pids, all objects
        with the collection content model are used.  A summary of the
        repaired / unrepaired counts is logged at the end.
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        # populate list of objects to be processed
        objects = []
        for pid in args:
            try:
                obj = repo.get_object(pid=pid, type=CollectionObject)
                if obj.has_requisite_content_models:
                    objects.append(obj)
                else:
                    obj = repo.get_object(pid=pid, type=AudioObject)
                    if obj.has_requisite_content_models:
                        objects.append(obj)
            except Exception:
                self.log(
                    message="Could not find Collection or Audio object for: %s"
                    % pid)

        # get list of all collections from the repository
        # limited to the COLLECTION_CONTENT_MODEL as well as returns a Keep specific collection object
        if not args:
            objects = repo.get_objects_with_cmodel(
                CollectionObject.COLLECTION_CONTENT_MODEL,
                type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" %
                 (self.repaired_count, self.unrepaired_count),
                 no_label=True)

    def repair_ark(self, obj):
        '''Add the ARK recorded in the PID manager to ``obj`` and save it.

        The unqualified ARK target for ``obj.noid`` is looked up in the PID
        manager.  Its access uri is stored as MODS identifiers when the
        object has a ``mods`` datastream, otherwise in the Dublin Core
        identifier list.  In dry-run mode the object is not saved.

        Updates ``self.repaired_count`` / ``self.unrepaired_count``.
        '''
        try:
            ark_target = self.pidman.get_ark_target(noid=obj.noid,
                                                    qualifier='')
        except Exception:
            # narrowed from a bare ``except:`` so that KeyboardInterrupt /
            # SystemExit can still stop a long-running repair
            self.unrepaired_count += 1
            self.log(level=WARNING,
                     message="Failed to find ARK target for %s" % (obj.pid))
            return

        access_uri = ark_target['access_uri']
        parsed_ark = parse_ark(access_uri)
        naan = parsed_ark['naan']
        noid = parsed_ark['noid']

        if hasattr(obj, 'mods'):
            obj.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=access_uri)
            ])
        else:
            # append to the list; the previous code called the list itself,
            # which raises TypeError for objects without MODS
            obj.dc.content.identifier_list.append(access_uri)

        if self.options['dry_run']:
            self.unrepaired_count += 1
            self.log(message='ARK target found for %s' % obj.pid)
            return

        # save the collection object w/ updated ark
        try:
            self.log(level=INFO, message="Attempting to save %s" % obj.pid)
            obj.save(logMessage='Fixing missing ARK')
            self.repaired_count += 1
        except DigitalObjectSaveFailure:
            # a failed save is an unrepaired ARK; count it for the summary
            self.unrepaired_count += 1
            self.log(message="An error occurred while saving %s" % (obj.pid))

    def log(self, level=INFO, message='', no_label=False):
        '''
        Convenience log function. WARNING level is only logged if the --verbosity flag is set to 2.
        INFO level is default and always logged. no_label can be set to True if a WARNING or INFO label
        is not desired.
        '''
        if level == WARNING and int(self.options['verbosity']) != WARNING:
            return
        output_str = ''
        if not no_label:
            output_str = '%s: ' % LOG_LEVEL[level]
        print("%s%s" % (output_str, message))
Esempio n. 15
0
    def handle(self, *pids, **options):
        '''Update PID manager ARK targets for volumes.

        Processes the volumes named by ``pids``, or every object with the
        volume content model when none are given.  For each existing volume
        whose Dublin Core identifier is an ARK known to the PID manager, the
        unqualified target is set to ``self.volume_url(obj)`` and the
        ``PDF``-qualified target to ``self.pdf_url(obj)``.  With the
        ``dry_run`` option nothing is changed in the PID manager, but the
        volume is still counted as updated.  A summary is written to stdout
        at normal verbosity or above.

        :raises CommandError: if the PID manager client cannot be created
        '''
        verbosity = int(options.get('verbosity', self.v_normal))
        dry_run = options.get('dry_run', False)
        report = verbosity >= self.v_normal

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # error if pid manager config options not in localsettings
            raise CommandError(err)

        if pids:
            # only process the objects specified on the command line
            objs = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            # otherwise, look for all volume objects in fedora
            objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                type=Volume)

        stats = defaultdict(int)
        for obj in objs:
            if not obj.exists:
                if report:
                    self.stdout.write(
                        '%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            identifier = obj.dc.content.identifier
            if not is_ark(identifier):
                # nothing to update for objects without an ARK identifier
                continue

            noid = parse_ark(identifier)['noid']
            try:
                # only used to confirm the ARK exists in this pid manager
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if report:
                    if '404: NOT FOUND' in str(err):
                        template = 'Error retriving ARK information for %s: Not Found'
                    else:
                        template = 'Error retriving ARK information for %s'
                    self.stdout.write(template % obj.pid)
                stats['skipped'] += 1
                continue

            if dry_run:
                # count as updated in dry run mode (would be updated)
                stats['updated'] += 1
                continue

            # update unqualified ark to resolve to readux volume landing page
            pidman.update_ark_target(noid,
                                     target_uri=self.volume_url(obj),
                                     active=True)
            stats['updated'] += 1
            # we expected a qualified ARK target for the PDF; update whether
            # it currently exists or not
            # FIXME: catch possible exceptions here?
            pidman.update_ark_target(noid,
                                     'PDF',
                                     target_uri=self.pdf_url(obj),
                                     active=True)

        # output summary
        if report:
            summary = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            plural = '' if stats['objs'] == 1 else 's'
            mood = ' would have' if dry_run else ''
            self.stdout.write(summary % (plural, mood))
Esempio n. 16
0
class Command(BaseCommand):
    '''Repair missing ARKs for :class:`~keep.collection.models.CollectionObject` objects
    based on the correct ARK from PIDMAN.

    For each object the unqualified ARK target is looked up in the pid
    manager by the object's noid; the resulting ARK is added to the object's
    identifiers and the object is saved (unless running with ``--dry-run``).
    '''
    args = '[PID [PID...]]'
    help = '''Repair ARKs on Keep Collections or Audio objects.
    Optionally accepts a list of PIDs to be repaired.  If no pids are specified,
    will find all collection objects and attempt to repair them.'''

    option_list = BaseCommand.option_list + (
        make_option('--dry-run',
            dest='dry_run',
            action='store_true',
            default=False,
            help='Report which ARKs would be repaired'),
        )

    def handle(self, *args, **options):
        '''Collect the objects to process and attempt to repair each one.

        :param args: optional pids; each is loaded as a collection object
            and, if it lacks the required content models, as an audio object
        :param options: command options; ``dry_run`` and ``verbosity`` are
            read by the other methods of this class
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        if args:
            # only process the objects requested on the command line
            objects = []
            for pid in args:
                try:
                    obj = repo.get_object(pid=pid, type=CollectionObject)
                    if obj.has_requisite_content_models:
                        objects.append(obj)
                    else:
                        obj = repo.get_object(pid=pid, type=AudioObject)
                        if obj.has_requisite_content_models:
                            objects.append(obj)
                        else:
                            # neither a collection nor an audio object:
                            # report it rather than dropping it silently
                            self.log(message="Could not find Collection or Audio object for: %s" % pid)
                except Exception:
                    self.log(message="Could not find Collection or Audio object for: %s" % pid)
        else:
            # get list of all collections from the repository, limited to
            # the COLLECTION_CONTENT_MODEL and initialized as Keep
            # collection objects
            objects = repo.get_objects_with_cmodel(CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" % (self.repaired_count, self.unrepaired_count), no_label=True)

    def repair_ark(self, obj):
        '''Look up the ARK for a single object in the pid manager, add it to
        the object's identifiers, and save the object.

        Updates ``self.repaired_count`` / ``self.unrepaired_count``.  In
        dry-run mode the object is never saved and is counted as unrepaired.

        :param obj: repository object to repair; must provide ``noid`` and
            ``pid`` and either a ``mods`` or a ``dc`` datastream
        '''
        try:
            ark_target = self.pidman.get_ark_target(noid=obj.noid, qualifier='')
        except Exception:
            # any lookup failure means this object cannot be repaired; catch
            # Exception (not a bare except) so Ctrl-C still stops the script
            self.unrepaired_count += 1
            self.log(level=WARNING, message="Failed to find ARK target for %s" % (obj.pid))
            return

        access_uri = ark_target['access_uri']
        parsed_ark = parse_ark(access_uri)
        naan = parsed_ark['naan']
        noid = parsed_ark['noid']

        if hasattr(obj, 'mods'):
            obj.mods.content.identifiers.extend([
                mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
                mods.Identifier(type='uri', text=access_uri)
                ])
        else:
            # NOTE(review): identifier_list is presumably an eulxml list
            # field, which is not callable; append the ARK uri to it instead
            # of calling it -- confirm against the DC model
            obj.dc.content.identifier_list.append(access_uri)

        if self.options['dry_run']:
            self.unrepaired_count += 1
            self.log(message='ARK target found for %s' % obj.pid)
            return

        # save the object w/ updated ark
        try:
            self.log(level=INFO, message="Attempting to save %s" % obj.pid)
            obj.save(logMessage='Fixing missing ARK')
            self.repaired_count += 1
        except DigitalObjectSaveFailure:
            # a failed save is an unrepaired object; count it so the final
            # summary adds up
            self.unrepaired_count += 1
            self.log(message="An error occurred while saving %s" % (obj.pid))

    def log(self, level=INFO, message='', no_label=False):
        '''
        Convenience log function. WARNING level is only logged if the --verbosity flag is set to 2.
        INFO level is default and always logged. no_label can be set to True if a WARNING or INFO label
        is not desired.

        :param level: log level constant, used to look up the label in
            ``LOG_LEVEL``
        :param message: text to print
        :param no_label: when true, print the message without a level label
        '''
        if level == WARNING and int(self.options['verbosity']) != WARNING:
            return
        output_str = ''
        if not no_label:
            output_str = '%s: ' % LOG_LEVEL[level]
        # single parenthesized argument: same output under the python 2
        # print statement and the python 3 print function
        print("%s%s" % (output_str, message))
Esempio n. 17
0
    def handle(self, *pids, **options):
        '''Update page ARKs that still target the old ``/books/pages/`` url.

        Searches the pid manager for ARKs by the old page target url of the
        current site, takes the repository pid from the last path segment
        of each ARK's first target, and points the ARK at the page's
        ``absolute_url`` when the page exists.  Counts of updated, errored
        and not-found pages are written to stderr at the end.  The loop
        stops early once ``self.interrupted`` is true.

        :param pids: not used by this method
        :param options: command options; only ``verbosity`` is read
        :raises CommandError: if the pid manager client cannot be created
        '''
        # route SIGINT to our own handler
        signal.signal(signal.SIGINT, self.interrupt_handler)

        verbosity = int(options.get('verbosity', self.v_normal))

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # pid manager config options are missing from localsettings
            raise CommandError(err)

        old_page_target = '%s/books/pages/' % Site.objects.get_current().domain
        search_args = dict(type='ark', target=old_page_target, count=10)
        # a small first request is enough to learn the total
        total = pidman.search_pids(**search_args)['results_count']
        # use a larger page size for the real processing
        search_args['count'] = 100
        if verbosity >= self.v_normal:
            print('Found %d total page ARKs with targets to be updated' % total)

        progress_widgets = [Percentage(), ' (', Counter(), ')', Bar(), ETA()]
        pbar = ProgressBar(widgets=progress_widgets, maxval=total).start()

        self.stats = defaultdict(int)
        self.processed = set()
        for ark in self.get_search_results(pidman, search_args):
            ark_pid = ark['pid']
            self.processed.add(ark_pid)
            # the fedora pid is the last path segment of the target uri
            target_uri = ark['targets'][0]['target_uri']
            _, pid = target_uri.rstrip('/').rsplit('/', 1)
            try:
                page = repo.get_object(pid, type=Page)
                if page.exists:
                    # check if volume exists?
                    pidman.update_ark_target(ark_pid,
                                             target_uri=page.absolute_url)
                    self.stats['updated'] += 1
                else:
                    # this should probably only happen in dev/qa
                    if verbosity > self.v_normal:
                        self.stderr.write('Page %s does not exist' % pid)
                    self.stats['notfound'] += 1
            except RequestFailed as rf:
                print('Error accessing %s: %s' % (pid, rf))
                self.stats['error'] += 1

            pbar.update(len(self.processed))
            if self.interrupted:
                break

        if not self.interrupted:
            pbar.finish()
        # summarize
        summary = 'Updated %(updated)d, %(error)d error(s), %(notfound)d not found'
        self.stderr.write(summary % self.stats)
Esempio n. 18
0
    def remove_arrangement_emails(self, batch):
        '''Purge the email message objects that belong to a batch.

        Walks every member of ``batch`` (via its ``hasMember`` relations),
        treats an item as an email message when its first file path matches
        ``self.email_path_regex`` and it has no md5 checksum, then marks the
        item's ARK as unused in the pid manager and purges the object from
        the repository.  In no-act mode (``self.noact``) items are only
        counted.  Counts are kept in ``self.stats`` and a summary is printed
        at normal verbosity or above.

        :param batch: batch object whose members should be checked
        '''
        members = list(batch.rels_ext.content.objects(batch.uriref,
                                                      relsext.hasMember))
        for member in members:
            # for now, init as arrangement objects
            obj = self.repo.get_object(str(member), type=ArrangementObject)
            # NOTE: in dev/test the collection references all items but
            # only a handful exist in the dev/test repo; just skip those
            if not obj.exists:
                continue

            # number of objects
            self.stats['count'] += 1

            filetech = obj.filetech
            if not (filetech.exists and filetech.content.file):
                print('Error: no file tech for %s; skipping' % obj.pid)
                continue

            # 5300c email messages should only have one file path; an email
            # message has a path starting with the email folder name and
            # no checksum
            file_info = filetech.content.file[0]
            is_email = re.match(self.email_path_regex, file_info.path) \
                and not file_info.md5
            if not is_email:
                # not an email message - skip to next item
                continue

            self.stats['email'] += 1

            # in no-act mode there is nothing else to do
            if self.noact:
                continue

            # not in no-act mode: update the pid, then purge the object
            try:
                # reinit client as a workaround for pidman errors (?)
                pidman = DjangoPidmanRestClient()
                # update ark name/domain
                pidman.update_ark(obj.noid,
                                  name=UNUSED_PID_NAME,
                                  domain=settings.PIDMAN_DOMAIN)
                # mark default target as inactive
                pidman.update_ark_target(obj.noid, active=False,
                                         target_uri=UNUSED_PID_URL)
                self.stats['pids'] += 1
                if self.verbosity > self.v_normal:
                    print('Updated ARK for %s' % obj.noid)

            except Exception as e:
                print('Error updating ARK for %s: %s' % (obj.noid, e))

            # purge record (attempted even if the ARK update failed)
            try:
                self.repo.purge_object(
                    obj.pid,
                    'removing metadata arrangement 5300c email record')
                self.stats['purged'] += 1
                if self.verbosity > self.v_normal:
                    print('Purged %s' % obj.pid)

            except RequestFailed as e:
                self.stats['purge_error'] += 1
                print('Error purging %s: %s' % (obj.pid, e))

        # summary
        if self.verbosity >= self.v_normal:
            print('''\nChecked %(count)d records, found %(email)d emails''' % self.stats)
            if not self.noact:
                print('Updated %(pids)d ARK(s); purged %(purged)d objects, error purging %(purge_error)d objects'
                      % self.stats)
Esempio n. 19
0
from keep.collection.models import SimpleCollection
from keep.common.models import PremisFixity, PremisObject, PremisEvent
from keep.common.fedora import ArkPidDigitalObject, Repository
from keep.common.utils import solr_interface
from keep.collection.models import CollectionObject
from keep.file.utils import sha1sum

logger = logging.getLogger(__name__)

# content models currently used for xacml access / restriction
ACCESS_ALLOWED_CMODEL = "info:fedora/emory-control:ArrangementAccessAllowed-1.0"
ACCESS_RESTRICTED_CMODEL = "info:fedora/emory-control:ArrangementAccessRestricted-1.0"

# Try to configure a module-level pidman client to get pids.
try:
    pidman = DjangoPidmanRestClient()
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit raised during import are never swallowed.
    # if we're in dev mode then we can fall back on the fedora default
    # pid allocator. in non-dev, though, we really need pidman
    if getattr(settings, 'DEV_ENV', False):
        pidman = None
    else:
        raise

# FIXME: what about this one ? emory-control:RushdieResearcherAllowed-1.0


class Arrangement(models.Model):
    'Place-holder DB model to define permissions for "arrangement" objects'

    class Meta:
Esempio n. 20
0
    def handle(self, *pids, **options):
        '''Update volume ARKs in the pid manager.

        For every volume whose DC identifier is an ARK known to the pid
        manager, the unqualified target is set to ``self.volume_url(obj)``
        and the ``PDF`` qualified target to ``self.pdf_url(obj)``.  With the
        ``dry_run`` option nothing is changed in the pid manager, but
        volumes are still counted as updated.  A summary is written to
        stdout at normal verbosity or above.

        :param pids: optional pids of the volumes to process; when empty,
            all objects with the volume content model are processed
        :param options: command options; ``dry_run`` and ``verbosity`` are
            read
        :raises CommandError: if the pid manager client cannot be created
        '''
        dry_run = options.get('dry_run', False)
        verbosity = int(options.get('verbosity', self.v_normal))

        repo = Repository()
        try:
            pidman = DjangoPidmanRestClient()
        except Exception as err:
            # pid manager config options are missing from localsettings
            raise CommandError(err)

        if pids:
            # only process the objects named on the command line
            volumes = [repo.get_object(pid, type=Volume) for pid in pids]
        else:
            # otherwise, look for all volume objects in fedora
            volumes = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                                   type=Volume)

        stats = defaultdict(int)
        for obj in volumes:
            if not obj.exists:
                if verbosity >= self.v_normal:
                    self.stdout.write('%s does not exist or is not accessible' % obj.pid)
                stats['skipped'] += 1
                continue

            stats['objs'] += 1
            identifier = obj.dc.content.identifier
            if not is_ark(identifier):
                # counted as processed, but neither skipped nor updated
                continue

            noid = parse_ark(identifier)['noid']
            try:
                # only used to confirm the pid manager knows this ARK
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbosity >= self.v_normal:
                    if '404: NOT FOUND' in str(err):
                        template = 'Error retriving ARK information for %s: Not Found'
                    else:
                        template = 'Error retriving ARK information for %s'
                    self.stdout.write(template % obj.pid)
                stats['skipped'] += 1
                continue

            # unqualified ark should resolve to the readux volume landing page
            if not dry_run:
                pidman.update_ark_target(noid,
                                         target_uri=self.volume_url(obj),
                                         active=True)

            # count as updated even in dry run mode (would be updated)
            stats['updated'] += 1

            # a qualified ARK target is expected for the PDF; update it
            # whether it currently exists or not
            if not dry_run:
                pidman.update_ark_target(noid, 'PDF',
                                         target_uri=self.pdf_url(obj),
                                         active=True)
                # FIXME: catch possible exceptions here?

        # output summary
        if verbosity >= self.v_normal:
            summary = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
            plural = 's' if stats['objs'] != 1 else ''
            would_have = ' would have' if dry_run else ''
            self.stdout.write(summary % (plural, would_have))
Esempio n. 21
0
    def handle(self, *pids, **options):
        '''Write a CSV report of current and proposed ARK target urls for
        every scanned volume and each of its pages.

        For each object the ARK is looked up in the pid manager by noid
        (the part of the pid after the colon), the first target uri is
        read, and a rewritten uri is computed by applying a fixed sequence
        of string substitutions.  Nothing is changed in the pid manager:
        the update calls are still TODO and remain commented out.  The
        report is written to a timestamped file under ``ecds/``; lookup
        errors are written to the same file and the object is skipped.

        :param pids: not used by this method
        :param options: not used by this method
        '''
        # testPid
        # settings.PIDMAN_HOST = 'https://testpid.library.emory.edu/'  # the web root where we'll ask for pids
        # settings.PIDMAN_USER = ''
        # settings.PIDMAN_PASSWORD = ''
        # settings.PIDMAN_DOMAIN = 'https://testpid.library.emory.edu/domains/18/'  # default domain (e.g. when minting pids)

        # prodPid
        # PIDMAN_HOST = 'https://pidqas.library.emory.edu/'

        # get a pidman client
        client = DjangoPidmanRestClient()

        # testFedora
        repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_MANAGEMENT_PASSWORD)

        # prodFedora
        #repo = Repository('https://fedora.library.emory.edu:8443/fedora/', username='******', password='******')

        vol_list = repo.get_objects_with_cmodel('info:fedora/emory-control:ScannedVolume-1.0')
        found_msg = "Found " + str(len(vol_list)) + " books."
        print(found_msg)

        # (old, new) substitutions applied IN ORDER to a volume target uri;
        # str.replace is a no-op when the old text is absent, so no
        # separate find() check is needed
        volume_rewrites = (
            ("emory%3A", "emory:"),
            ("readux%3A", "emory:"),
            ("readux:", "emory:"),
            ("webprd001.library.emory.edu/readux", "testreadux.ecds.emory.edu"),
            ("webprd001.library.emory.edu/", "testreadux.ecds.emory.edu/"),
            ("/readux/", "/"),
        )
        # substitutions applied in order to a page target uri, after the
        # pid placeholder has been filled in
        page_rewrites = (
            ("readux:", "emory:"),
            ("readux%3A", "emory:"),
            ("/readux/", "/"),
            ("webprd001.library.emory.", "testreadux.ecds.emory."),
        )

        # Get a file logger
        filename = "ecds/" + str(datetime.datetime.now().strftime("%I-%M-%S %B-%d-%Y")) + ".csv"
        # the with block guarantees the report is closed even if a lookup
        # or write raises part-way through
        with open(filename, 'w+') as f:
            # report all books
            f.write(found_msg + "\n")

            # report titles
            f.write("TYPE,PID,NOID,O_URI,N_URI,PAGE,POST_URI,\n")

            # go over all books
            for vol in vol_list:
                volDobj = repo.get_object(vol.pid.rstrip(), type=ScannedVolume)

                # get attributes
                pid = volDobj.pid
                noid = pid.split(":")[1]
                try:
                    pidmanObj = client.get_pid("ark", noid)
                except Exception as e:
                    # end the error text with a newline so it does not run
                    # into the next record
                    f.write(str(pid) + "\n" + str(e) + "\n")
                    continue  # continue to the next item
                oriTargetUri = pidmanObj["targets"][0]["target_uri"]
                newTargetUri = oriTargetUri
                for old, new in volume_rewrites:
                    newTargetUri = newTargetUri.replace(old, new)
                newTargetUri = unicode(newTargetUri)

                # number of pages is the same for the whole volume, so
                # compute it once instead of once per page
                page_total = len(volDobj.pageDObjs)

                # log attributes
                f.write(", ".join([
                    "BOOK", str(pid), str(noid),
                    str(oriTargetUri), str(newTargetUri), str(page_total),
                ]) + ", \n")

                # report attributes
                print("BOOK - " + str(pid) + " - " + str(page_total) + " pages")

                #TODO update target
                # if newTargetUri != oriTargetUri:
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
                #     updated_target_uri = response["target_uri"]
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
                #     updated_pdf_target_uri = response["target_uri"]
                #     f.write(str(updated_target_uri) + ", ")
                #     f.write(str(updated_pdf_target_uri) + ", ")

                # update pages; numbering starts at 1 and also counts
                # pages that are skipped because of a lookup error
                for page_count, page_pid in enumerate(volDobj.get_pages(), 1):
                    # Get all relevant attributes
                    page_noid = page_pid.split(":")[1]
                    try:
                        pidmanObj = client.get_pid("ark", page_noid)
                    except Exception as e:
                        f.write(str(page_pid) + "\n" + str(e) + "\n")
                        continue  # continue to the next item
                    oriTargetUri = pidmanObj["targets"][0]["target_uri"]
                    newTargetUri = unicode(oriTargetUri)

                    # fill in the url-encoded "readux:{%PID%}" placeholder
                    # with the real page pid before the generic rewrites
                    newTargetUri = newTargetUri.replace("readux%3A%7B%25PID%25%7D", page_pid)
                    for old, new in page_rewrites:
                        newTargetUri = newTargetUri.replace(old, new)

                    # Log attributes
                    f.write(", ".join([
                        "page", str(page_pid), str(page_noid),
                        str(oriTargetUri), str(newTargetUri), str(page_count),
                    ]) + ", ")

                    try:
                        print(str(page_count) + "/" + str(page_total) + " - " + str(page_noid) + " - page update")
                        #TODO update target
                        # if newTargetUri != oriTargetUri:
                        #     response = client.update_target(type="ark", noid=page_noid, target_uri=newTargetUri)
                        #     updated_target_uri = response["target_uri"]
                        #     response = client.update_target(type="ark", noid=page_noid, target_uri=newTargetUri, qualifier="PDF")
                        #     updated_pdf_target_uri = response["target_uri"]
                        #     f.write(str(page_noid) + " - page success" + ", ")
                        #     f.write(str(page_noid) + " - page pdf success" + ", ")
                    except Exception:
                        # Exception rather than a bare except, so Ctrl-C
                        # is not recorded as a page failure
                        print(str(page_count) + "/" + str(page_total) + " - " + str(page_noid) + " - page fail")
                        f.write(str(page_noid) + " - page fail" + ", ")

                    f.write("\n")

                f.write("\n")