def get_default_pid(self):
    """Mint an ARK via the PID manager and derive the Fedora pid from it.

    When the PID manager client is configured, a new ARK is requested,
    recorded in the object metadata (MODS identifiers when the object has
    a ``mods`` datastream, Dublin Core identifier otherwise) and its noid
    is combined with the configured default pidspace to form the pid.
    Without a PID manager the parent class' default pid logic is used.
    """
    # no pid manager available: fall back to the default pid behavior
    if pidman is None:
        return super(DigitalObject, self).get_default_pid()

    # pidman needs a target for the new pid; build the url for this
    # object's view with a placeholder token in place of the real pid
    token_pid = '%s:%s' % (self.default_pidspace, self.PID_TOKEN)
    view_path = reverse(self.NEW_OBJECT_VIEW, kwargs={'pid': token_pid})
    # reverse() percent-encodes the token and the colon, so undo that,
    # then expand the path to a full url with scheme and server
    target_url = absolutize_url(urllib.unquote(view_path))

    # a name is optional for pidman, but helps when managing pids
    ark_name = self.label
    # pidman answers with the full, resolvable ark uri
    ark_uri = pidman.create_ark(settings.PIDMAN_DOMAIN, target_url,
                                name=ark_name)

    # split the ark uri into its parts
    ark_parts = parse_ark(ark_uri)
    naan = ark_parts['naan']    # name authority number
    noid = ark_parts['noid']    # nice opaque identifier

    if not hasattr(self, 'mods'):
        # no mods datastream: keep the full ark uri in dc:identifier
        self.dc.content.identifier_list.append(ark_uri)
    else:
        # mods datastream: store short-form ark and full uri
        mods_identifiers = self.mods.content.identifiers
        mods_identifiers.extend([
            mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
            mods.Identifier(type='uri', text=ark_uri),
        ])

    # the noid becomes the pid within the configured pidspace
    return '%s:%s' % (self.default_pidspace, noid)
def get_default_pid(self):
    """Re-use an unused ARK for the new object when one is available.

    Unused pids are looked up in pidman by their placeholder target
    (``UNUSED_PID_URL``) and kept on ``self._unused_pid_result``; each
    call pops one entry, points it at this object (name and target) and
    stores the ARK in the object metadata.  When no unused pid is left,
    the parent class' default pid logic is used.

    :returns: pid in the configured default pidspace
    """
    # Create the client unconditionally: it is needed for the search
    # *and* for the updates below.  Binding it only inside the search
    # branch raised UnboundLocalError whenever cached results were
    # still available from an earlier call.
    pidman = DjangoPidmanRestClient()

    if not self._unused_pid_result:
        result = pidman.search_pids(target=UNUSED_PID_URL)
        # if any were found, use results
        if result and result['results_count']:
            self._unused_pid_result = result['results']

    # if we ran out of pids to re-use, fall back to default behavior
    if not self._unused_pid_result:
        return super(PidReuseDigitalObject, self).get_default_pid()

    # pop one unused pid off and use it
    pid_info = self._unused_pid_result.pop()
    ark = pid_info['targets'][0]['access_uri']
    parsed_ark = parse_ark(ark)
    naan = parsed_ark['naan']  # name authority number
    noid = parsed_ark['noid']  # nice opaque identifier

    # use noid as basis for new pid in the configured pidspace
    pid = '%s:%s' % (self.default_pidspace, noid)

    # calculate target to new object; reverse() returns a full path,
    # absolutize so we get scheme & server also
    # NOTE(review): the reversed url is not unquoted here; confirm that
    # reverse() does not percent-encode the ':' in the pid.
    target = absolutize_url(reverse(self.NEW_OBJECT_VIEW,
                                    kwargs={'pid': pid}))

    # update pid ark label from object
    pidman.update_ark(noid, name=self.label)
    # update default ark target for new object url
    pidman.update_ark_target(noid, target_uri=target, active=True)

    # if we have a mods datastream, store the ARK as mods:identifier
    if hasattr(self, 'mods'):
        # store full uri and short-form ark
        self.mods.content.identifiers.extend([
            mods.Identifier(type='ark', text='ark:/%s/%s' % (naan, noid)),
            mods.Identifier(type='uri', text=ark),
        ])

    # always add full uri ARK to dc:identifier
    self.dc.content.identifier_list.append(ark)

    return pid
def update_progress(self, results, results_count):
    """Report the objects in Pidman and summarize in a CSV.

    Pages through the pids of the Rushdie Pidman domain, looks each one
    up in Fedora and writes one row per pid to ``self.summary_log``.
    Unless ``self.is_dry_run`` is set, pids without a Fedora object are
    renamed / retargeted to the configured "unused" values, and pids with
    a Fedora object get their name and target uri aligned with Fedora.
    Any exception raised while handling a single pid is recorded in that
    pid's CSV row and processing continues with the next pid.

    :param results: results from pidman; only its
        ``max_results_per_page`` value is read, every page is fetched
        again while iterating
    :param results_count: total count of objects found within a collection
    """
    # update progress on the screen
    sys.stdout.write("%i objects in total.\n" % results_count)
    sys.stdout.flush()

    # initialize a progress bar following the Readux example
    pbar = ProgressBar(
        widgets=[Percentage(), ' (', Counter(), ')', Bar(), ETA()],
        maxval=results_count).start()

    max_results_per_page = results["max_results_per_page"]
    # pages are numbered from 1; +1 so range() includes the last page
    pages = int(math.ceil(results_count / float(max_results_per_page))) + 1
    current_count = 0
    run_mode = "dry-run" if self.is_dry_run else "actual-run"

    # iterate through all results fetched from pidman
    for page in range(1, pages):
        page_results = self.pidman.search_pids(
            domain_uri=settings.PIDMAN_RUSHDIE_DOMAIN, page=page)

        for page_result in page_results["results"]:
            # per-pid report values; stay None when an exception cuts
            # the processing of this pid short
            pm_object_pid = pm_label = pm_target_uri = None
            in_fedora = fedora_label = fedora_create_time_stamp = None
            supposed_label = supposed_target_uri = None
            status_label = run_mode
            exception_label = "no-exception"

            try:
                pm_object_pid = "emory:" + page_result["pid"]
                pm_object_noid = page_result["pid"]
                pm_label = page_result["name"]
                pm_target_uri = page_result["targets"][0]["target_uri"]

                fedora_object = self.repo.get_object(pm_object_pid)
                # query Fedora once and reuse the answer
                in_fedora = bool(fedora_object.exists)

                if not in_fedora:
                    # fedora object doesn't exist: mark the pid as unused
                    # and point it at the generic unused target uri
                    if not self.is_dry_run:
                        pid_response = self.pidman.update_pid(
                            type="ark", noid=pm_object_noid,
                            name=settings.PIDMAN_RUSHDIE_UNUSED)
                        target_response = self.pidman.update_target(
                            type="ark", noid=pm_object_noid,
                            target_uri=settings.PIDMAN_RUSHDIE_UNUSED_URI)
                        name_ok = (pid_response["name"] ==
                                   settings.PIDMAN_RUSHDIE_UNUSED)
                        target_ok = (target_response["target_uri"] ==
                                     settings.PIDMAN_RUSHDIE_UNUSED_URI)
                        if name_ok and target_ok:
                            status_label += ", unused-pid-updated"
                        else:
                            status_label += ", unused-pid-update-failed"

                    # supposed label and target_uri
                    supposed_label = settings.PIDMAN_RUSHDIE_UNUSED
                    supposed_target_uri = settings.PIDMAN_RUSHDIE_UNUSED_URI
                else:
                    # fedora object exists: label and target uri should
                    # match Fedora.  The label is read on dry runs too so
                    # the CSV column is populated in both modes.
                    fedora_label = fedora_object.label
                    if (not self.is_dry_run and pm_label != fedora_label
                            and fedora_label is not None):
                        response = self.pidman.update_pid(
                            type="ark", noid=pm_object_noid,
                            name=fedora_label)
                        if response["name"] == fedora_label:
                            status_label += ", label-updated"
                        else:
                            status_label += ", label-update-failed"

                    # create the target_uri using the logic that is used
                    # in creating objects from TheKeep (computed once and
                    # shared by the update and the report)
                    keep_target = reverse(
                        fedora_object.NEW_OBJECT_VIEW,
                        kwargs={'pid': fedora_object.pid})
                    keep_target = urllib.unquote(keep_target)
                    keep_target_uri = absolutize_url(keep_target)

                    if not self.is_dry_run and pm_target_uri != keep_target_uri:
                        response = self.pidman.update_target(
                            type="ark", noid=pm_object_noid,
                            target_uri=keep_target_uri)
                        if keep_target_uri == response["target_uri"]:
                            status_label += ", target_uri-updated"
                        else:
                            status_label += ", target_uri-update-failed"

                    # supposed label and target_uri
                    supposed_label = fedora_label
                    supposed_target_uri = keep_target_uri
                    fedora_create_time_stamp = fedora_object.created.strftime(
                        "%Y-%m-%d %H:%M:%S")
            except Exception as e:
                # best effort: record the failure and keep going
                exception_label = "Exception: %s" % str(e)

            self.summary_log.writerow((
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                status_label,
                pm_object_pid,
                pm_label,
                pm_target_uri,
                supposed_label,
                supposed_target_uri,
                str(in_fedora),
                fedora_label,
                fedora_create_time_stamp,
                exception_label))

            current_count += 1
            # update progress
            pbar.update(current_count)

    # close out the progress bar once every page has been processed
    pbar.finish()
def get_new_pid(self, obj):
    """Return a pid for ``obj``, re-assigning an unused rushdie pid if any.

    Searches pidman for pids in the rushdie domain that still point at
    the "unused" target.  If at least one is found, the first one is
    renamed and retargeted for ``obj``, its ARK is stored in the object
    metadata and the resulting pid is returned; otherwise the object's
    own ``get_default_pid`` is used.
    """
    # TODO: first, make sure object label is set appropriately before
    # minting new pid or updating an existing one

    # look for rushdie pids that are available for re-assignment
    search_result = pidman.search_pids(
        domain_uri=settings.PIDMAN_RUSHDIE_DOMAIN,
        target=settings.PIDMAN_RUSHDIE_UNUSED_URI)
    total_found = search_result.get('results_count', 0)
    logger.debug('Found %d unused rushdie pids' % total_found)

    if not total_found:
        # TEST this: can we use default get next pid for arrangement
        # objects (including email)?
        return obj.get_default_pid()

    # take the first unused pid
    reusable = search_result['results'][0]
    noid = reusable['pid']
    plural = '' if total_found == 1 else 's'
    print('Found %d unused rushdie pid%s, using %s' %
          (total_found, plural, noid))

    # make the ark name match the current object
    pidman.update_ark(noid=noid, name=obj.label)

    # build the keep url for this object with the same logic that
    # keep.common.fedora uses when minting new pids
    pid = ':'.join([obj.default_pidspace, noid])
    view_path = reverse(obj.NEW_OBJECT_VIEW, kwargs={'pid': pid})
    # reverse() encodes the PID_TOKEN and the :, so unquote the url
    # (shouldn't contain anything else that needs escaping), then
    # absolutize it to include the configured keep domain
    keep_url = absolutize_url(urllib.unquote(view_path))
    # point the existing pid at the new Keep url and make sure it is active
    pidman.update_ark_target(noid=noid, target_uri=keep_url, active=True)

    ark_uri = reusable['targets'][0]['access_uri']
    naan = parse_ark(ark_uri)['naan']   # name authority number
    # short form of ark identifier
    short_ark = 'ark:/%s/%s' % (naan, noid)

    # NOTE: adding to the old object metadata is of limited use, since
    # the old object will not be saved in the migration, but it gives
    # convenient access to ark and ark_uri
    # (this logic is duplicated from the base get_default_pid method)
    if not hasattr(obj, 'mods'):
        # no mods datastream: add full uri ARK to dc:identifier
        obj.dc.content.identifier_list.append(ark_uri)
    else:
        # mods datastream: store short-form ark and full uri
        mods_identifiers = obj.mods.content.identifiers
        mods_identifiers.extend([
            mods.Identifier(type='ark', text=short_ark),
            mods.Identifier(type='uri', text=ark_uri),
        ])

    # the pid to be used
    return pid
def test_domain_with_scheme(self):
    # site domain already carries a scheme; the absolutized url is
    # expected to be exactly domain + path
    site = self.site
    site.domain = 'http://example.com'
    site.save()

    absolute_url = absolutize_url('/foo/')
    self.assertEqual('http://example.com/foo/', absolute_url)
def update_progress(self, object_class, content_model_name, total_count):
    """Update the objects in Pidman and report progress back to the user.

    For every object with the first content model of ``object_class``
    the Pidman name and target uri are compared with the Fedora label
    and the Keep url.  Unless ``self.is_dry_run`` is set, a differing
    Pidman name is updated to the Fedora label.  One row per object is
    written to ``self.summary_log``; objects that could not be compared
    additionally get an error log file under ``self.error_path``.

    :param object_class: the class of a object collection
    :param content_model_name: a human readable name for the content
        model/objects
    :param total_count: total count of objects found within a collection
    :type content_model_name: str
    :type total_count: number
    """
    # counters: objects needing a change vs. objects already in sync
    change_count = 0
    nochange_count = 0

    # update progress on the screen
    sys.stdout.write("Starting %s task. %i objects in total.\n" %
                     (content_model_name, total_count))
    sys.stdout.flush()

    # bind a handler for interrupt signal
    signal.signal(signal.SIGINT, self.interrupt_handler)

    # initialize a progress bar following the Readux example
    pbar = ProgressBar(
        widgets=[Percentage(), ' (', Counter(), ')', Bar(), ETA()],
        maxval=total_count).start()

    # use generator to process each object
    object_uris = self.repo.risearch.get_subjects(
        modelns.hasModel, object_class.CONTENT_MODELS[0])

    for object_uri in object_uris:
        # per-object report values; stay None when a lookup fails
        pidman_result = None
        digital_object_label = pidman_label = None
        pidman_target_uri = keep_target_uri = None
        mismatch = status_label = None
        label_updated = False
        exception_string = ""
        has_exception = False

        digital_object = self.repo.get_object(object_uri, object_class)
        digital_object_pid = digital_object.pid

        try:
            pidman_result = self.pidman.search_pids(
                domain_uri=settings.PIDMAN_DOMAIN, pid=digital_object.noid)
        except Exception as e:
            has_exception = True
            exception_string += (
                "Object %s is not found in Pidman. Error message: %s \n"
                % (digital_object_pid, str(e)))

        # the remaining lookups only make sense with a search result
        if not has_exception:
            try:
                pidman_label = pidman_result["results"][0].get("name", None)
            except KeyError:
                has_exception = True
                exception_string += (
                    "Pidman object %s doesn't exist or doesn't have a "
                    "valid label." % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Pidman object %s is not accessible. Error message: %s"
                    % (digital_object_pid, str(e)))

            try:
                digital_object_label = digital_object.label
            except AttributeError:
                has_exception = True
                exception_string += (
                    "Fedora object %s doesn't have a label attribute."
                    % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Fedora object %s is not accessible. "
                    "Error message: %s \n" % (digital_object_pid, str(e)))

            try:
                pidman_target_uri = pidman_result["results"][0][
                    "targets"][0].get("target_uri", None)
            except KeyError:
                has_exception = True
                exception_string += (
                    "Pidman object %s doesn't exist or doesn't have a "
                    "valid target_uri." % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Pidman object %s is not accessible. Error message: %s"
                    % (digital_object_pid, str(e)))

            try:
                # create the target_uri using the logic that is used in
                # creating objects from TheKeep
                keep_target = reverse(digital_object.NEW_OBJECT_VIEW,
                                      kwargs={'pid': digital_object.pid})
                keep_target = urllib.unquote(keep_target)
                keep_target_uri = absolutize_url(keep_target)
            except Exception as e:
                has_exception = True
                # two placeholders for two values: the one-placeholder
                # format used before raised TypeError inside this handler
                exception_string += (
                    "Keep target_uri creation failed for %s. "
                    "Error message: %s" % (digital_object_pid, str(e)))

            try:
                if has_exception:
                    mismatch = ""
                else:
                    uri_differs = pidman_target_uri != keep_target_uri
                    label_differs = pidman_label != digital_object_label
                    if uri_differs and label_differs:
                        mismatch = "label&uri"
                    elif uri_differs:
                        mismatch = "uri"
                    elif label_differs:
                        mismatch = "label"
                    else:
                        mismatch = "No"
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Either Pidman or Keep target_uri doesn't exist for "
                    "%s. Error message: %s" % (digital_object_pid, str(e)))

        if not has_exception:
            names_differ = pidman_label != digital_object_label

            # execute irreversible update when the dry run flag is not
            # set -- be cautious
            if not self.is_dry_run:
                try:
                    if names_differ and digital_object_label is not None:
                        response = self.pidman.update_pid(
                            type="ark", noid=digital_object.noid,
                            name=digital_object_label)
                        # only count the update when pidman echoes the
                        # new name back
                        label_updated = (
                            response["name"] == digital_object_label)
                except Exception as e:
                    has_exception = True
                    exception_string += (
                        "Pidman object %s is not updated. Error message: %s"
                        % (digital_object_pid, str(e)))

            if names_differ:
                change_count += 1
                # an explicit flag avoids reporting "changed" when the
                # Fedora label is None and no update was attempted
                status_label = "changed" if label_updated else "change-needed"
            else:
                nochange_count += 1
                status_label = "ok"
        else:
            # log the failure in a file; the context manager closes it
            # even if the write fails
            error_file_path = "%s/%s.log" % (self.error_path,
                                             digital_object.noid)
            with open(error_file_path, 'w+') as error_log:
                error_log.write(
                    '[TIME]: %s, [CONTENT_MODEL]: %s, [PID]: %s\n %s \n' % (
                        time.strftime("%Y%m%d %H:%M:%S", time.localtime()),
                        content_model_name,
                        digital_object.noid,
                        exception_string))
            status_label = "error"

        # write to CSV
        self.summary_log.writerow((
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            status_label,
            content_model_name,
            digital_object_pid,
            digital_object_label,
            pidman_label,
            pidman_target_uri,
            keep_target_uri,
            mismatch,
            exception_string))

        # update progress
        pbar.update(change_count + nochange_count)

        # stop early when the interrupt handler has fired
        if self.interrupted:
            break

    # update finish when all tasks are completed
    if not self.interrupted:
        pbar.finish()

    # write statistics
    self.stdout.write("Total objects: %i \n" % total_count)
    self.stdout.write(
        "No change: %i | Change required: %i | Failed (see logs): %i\n"
        % (nochange_count, change_count,
           (total_count - nochange_count - change_count)))
def update_progress(self, object_class, content_model_name, total_count):
    """Update the objects in Pidman and report progress back to the user.

    For every object with the first content model of ``object_class``
    the Pidman name and target uri are compared with the Fedora label
    and the Keep url.  Unless ``self.is_dry_run`` is set, a differing
    Pidman name is updated to the Fedora label.  One row per object is
    written to ``self.summary_log``; objects that could not be compared
    additionally get an error log file under ``self.error_path``.

    :param object_class: the class of a object collection
    :param content_model_name: a human readable name for the content
        model/objects
    :param total_count: total count of objects found within a collection
    :type content_model_name: str
    :type total_count: number
    """
    # counters: objects needing a change vs. objects already in sync
    change_count = 0
    nochange_count = 0

    # update progress on the screen
    sys.stdout.write("Starting %s task. %i objects in total.\n" %
                     (content_model_name, total_count))
    sys.stdout.flush()

    # bind a handler for interrupt signal
    signal.signal(signal.SIGINT, self.interrupt_handler)

    # initialize a progress bar following the Readux example
    pbar = ProgressBar(
        widgets=[Percentage(), ' (', Counter(), ')', Bar(), ETA()],
        maxval=total_count).start()

    # use generator to process each object
    object_uris = self.repo.risearch.get_subjects(
        modelns.hasModel, object_class.CONTENT_MODELS[0])

    for object_uri in object_uris:
        # per-object report values; stay None when a lookup fails
        pidman_result = None
        digital_object_label = pidman_label = None
        pidman_target_uri = keep_target_uri = None
        mismatch = status_label = None
        label_updated = False
        exception_string = ""
        has_exception = False

        digital_object = self.repo.get_object(object_uri, object_class)
        digital_object_pid = digital_object.pid

        try:
            pidman_result = self.pidman.search_pids(
                domain_uri=settings.PIDMAN_DOMAIN, pid=digital_object.noid)
        except Exception as e:
            has_exception = True
            exception_string += (
                "Object %s is not found in Pidman. Error message: %s \n"
                % (digital_object_pid, str(e)))

        # the remaining lookups only make sense with a search result
        if not has_exception:
            try:
                pidman_label = pidman_result["results"][0].get("name", None)
            except KeyError:
                has_exception = True
                exception_string += (
                    "Pidman object %s doesn't exist or doesn't have a "
                    "valid label." % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Pidman object %s is not accessible. Error message: %s"
                    % (digital_object_pid, str(e)))

            try:
                digital_object_label = digital_object.label
            except AttributeError:
                has_exception = True
                exception_string += (
                    "Fedora object %s doesn't have a label attribute."
                    % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Fedora object %s is not accessible. "
                    "Error message: %s \n" % (digital_object_pid, str(e)))

            try:
                pidman_target_uri = pidman_result["results"][0][
                    "targets"][0].get("target_uri", None)
            except KeyError:
                has_exception = True
                exception_string += (
                    "Pidman object %s doesn't exist or doesn't have a "
                    "valid target_uri." % digital_object_pid)
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Pidman object %s is not accessible. Error message: %s"
                    % (digital_object_pid, str(e)))

            try:
                # create the target_uri using the logic that is used in
                # creating objects from TheKeep
                keep_target = reverse(digital_object.NEW_OBJECT_VIEW,
                                      kwargs={'pid': digital_object.pid})
                keep_target = urllib.unquote(keep_target)
                keep_target_uri = absolutize_url(keep_target)
            except Exception as e:
                has_exception = True
                # two placeholders for two values: the one-placeholder
                # format used before raised TypeError inside this handler
                exception_string += (
                    "Keep target_uri creation failed for %s. "
                    "Error message: %s" % (digital_object_pid, str(e)))

            try:
                if has_exception:
                    mismatch = ""
                else:
                    uri_differs = pidman_target_uri != keep_target_uri
                    label_differs = pidman_label != digital_object_label
                    if uri_differs and label_differs:
                        mismatch = "label&uri"
                    elif uri_differs:
                        mismatch = "uri"
                    elif label_differs:
                        mismatch = "label"
                    else:
                        mismatch = "No"
            except Exception as e:
                has_exception = True
                exception_string += (
                    "Either Pidman or Keep target_uri doesn't exist for "
                    "%s. Error message: %s" % (digital_object_pid, str(e)))

        if not has_exception:
            names_differ = pidman_label != digital_object_label

            # execute irreversible update when the dry run flag is not
            # set -- be cautious
            if not self.is_dry_run:
                try:
                    if names_differ and digital_object_label is not None:
                        response = self.pidman.update_pid(
                            type="ark", noid=digital_object.noid,
                            name=digital_object_label)
                        # only count the update when pidman echoes the
                        # new name back
                        label_updated = (
                            response["name"] == digital_object_label)
                except Exception as e:
                    has_exception = True
                    exception_string += (
                        "Pidman object %s is not updated. Error message: %s"
                        % (digital_object_pid, str(e)))

            if names_differ:
                change_count += 1
                # an explicit flag avoids reporting "changed" when the
                # Fedora label is None and no update was attempted
                status_label = "changed" if label_updated else "change-needed"
            else:
                nochange_count += 1
                status_label = "ok"
        else:
            # log the failure in a file; the context manager closes it
            # even if the write fails
            error_file_path = "%s/%s.log" % (self.error_path,
                                             digital_object.noid)
            with open(error_file_path, 'w+') as error_log:
                error_log.write(
                    '[TIME]: %s, [CONTENT_MODEL]: %s, [PID]: %s\n %s \n' % (
                        time.strftime("%Y%m%d %H:%M:%S", time.localtime()),
                        content_model_name,
                        digital_object.noid,
                        exception_string))
            status_label = "error"

        # write to CSV
        self.summary_log.writerow((
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            status_label,
            content_model_name,
            digital_object_pid,
            digital_object_label,
            pidman_label,
            pidman_target_uri,
            keep_target_uri,
            mismatch,
            exception_string))

        # update progress
        pbar.update(change_count + nochange_count)

        # stop early when the interrupt handler has fired
        if self.interrupted:
            break

    # update finish when all tasks are completed
    if not self.interrupted:
        pbar.finish()

    # write statistics
    self.stdout.write("Total objects: %i \n" % total_count)
    self.stdout.write(
        "No change: %i | Change required: %i | Failed (see logs): %i\n"
        % (nochange_count, change_count,
           (total_count - nochange_count - change_count)))