def main(argv): repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass) philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel') phil_doc = open('phil_doc.csv', 'w') image_ids = [] d = defaultdict(int) for p in philologic_pids: philologic = p.getDatastreamObject('OBJ').content substring = 'ARTFL-figure-missing' if substring in philologic: print 'Processing %s' % p images = [] image_count = 0 url = '%s/fedora/objects/%s/datastreams/OBJ/content' % (HOST, p) passwordManager = urllib2.HTTPPasswordMgrWithDefaultRealm() fedoraAdmin = "%s/fedora" % HOST passwordManager.add_password(None, fedoraAdmin, fedoraUser, fedoraPass) handler = urllib2.HTTPBasicAuthHandler(passwordManager) fedoraOpener = urllib2.build_opener(handler) soup = BeautifulSoup(fedoraOpener.open(url), 'html.parser') spans = soup.find_all('span', 'ARTFL-figure-missing') for span in spans: image = span['sysid'] images.append(image) image_count+= 1 image_ids.extend(images) images_string = ';'.join(images) phil_doc.write('%s,%s,%s\n' % (p, image_count, images_string)) print 'Successfully processed %s' % p for i in image_ids: d[i] += 1 with open('phil_image.csv', 'w') as outfile: phil_image = csv.writer(outfile) for key, value in d.items(): phil_image.writerow([key, value]) phil_doc.close()
def main(argv): # Make Fedora connection repo = Repository(root='http://localhost:8080/fedora/', username='******', password='******') # Retreive pids using content model philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel') # Loop through Philologic pids and retreive each object for p in philologic_pids: print 'Processing %s' % p # Extract the text philologic = p.getDatastreamObject('OBJ').content text=strip_tags(philologic) # Add FULL_TEXT full_text = p.getDatastreamObject('FULL_TEXT') full_text.label='Full text' full_text.mimetype='text/plain' full_text.versionable=True full_text.state='A' full_text.checksum_type='MD5' full_text.content = text full_text.save()
class Command(BaseCommand): ''' This command run through all the articles and makes sure that journal titles and publishers match against Sherpa Romeo ''' args = "[netid netid ...]" help = __doc__ option_list = BaseCommand.option_list + ( make_option('--noact', '-n', action='store_true', default=False, help='Fixed all caps title in articles'), ) def handle(self, *args, **options): self.verbosity = int(options['verbosity']) # 1 = normal, 0 = minimal, 2 = all self.v_normal = 1 #connection to repository self.repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_PASSWORD) pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication) coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection']) try: articles = Paginator(pid_set, 100) except Exception as e: self.output(0, "Error paginating items: : %s " % (e.message)) #process all Articles for p in articles.page_range: try: objs = articles.page(p).object_list except Exception as e: #print error and go to next iteration of loop self.output(0,"Error getting page: %s : %s " % (p, e.message)) continue for article in objs: try: if not article.exists: self.output(0, "Skipping %s because pid does not exist" % article.pid) continue else: print coll print article.pid article.collection = coll ark_uri = '%sark:/25593/%s' % (settings.PIDMAN_HOST, article.pid.split(':')[1]) article.dc.content.identifier_list.extend([ark_uri]) article.save() except Exception as e: self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message)) # self.counts['errors'] +=1 def output(self, v, msg): '''simple function to handle logging output based on verbosity''' if self.verbosity >= v: self.stdout.write("%s\n" % msg)
def all():
    """
    Returns all collections in the repository as
    :class:`~genrepo.collection.models.CollectionObject`
    """
    # hand the risearch results straight back to the caller
    return Repository().get_objects_with_cmodel(
        CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
def all():
    """
    Returns all collections in the repository as
    :class:`~genrepo.collection.models.CollectionObject`
    """
    repository = Repository()
    collections = repository.get_objects_with_cmodel(
        CollectionObject.COLLECTION_CONTENT_MODEL,
        type=CollectionObject)
    return collections
def handle(self, *args, **kwargs): verbosity = kwargs.get('verbosity', self.v_normal) # pids specified on command-line take precedence pids = kwargs.get('pids', []) repo = Repository() # if no pids were specified, find all AFFs if not pids: objs = repo.get_objects_with_cmodel(DiskImage.DISKIMAGE_CONTENT_MODEL, type=DiskImage) for obj in objs: # objects found by risearch *should* exist, but # just in case of discrepancies (hopefully only in QA), # ignore non-existent objects if not obj.exists: self.stderr.write(self.style.WARNING('%s does not exist' % obj.pid)) continue # check premis for to find Disk Images in AFF format; # exclude any that have already been migrated if obj.provenance.exists: premis = obj.provenance.content if premis.object and premis.object.format \ and premis.object.format.name == 'AFF' \ and not obj.migrated: pids.append(obj.pid) # create a celery result set and queue conversion of each pid requested # or found in fedora migration_tasks = celery.result.ResultSet([]) for pid in pids: migration_tasks.add(migrate_aff_diskimage.delay(pid)) # wait for tasks to complete while migration_tasks.waiting(): try: migration_tasks.join() except Exception: # exceptions from tasks gets propagated here, but ignore # them and report based on success/failure pass print '%d migrations completed, %s failures' % \ (migration_tasks.completed_count(), 'some' if migration_tasks.failed() else 'no') for result in migration_tasks.results: if result.state == celery.states.FAILURE: print 'Error: %s' % result.result else: print 'Success: %s' % result.result
def handle(self, *args, **kwargs): verbosity = kwargs.get('verbosity', self.v_normal) repo = Repository() objs = repo.get_objects_with_cmodel(DiskImage.DISKIMAGE_CONTENT_MODEL, type=DiskImage) for obj in objs: img_fmt = None # use premis object format to distinguish AD1 disk images if obj.provenance.exists: premis = obj.provenance.content if premis.object and premis.object.format: img_fmt = premis.object.format.name if img_fmt == 'AD1': print '%s %s' % (obj.pid, obj.content.label) if img_fmt is None and verbosity >= self.v_normal: self.stderr.write('Warning: %s has no premis object format' % obj.pid)
def handle(self, *args, **options):
    """Add articles to the oe-collection, set OAI itemIDs on published
    articles, and normalize DC namespaces.

    Positional args are treated as pids; with none given, all Articles are
    processed. ``--noact`` suppresses saving. A per-category summary is
    written at the end.
    """
    self.verbosity = int(
        options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    #counters
    counts = defaultdict(int)

    # check required options
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            # prompt interactively so the password never appears in shell history
            options['password'] = getpass()

    #connection to repository
    repo = Repository(username=options['username'], password=options['password'])
    coll = repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

    #if pids specified, use that list
    try:
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            #search for Articles.
            pid_set = repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        # paginate to avoid holding every object at once
        articles = Paginator(pid_set, 20)
        counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(
                        1, "Skipping %s because pid does not exist" % article.pid)
                    counts['skipped'] += 1
                    continue
                else:
                    self.output(0, "Processing %s" % article.pid)
                    # Add to collection
                    article.collection = coll
                    self.output(
                        1, "Adding %s to collection %s" % (article.pid, coll.pid))
                    counts['collection'] += 1
                    # Add itemID for OAI (published articles only)
                    if article.is_published:
                        article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                        self.output(1, "Adding itemID to %s" % article.pid)
                        counts['itemId'] += 1
                    # Modify DB NS
                    article._prep_dc_for_oai()
                    self.output(
                        1, "Modified DC namespaces for %s" % (article.pid))
                    counts['DC'] += 1
                    # save article unless this is a dry run
                    if not options['noact']:
                        article.save()
            except Exception as e:
                self.output(
                    0, "Error processing pid: %s : %s " % (article.pid, e.message))
                counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Added to collection: %s\n" % counts['collection'])
    self.stdout.write("Added itemID: %s\n" % counts['itemId'])
    self.stdout.write("Modified DC NS: %s\n" % counts['DC'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def handle(self, *args, **options):
    """Set the content mimetype of Video objects to 'video/quicktime'.

    Pids are read one-per-line from the required ``--file`` option; with an
    empty file, all Video objects are processed. ``--noact`` suppresses
    saving. A summary of counts is written at the end.
    """
    #counters
    counts = defaultdict(int)

    # check required options
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            # prompt interactively so the password never appears in shell history
            options['password'] = getpass()
    if not options['file']:
        raise CommandError('File is required')
    with open(options['file'], 'r') as myfile:
        data = myfile.read().splitlines()

    #connection to repository
    repo = Repository(username=options['username'], password=options['password'])

    try:
        #if pids specified, use that list
        if len(data) != 0:
            pids = list(data)
            pid_set = [repo.get_object(pid=p, type=Video) for p in pids]
        else:
            #search for Articles
            pid_set = repo.get_objects_with_cmodel(Video.VIDEO_CONTENT_MODEL, Video)
    except Exception as e:
        raise CommandError('Error gettings pids (%s)' % e.message)

    try:
        # paginate to avoid holding every object at once
        objects = Paginator(pid_set, 20)
        counts['total'] = objects.count
    except Exception as e:
        self.output("Error paginating items: : %s " % (e.message))

    #process all Objects
    for p in objects.page_range:
        try:
            objs = objects.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output("Error getting page: %s : %s " % (p, e.message))
            counts['errors'] += 1
            continue
        for a in objs:
            try:
                if not a.exists:
                    self.output("Skipping %s because pid does not exist" % a.pid)
                    counts['skipped'] += 1
                    continue
                else:
                    self.output("Processing %s" % a.pid)
                    a.content.mimetype = 'video/quicktime'
                    # save object unless this is a dry run
                    if not options['noact']:
                        a.save("cleanup mimetype")
                        self.output("SAVED %s" % a.pid)
                        counts['saved'] += 1
                    counts['processed'] += 1
            except Exception as e:
                self.output("Error processing pid: %s : %s " % (a.pid, e.message))
                counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Total number processed: %s\n" % counts['processed'])
    self.stdout.write("Total number saved: %s\n" % counts['saved'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def handle(self, *args, **options):
    """Add articles to the oe-collection, set OAI itemIDs on published
    articles, and normalize DC namespaces.

    Positional args are treated as pids; with none given, all Articles are
    processed. ``--noact`` suppresses saving. A per-category summary is
    written at the end.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    #counters
    counts = defaultdict(int)

    # check required options
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            # prompt interactively so the password never appears in shell history
            options['password'] = getpass()

    #connection to repository
    repo = Repository(username=options['username'], password=options['password'])
    coll = repo.get_object(pid=settings.PID_ALIASES['oe-collection'])

    #if pids specified, use that list
    try:
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            #search for Articles.
            pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        # paginate to avoid holding every object at once
        articles = Paginator(pid_set, 20)
        counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(1, "Skipping %s because pid does not exist" % article.pid)
                    counts['skipped'] += 1
                    continue
                else:
                    self.output(0, "Processing %s" % article.pid)
                    # Add to collection
                    article.collection = coll
                    self.output(1, "Adding %s to collection %s" % (article.pid, coll.pid))
                    counts['collection'] += 1
                    # Add itemID for OAI (published articles only)
                    if article.is_published:
                        article.oai_itemID = "oai:ark:/25593/%s" % article.noid
                        self.output(1, "Adding itemID to %s" % article.pid)
                        counts['itemId'] += 1
                    # Modify DB NS
                    article._prep_dc_for_oai()
                    self.output(1, "Modified DC namespaces for %s" % (article.pid))
                    counts['DC'] += 1
                    # save article unless this is a dry run
                    if not options['noact']:
                        article.save()
            except Exception as e:
                self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Added to collection: %s\n" % counts['collection'])
    self.stdout.write("Added itemID: %s\n" % counts['itemId'])
    self.stdout.write("Modified DC NS: %s\n" % counts['DC'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def handle(self, *pids, **options):
    """Audit pidman ark targets for every ScannedVolume and its pages.

    For each book and page, fetch the current pidman target uri, compute a
    rewritten uri (readux/webprd001 host and namespace fixes), and log
    original vs. new uri to a timestamped CSV under ecds/. The actual
    pidman updates are currently commented out (report-only run).
    """
    # testPid
    # settings.PIDMAN_HOST = 'https://testpid.library.emory.edu/' # the web root where we'll ask for pids
    # settings.PIDMAN_USER = ''
    # settings.PIDMAN_PASSWORD = ''
    # settings.PIDMAN_DOMAIN = 'https://testpid.library.emory.edu/domains/18/' # default domain (e.g. when minting pids)
    # prodPid
    # PIDMAN_HOST = 'https://pidqas.library.emory.edu/'

    # get a pidman client
    client = DjangoPidmanRestClient()

    # testFedora
    repo = Repository(settings.FEDORA_ROOT, username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_MANAGEMENT_PASSWORD)
    # prodFedora
    #repo = Repository('https://fedora.library.emory.edu:8443/fedora/', username='******', password='******')

    # constants
    REPOMGMT = Namespace(rdflib.URIRef('info:fedora/fedora-system:def/relations-external#'))
    vol_list = repo.get_objects_with_cmodel('info:fedora/emory-control:ScannedVolume-1.0')
    print "Found " + str(len(vol_list)) + " books."

    # Get a file logger (timestamped CSV report)
    filename = "ecds/" + str(datetime.datetime.now().strftime("%I-%M-%S %B-%d-%Y")) + ".csv"
    f = open(filename, 'w+')

    # report all books
    f.write("Found " + str(len(vol_list)) + " books.")
    f.write("\n")

    # report titles (CSV column headers)
    f.write("TYPE,")
    f.write("PID,")
    f.write("NOID,")
    f.write("O_URI,")
    f.write("N_URI,")
    f.write("PAGE,")
    f.write("POST_URI,")
    # f.write("POST_PDF_URI,")
    f.write("\n")

    # go over all books
    for vol in vol_list:
        volDobj = repo.get_object(vol.pid.rstrip(), type=ScannedVolume)
        # get attributes
        pid = volDobj.pid
        noid = pid.split(":")[1]
        try:
            pidmanObj = client.get_pid("ark", noid)
        except Exception as e:
            # no pidman record: log the pid and the error, then move on
            f.write(str(pid))
            f.write("\n")
            f.write(str(e))
            continue  # continue to the next item
        oriTargetUri = pidmanObj["targets"][0]["target_uri"]
        newTargetUri = oriTargetUri
        # NOTE: replacement order matters -- encoded forms are handled before
        # their decoded equivalents, and the longer host+path before host alone.
        # if it has emory%3A
        if newTargetUri.find("emory%3A") != -1:
            newTargetUri = newTargetUri.replace("emory%3A", "emory:")
        # if it has readux%3A
        if newTargetUri.find("readux%3A") != -1:
            newTargetUri = newTargetUri.replace("readux%3A", "emory:")
        # if it has readux:
        if newTargetUri.find("readux:") != -1:
            newTargetUri = newTargetUri.replace("readux:", "emory:")
        # if it has webprd001.library.emory.edu/readux
        if newTargetUri.find("webprd001.library.emory.edu/readux") != -1:
            newTargetUri = newTargetUri.replace("webprd001.library.emory.edu/readux", "testreadux.ecds.emory.edu")
        # if it has webprd001.library.emory.edu
        if newTargetUri.find("webprd001.library.emory.edu/") != -1:
            newTargetUri = newTargetUri.replace("webprd001.library.emory.edu/", "testreadux.ecds.emory.edu/")
        # if it has /readux/
        if newTargetUri.find("/readux/") != -1:
            newTargetUri = newTargetUri.replace("/readux/", "/")
        newTargetUri = unicode(newTargetUri)

        # log attributes
        f.write("BOOK" + ", ")
        f.write(str(pid) + ", ")
        f.write(str(noid) + ", ")
        f.write(str(oriTargetUri) + ", ")
        f.write(str(newTargetUri) + ", ")
        f.write(str(len(volDobj.pageDObjs)) + ", ")
        f.write("\n")
        # report attributes
        print("BOOK - " + str(pid) + " - " + str(len(volDobj.pageDObjs)) + " pages")
        #TODO update target
        # if newTargetUri != oriTargetUri:
        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
        #     updated_target_uri = response["target_uri"]
        #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
        #     updated_pdf_target_uri = response["target_uri"]
        #     f.write(str(updated_target_uri) + ", ")
        #     f.write(str(updated_pdf_target_uri) + ", ")

        # update pages
        page_count = 0
        for p in volDobj.get_pages():
            page_count = page_count + 1
            # Get all relevant attributes
            pid = p
            noid = pid.split(":")[1]
            try:
                pidmanObj = client.get_pid("ark", noid)
            except Exception as e:
                # no pidman record: log the pid and the error, then move on
                f.write(str(pid))
                f.write("\n")
                f.write(str(e))
                continue  # continue to the next item
            oriTargetUri = pidmanObj["targets"][0]["target_uri"]
            newTargetUri = unicode(oriTargetUri)
            # if it has readux%3A placeholder for the pid
            if newTargetUri.find("readux%3A%7B%25PID%25%7D") != -1:
                newTargetUri = newTargetUri.replace("readux%3A%7B%25PID%25%7D", pid)
            # if it has readux:abc1234
            if newTargetUri.find("readux:") != -1:
                newTargetUri = newTargetUri.replace("readux:", "emory:")
            # if it has readux%3A
            if newTargetUri.find("readux%3A") != -1:
                newTargetUri = newTargetUri.replace("readux%3A", "emory:")
            # if it has /readux/
            if newTargetUri.find("/readux/") != -1:
                newTargetUri = newTargetUri.replace("/readux/", "/")
            # if it has webprd001.library.
            if newTargetUri.find("webprd001.library.emory.") != -1:
                newTargetUri = newTargetUri.replace("webprd001.library.emory.", "testreadux.ecds.emory.")
            newTargetUri = unicode(newTargetUri)

            # Log attributes
            f.write("page" + ", ")
            f.write(str(pid) + ", ")
            f.write(str(noid) + ", ")
            f.write(str(oriTargetUri) + ", ")
            f.write(str(newTargetUri) + ", ")
            f.write(str(page_count) + ", ")
            try:
                print(str(page_count) + "/" + str(len(volDobj.pageDObjs)) + " - " + str(noid) + " - page update")
                #TODO update target
                # if newTargetUri != oriTargetUri:
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri)
                #     updated_target_uri = response["target_uri"]
                #     response = client.update_target(type="ark", noid=noid, target_uri=newTargetUri, qualifier="PDF")
                #     updated_pdf_target_uri = response["target_uri"]
                #     f.write(str(noid) + " - page success" + ", ")
                #     f.write(str(noid) + " - page pdf success" + ", ")
            except:
                print(str(page_count) + "/" + str(len(volDobj.pageDObjs)) + " - " + str(noid) + " - page fail")
                f.write(str(noid) + " - page fail" + ", ")
            f.write("\n")
        f.write("\n")
    f.close()
def handle(self, *args, **options):
    """Generate division/author/lead CSV reports over all Articles.

    At least one of ``--div``, ``--author``, ``--lead`` is required; each
    selected option runs its per-article tally (``self.division`` /
    ``self.author`` / ``self.lead``) and writes a report CSV at the end.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # check required options
    if (not options['div']) and (not options['author']) and (not options['lead']):
        raise CommandError('At least one of the options div, author or lead is required')
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            # prompt interactively so the password never appears in shell history
            options['password'] = getpass()

    #connection to repository
    repo = Repository(username=options['username'], password=options['password'])
    pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)

    try:
        # paginate to avoid holding every object at once
        articles = Paginator(pid_set, 100)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(0, "Skipping %s because pid does not exist" % article.pid)
                    self.counts['skipped'] += 1
                    continue
                else:
                    self.output(2, "Processing %s" % article.pid)
                    # run only the tallies the caller asked for
                    if options['div']:
                        self.division(article)
                    if options['author']:
                        self.author(article)
                    if options['lead']:
                        self.lead(article)
            except Exception as e:
                self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                self.counts['errors'] += 1

    # write files
    if options['div']:
        writer = csv.writer(open("division_report.csv", 'w'))
        writer.writerow(['Division', 'Count'])
        for key, count in self.div_counts.items():
            writer.writerow([key, count])
    if options['author']:
        writer = csv.writer(open("author_report.csv", 'w'))
        writer.writerow(['Author', 'Division', 'Department', 'Count'])
        for netid, count in self.author_counts.items():
            try:
                # resolve the netid to ESD directory data for the report row
                person = User.objects.get(username=netid).get_profile().esd_data()
                writer.writerow([person.directory_name, person.division_name, person.department_shortname, count])
            except (User.DoesNotExist, UserProfile.DoesNotExist, EsdPerson.DoesNotExist) as e:
                self.output(0, "At least one part (User, Profile, ESD) for netid %s could not be found" % netid)
    if options['lead']:
        writer = csv.writer(open("lead_report.csv", 'w'))
        writer.writerow(['Division', 'Count'])
        for key, count in self.lead_counts.items():
            writer.writerow([key, count])

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
def handle(self, *args, **options):
    """Push OpenEmory Articles into Symplectic Elements, skipping duplicates.

    Duplicate detection (unless ``--force``): first query Elements by PMC id,
    then by fuzzy title match (>= 90%). Matched articles are skipped, though
    ``--rel`` still refreshes their relationships. Unmatched articles are
    created via ``process_article`` / ``process_relations``. Sleeps 1s
    between requests to throttle the API.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    #counters
    self.counts = defaultdict(int)

    #connection to repository
    repo = Repository(username=settings.FEDORA_MANAGEMENT_USER, password=settings.FEDORA_MANAGEMENT_PASSWORD)

    #Symplectic-Elements setup
    self.session = requests.Session()
    self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
    self.session.verify = False  # NOTE(review): TLS verification disabled — confirm intentional
    self.session.stream = True   # stream so response.raw.read() gets the unparsed body
    self.session.headers.update({'Content-Type': 'text/xml'})
    self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
    self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
    self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

    #if pids specified, use that list
    try:
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            #search for Articles.
            pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        # paginate to avoid holding every object at once
        articles = Paginator(pid_set, 20)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(1, "Skipping %s because pid does not exist" % article.pid)
                    self.counts['skipped'] += 1
                    continue
                # a title is required for the fuzzy-match query below
                title = article.descMetadata.content.title_info.title if (article.descMetadata.content.title_info and article.descMetadata.content.title_info.title) else None
                if title is None or title == '':
                    self.output(1, "Skipping %s because OE Title does not exist" % (article.pid))
                    self.counts['skipped'] += 1
                    continue
                if not article.is_published:
                    self.output(1, "Skipping %s because pid is not published" % article.pid)
                    self.counts['skipped'] += 1
                    continue

                # try to detect article by PMC
                if article.pmcid and not options['force']:
                    response = self.session.get(self.pub_query_url, params={'query': 'external-identifiers.pmc="PMC%s"' % article.pmcid, 'detail': 'full'})
                    entries = load_xmlobject_from_string(response.raw.read(), OESympImportArticle).entries
                    self.output(2, "Query for PMC Match: GET %s %s" % (response.url, response.status_code))
                    if response.status_code == 200:
                        if len(entries) >= 1:
                            # already in Elements: skip, optionally refreshing relations
                            self.output(1, "Skipping %s because PMC PMC%s already exists" % (article.pid, article.pmcid))
                            self.counts['skipped'] += 1
                            if options['rel']:
                                symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                                self.process_relations(entries[0].source_id, relations, options)
                                sleep(1)
                            continue
                    else:
                        self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # try to detect article by Title if it does not have PMC
                if not options['force']:
                    response = self.session.get(self.pub_query_url, params={'query': 'title~"%s"' % title, 'detail': 'full'})
                    entries = load_xmlobject_from_string(response.raw.read(), OESympImportArticle).entries
                    # Account for multiple results
                    titles = [e.title for e in entries]
                    self.output(2, "Query for Title Match: GET %s %s" % (response.url, response.status_code))
                    if response.status_code == 200:
                        found = False
                        for t in titles:
                            # fuzzy compare; success means >= 90% match
                            success, percent = percent_match(title, t, 90)
                            self.output(1, "Percent Title Match '%s' '%s' %s " % (title, t, percent))
                            if success:
                                found = True
                        if found:
                            self.output(1, "Skipping %s because Title \"%s\" already exists" % (article.pid, title))
                            self.counts['skipped'] += 1
                            # update relations if rel is set
                            if options['rel']:
                                symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                                self.process_relations(entries[0].source_id, relations, options)
                                sleep(1)
                            continue
                    else:
                        self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # Process article and relations
                symp_pub, relations = article.as_symp()
                self.process_article(article.pid, symp_pub, options)
                self.process_relations(article.pid, relations, options)
                sleep(1)
            except Exception as e:
                self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                import traceback
                traceback.print_exc()
                self.counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
    self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
    self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
def handle(self, *args, **options):
    """Push OpenEmory Articles into Symplectic Elements, skipping duplicates.

    Duplicate detection (unless ``--force``): first query Elements by PMC id,
    then by fuzzy title match (>= 90%). Matched articles are skipped, though
    ``--rel`` still refreshes their relationships. Unmatched articles are
    created via ``process_article`` / ``process_relations``. Sleeps 1s
    between requests to throttle the API.
    """
    self.verbosity = int(
        options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    #counters
    self.counts = defaultdict(int)

    #connection to repository
    repo = Repository(username=settings.FEDORA_MANAGEMENT_USER,
                      password=settings.FEDORA_MANAGEMENT_PASSWORD)

    #Symplectic-Elements setup
    self.session = requests.Session()
    self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
    self.session.verify = False  # NOTE(review): TLS verification disabled — confirm intentional
    self.session.stream = True   # stream so response.raw.read() gets the unparsed body
    self.session.headers.update({'Content-Type': 'text/xml'})
    self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
    self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
    self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

    #if pids specified, use that list
    try:
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            #search for Articles.
            pid_set = repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        # paginate to avoid holding every object at once
        articles = Paginator(pid_set, 20)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(
                        1, "Skipping %s because pid does not exist" % article.pid)
                    self.counts['skipped'] += 1
                    continue
                # a title is required for the fuzzy-match query below
                title = article.descMetadata.content.title_info.title if (
                    article.descMetadata.content.title_info
                    and article.descMetadata.content.title_info.title
                ) else None
                if title is None or title == '':
                    self.output(
                        1, "Skipping %s because OE Title does not exist" % (article.pid))
                    self.counts['skipped'] += 1
                    continue
                if not article.is_published:
                    self.output(
                        1, "Skipping %s because pid is not published" % article.pid)
                    self.counts['skipped'] += 1
                    continue

                # try to detect article by PMC
                if article.pmcid and not options['force']:
                    response = self.session.get(
                        self.pub_query_url,
                        params={
                            'query': 'external-identifiers.pmc="PMC%s"' % article.pmcid,
                            'detail': 'full'
                        })
                    entries = load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).entries
                    self.output(
                        2, "Query for PMC Match: GET %s %s" % (response.url, response.status_code))
                    if response.status_code == 200:
                        if len(entries) >= 1:
                            # already in Elements: skip, optionally refreshing relations
                            self.output(
                                1, "Skipping %s because PMC PMC%s already exists" % (article.pid, article.pmcid))
                            self.counts['skipped'] += 1
                            if options['rel']:
                                symp_pub, relations = article.as_symp(
                                    source=entries[0].source,
                                    source_id=entries[0].source_id)
                                self.process_relations(
                                    entries[0].source_id, relations, options)
                                sleep(1)
                            continue
                    else:
                        self.output(
                            1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # try to detect article by Title if it does not have PMC
                if not options['force']:
                    response = self.session.get(self.pub_query_url,
                                                params={
                                                    'query': 'title~"%s"' % title,
                                                    'detail': 'full'
                                                })
                    entries = load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).entries
                    # Account for multiple results
                    titles = [e.title for e in entries]
                    self.output(
                        2, "Query for Title Match: GET %s %s" % (response.url, response.status_code))
                    if response.status_code == 200:
                        found = False
                        for t in titles:
                            # fuzzy compare; success means >= 90% match
                            success, percent = percent_match(title, t, 90)
                            self.output(
                                1, "Percent Title Match '%s' '%s' %s " % (title, t, percent))
                            if success:
                                found = True
                        if found:
                            self.output(
                                1, "Skipping %s because Title \"%s\" already exists" % (article.pid, title))
                            self.counts['skipped'] += 1
                            # update relations if rel is set
                            if options['rel']:
                                symp_pub, relations = article.as_symp(
                                    source=entries[0].source,
                                    source_id=entries[0].source_id)
                                self.process_relations(
                                    entries[0].source_id, relations, options)
                                sleep(1)
                            continue
                    else:
                        self.output(
                            1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # Process article and relations
                symp_pub, relations = article.as_symp()
                self.process_article(article.pid, symp_pub, options)
                self.process_relations(article.pid, relations, options)
                sleep(1)
            except Exception as e:
                self.output(
                    0, "Error processing pid: %s : %s " % (article.pid, e.message))
                import traceback
                traceback.print_exc()
                self.counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
    self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
    self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
def handle(self, *args, **options):
    """Rebuild the DC identifier list for every Article (or only the pids
    given as arguments) and clear dc.relation, saving unless --noact was set.

    Identifiers collected per article: PMC id + PMC access URL (when NLM
    content exists), the MODS ark URI (when present), and the pid itself.
    Summary counts (total/skipped/errors) are written to stdout at the end.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1
    # counters
    counts = defaultdict(int)
    # check required options
    if not options['username']:
        raise CommandError('Username is required')
    else:
        # prompt for the password when it was not supplied on the command line
        if not options['password'] or options['password'] == '':
            options['password'] = getpass()
    # connection to repository
    repo = Repository(username=options['username'], password=options['password'])
    try:
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            # otherwise search Fedora for all objects with the Article content model
            pid_set = repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error gettings pids (%s)' % e.message)
    try:
        # page through results 20 at a time
        articles = Paginator(pid_set, 20)
        counts['total'] = articles.count
    except Exception as e:
        # NOTE(review): 'articles' stays unbound if Paginator fails, so the
        # loop below would raise NameError — consider returning here; verify.
        self.output(0, "Error paginating items: : %s " % (e.message))
    # process all Articles, one paginator page at a time
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(
                        1, "Skipping %s because pid does not exist" %
                        article.pid)
                    counts['skipped'] += 1
                    continue
                else:
                    self.output(0, "Processing %s" % article.pid)
                    mods = article.descMetadata.content
                    # NLM content only exists for some articles (presumably
                    # PMC-harvested ones — confirm against the models)
                    nlm = article.contentMetadata.content if article.contentMetadata.exists else None
                    identifiers = []
                    # PMC info: both the PMC id and its public access URL
                    if nlm:
                        pmc = nlm.docid
                        pmc_id = 'PMC%s' % pmc
                        access_url = pmc_access_url(pmc)
                        identifiers.extend([pmc_id, access_url])
                    if mods.ark_uri:
                        identifiers.append(mods.ark_uri)
                    identifiers.append(article.pid)
                    article.dc.content.identifier_list = identifiers
                    ########## REMOVE dc.relation ##########
                    article.dc.content.relation_list = []
                    ########################################
                    # save article (skipped entirely in --noact mode)
                    if not options['noact']:
                        article.save()
                        self.output(1, "SAVED")
            except Exception as e:
                self.output(
                    0, "Error processing pid: %s : %s " %
                    (article.pid, e.message))
                counts['errors'] += 1
    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def main(argv): # Connect to repository repo = Repository(root='%s/fedora/' % HOST, username='******' % fedoraUser, password='******' % fedoraPass) # Get philologic pids using content model philologic_pids = repo.get_objects_with_cmodel(cmodel_uri='info:fedora/niu-objects:cmodel') # Logging phil_doc = open('phil_doc_dev.csv', 'w') image_ids = [] d = defaultdict(int) for pid in philologic_pids: # Logging images = [] image_count = 0 # Get the OBJ's content as string philologic = pid.getDatastreamObject('OBJ').content # Take the opportunity to replace deprecated HTML entity reference philologic = re.sub("˙", ".", philologic) # Load OBJ content into soup. Must specify html5lib parser, b/c lxml causes fatal exception (memory leak) soup = BeautifulSoup(philologic, "html5lib") # Find all ARTFL spans and <a>'s spans = soup.find_all("span", "ARTFL-figure-missing") links = soup.find_all("a", "ARTFL-figure") # Replace /fedora/repository with /islandora/object in existing links for a in links: href = a['href'] if href.startswith('/fedora/repository/'): a['href'] = '/islandora/object/%s' % href[19:] for span in spans: # Retreive the sysid and strip the file format. title = span['sysid'].split('.')[0] # Use sysid as title to send RI query for pid results = repo.risearch.sparql_query('select ?pid where {?pid <dc:title> "%s"}' % title) try: # sparql_query returns CSV object; next will retreive first row. 
# If no results, throw exception and log that image p = next(results)['pid'].replace('info:fedora/', '') # Create <a> tag with @href pointing to object new_tag = soup.new_tag("a", href="/islandora/object/%s/datastream/OBJ/view" % p) # B/c it's a reserved word, we have to add @class seperately new_tag['class']="ARTFL-figure" # Grab and add the <span> string new_tag.string = span.string # Replace <span> with <a> span.replace_with(new_tag) print "Successfully changed %s in %s" % (title, pid) except: print "Failed to locate %s in %s" % (title, pid) # Logging images.append(title) image_count+= 1 pass # Retreive entire OBJ datastream obj = pid.getDatastreamObject('OBJ') # Replace OBJ content with soup. Encoding as html to maintain entity references. obj.content = soup.encode(formatter="html") # Save and we're done. obj.save() # Because GSearch isn't listening, we have to index the update url = '%s/fedoragsearch/rest?operation=updateIndex&action=fromPid&value=%s' % (HOST, pid) gsearchOpener.open(url) # Rest is all logging not founds and errors image_ids.extend(images) images_string = ';'.join(images) phil_doc.write('%s,%s,%s\n' % (pid, image_count, images_string)) for i in image_ids: d[i] += 1 with open('phil_image_dev.csv', 'w') as outfile: phil_image = csv.writer(outfile) for key, value in d.items(): phil_image.writerow([key, value]) phil_doc.close()
def handle(self, *pids, **options):
    """Update ARK targets for Volume objects to point at readux URLs.

    For each Volume (all of them, or only the pids given as arguments) whose
    DC identifier is an ARK: point the unqualified ARK at the volume landing
    page and the PDF-qualified ARK at the volume PDF. With --dry-run, counts
    what would change without calling the pid manager. Requires pid-manager
    settings in localsettings (DjangoPidmanRestClient).
    """
    dry_run = options.get('dry_run', False)
    verbosity = int(options.get('verbosity', self.v_normal))
    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # error if pid manager config options not in localsettings
        raise CommandError(err)

    # if pids are specified on command line, only process those objects;
    # otherwise, look for all volume objects in fedora
    if pids:
        objs = [repo.get_object(pid, type=Volume) for pid in pids]
    else:
        objs = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                            type=Volume)
    stats = defaultdict(int)
    for obj in objs:
        if not obj.exists:
            if verbosity >= self.v_normal:
                self.stdout.write(
                    '%s does not exist or is not accessible' % obj.pid)
            stats['skipped'] += 1
            continue
        stats['objs'] += 1
        if is_ark(obj.dc.content.identifier):
            parsed_ark = parse_ark(obj.dc.content.identifier)
            noid = parsed_ark['noid']
            try:
                # return value unused; the call verifies the ARK exists
                # in the configured pid manager (removed unused 'ark_info')
                pidman.get_ark(noid)
            except Exception as err:
                # requested ARK is not in the configured pid manager
                # (this should ONLY happen in dev/QA)
                if verbosity >= self.v_normal:
                    if '404: NOT FOUND' in str(err):
                        self.stdout.write(
                            'Error retriving ARK information for %s: Not Found'
                            % obj.pid)
                    else:
                        self.stdout.write(
                            'Error retriving ARK information for %s' % obj.pid)
                stats['skipped'] += 1
                continue

            # update unqualified ark to resolve to readux volume landing page
            if not dry_run:
                pidman.update_ark_target(noid, target_uri=self.volume_url(obj),
                                         active=True)

            # we expect a qualified ARK target for the PDF; update whether
            # it currently exists or not
            qual = 'PDF'
            # count as updated in dry run mode too (i.e. "would be updated")
            stats['updated'] += 1
            if not dry_run:
                # FIXME: catch possible exceptions here?
                pidman.update_ark_target(noid, qual,
                                         target_uri=self.pdf_url(obj),
                                         active=True)

    # output summary
    if verbosity >= self.v_normal:
        # %% survives the first substitution so the second pass can fill in
        # the plural suffix and the dry-run phrasing
        msg = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
        msg = msg % ('s' if stats['objs'] != 1 else '',
                     ' would have' if dry_run else '')
        self.stdout.write(msg)
class Command(BaseCommand):
    '''Fetches `~openemory.publication.models.Article` objects from Fedora and fixes the DC and MODS checksumes:
     1. Replaces '\r' with '' in abstract field.
     2. Save object. Note: this will make a new version of the mods and copy some fields to the DC

    If PIDs are provided in the arguments, that list of pids will be used instead of searching Fedora.
    '''
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    default=False,
                    help='Reports the pid and total number of Articles that would be processed but does not really do anything.'),
        make_option('--username',
                    action='store',
                    help='Username of fedora user to connect as'),
        make_option('--password',
                    action='store',
                    help='Password for fedora user, password= will prompt for password'),
        make_option('--host',
                    action='store',
                    default=settings.FEDORA_ROOT,
                    help='Hostname of fedora instance. Defaults to localsettings:FEDORA_ROOT.'),
    )

    def handle(self, *args, **options):
        """Strip carriage returns from abstracts to repair invalid checksums.

        Only records whose abstract contains '\\r' AND whose DC checksum is
        currently invalid are touched; everything else is counted as skipped.
        """
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1
        # counters
        counts = defaultdict(int)
        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        else:
            if not options['password'] or options['password'] == '':
                options['password'] = getpass()
        # connection to repository
        self.repo = Repository(options['host'], username=options['username'], \
                               password=options['password'],)
        # if pids specified, use that list; otherwise search Fedora
        if len(args) != 0:
            article_set = self.get_pids(args)
        else:
            article_set = self.repo.get_objects_with_cmodel(
                Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # process all Articles
        for a in article_set:
            try:
                self.output(1, "Processing %s" % a.pid)
                if (a.descMetadata.content.abstract is not None) \
                        and (a.descMetadata.content.abstract.text) \
                        and ('\r' in a.descMetadata.content.abstract.text) \
                        and (not a.dc.validate_checksum()):
                    a.descMetadata.content.abstract.text = \
                        a.descMetadata.content.abstract.text.replace('\r', '')
                    # save article
                    try:
                        if not options['noact']:
                            a.save("Removing backslash-r to fix checksums")
                    except Exception as e:
                        self.output(0, "Error processing pid: %s : %s " % (a.pid, e.message))
                        counts['errors'] += 1
                    else:
                        # BUGFIX: previously incremented unconditionally, so a
                        # failed save was counted as both an error and a fix
                        counts['fixed'] += 1
                else:
                    self.output(1, "Skipping %s" % a.pid)
                    counts['skip'] += 1
            except Exception as e:
                self.output(1, "Error on %s: %s" % (a.pid, e.message))

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Fixed: %s\n" % counts['fixed'])
        self.stdout.write("Skipped: %s\n" % counts['skip'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

    def get_pids(self, pids):
        '''Generator over the requested pids, yielding only Publications
        whose primary content model matches ARTICLE_CONTENT_MODEL.'''
        for p in pids:
            obj = self.repo.get_object(pid=p, type=Publication)
            if str(obj.get_models()[0]) == Publication.ARTICLE_CONTENT_MODEL:
                yield obj

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
def handle(self, *args, **options):
    """Tally Article statistics and write division/author/lead CSV reports.

    Requires at least one of the --div / --author / --lead options and
    fedora credentials. Per-article tallies are accumulated by
    self.division / self.author / self.lead into self.div_counts /
    self.author_counts / self.lead_counts, then dumped as CSV files.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # check required options
    if (not options['div']) and (not options['author']) and (
            not options['lead']):
        raise CommandError(
            'At least one of the options div, author or lead is required')
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            options['password'] = getpass()

    # connection to repository
    repo = Repository(username=options['username'],
                      password=options['password'])
    pid_set = repo.get_objects_with_cmodel(Article.ARTICLE_CONTENT_MODEL,
                                           Article)
    try:
        articles = Paginator(pid_set, 100)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))
        # BUGFIX: without a paginator there is nothing to iterate; falling
        # through used to raise NameError on 'articles' below
        return

    # process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(
                        0, "Skipping %s because pid does not exist" %
                        article.pid)
                    self.counts['skipped'] += 1
                    continue
                else:
                    self.output(2, "Processing %s" % article.pid)
                    if options['div']:
                        self.division(article)
                    if options['author']:
                        self.author(article)
                    if options['lead']:
                        self.lead(article)
            except Exception as e:
                self.output(
                    0, "Error processing pid: %s : %s " %
                    (article.pid, e.message))
                self.counts['errors'] += 1

    # write report files; 'with' ensures each CSV is flushed and closed
    # (previously csv.writer(open(...)) left the file handles open)
    if options['div']:
        with open("division_report.csv", 'w') as div_file:
            writer = csv.writer(div_file)
            writer.writerow(['Division', 'Count'])
            for key, count in self.div_counts.items():
                writer.writerow([key, count])

    if options['author']:
        with open("author_report.csv", 'w') as author_file:
            writer = csv.writer(author_file)
            writer.writerow(['Author', 'Division', 'Department', 'Count'])
            for netid, count in self.author_counts.items():
                try:
                    person = User.objects.get(
                        username=netid).get_profile().esd_data()
                    writer.writerow([
                        person.directory_name, person.division_name,
                        person.department_shortname, count
                    ])
                except (User.DoesNotExist, UserProfile.DoesNotExist,
                        EsdPerson.DoesNotExist) as e:
                    self.output(
                        0,
                        "At least one part (User, Profile, ESD) for netid %s could not be found"
                        % netid)

    if options['lead']:
        with open("lead_report.csv", 'w') as lead_file:
            writer = csv.writer(lead_file)
            writer.writerow(['Division', 'Count'])
            for key, count in self.lead_counts.items():
                writer.writerow([key, count])

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
def handle(self, *args, **options):
    '''Regenerate DC metadata for AudioObject items and save the results,
    reporting totals for processed / saved / skipped / errored objects.'''
    counts = defaultdict(int)

    # username is mandatory; prompt for the password when it wasn't given
    if not options['username']:
        raise CommandError('Username is required')
    if not options['password'] or options['password'] == '':
        options['password'] = getpass()

    repo = Repository(username=options['username'],
                      password=options['password'])
    try:
        # explicit pid arguments limit the run; otherwise fetch every
        # object carrying the audio content model
        if args:
            pid_set = [repo.get_object(pid=p, type=AudioObject)
                       for p in args]
        else:
            pid_set = repo.get_objects_with_cmodel(
                AudioObject.AUDIO_CONTENT_MODEL, AudioObject)
    except Exception as e:
        raise CommandError('Error gettings pids (%s)' % e.message)

    try:
        objects = Paginator(pid_set, 20)
        counts['total'] = objects.count
    except Exception as e:
        self.output("Error paginating items: : %s " % (e.message))

    # walk every page of results
    for page_num in objects.page_range:
        try:
            page_items = objects.page(page_num).object_list
        except Exception as e:
            # log the failure and move on to the next page
            self.output("Error getting page: %s : %s " % (page_num, e.message))
            counts['errors'] += 1
            continue
        for audio in page_items:
            try:
                if not audio.exists:
                    self.output("Skipping %s because pid does not exist" %
                                audio.pid)
                    counts['skipped'] += 1
                    continue
                self.output("Processing %s" % audio.pid)
                audio._update_dc()
                # persist unless this is a no-act (dry) run
                if not options['noact']:
                    audio.save("cleanup DC")
                    self.output("SAVED %s" % audio.pid)
                    counts['saved'] += 1
                counts['processed'] += 1
            except Exception as e:
                self.output("Error processing pid: %s : %s " %
                            (audio.pid, e.message))
                counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Total number processed: %s\n" % counts['processed'])
    self.stdout.write("Total number saved: %s\n" % counts['saved'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def handle(self, *args, **options):
    """Rebuild the DC identifier list for every Publication (or only the
    pids given as arguments) and clear dc.relation, saving unless --noact.

    Identifiers collected per article: PMC id + PMC access URL (when NLM
    content exists), the MODS ark URI (when present), and the pid itself.
    """
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1
    # counters
    counts = defaultdict(int)
    # check required options
    if not options['username']:
        raise CommandError('Username is required')
    else:
        if not options['password'] or options['password'] == '':
            options['password'] = getpass()
    # connection to repository
    repo = Repository(username=options['username'],
                      password=options['password'])
    try:
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Publication) for p in pids]
        else:
            # search for Articles; CONSISTENCY FIX: previously passed Article
            # as the return type here while the explicit-pid branch above
            # used Publication — use Publication for both
            pid_set = repo.get_objects_with_cmodel(
                Publication.ARTICLE_CONTENT_MODEL, Publication)
    except Exception as e:
        raise CommandError('Error gettings pids (%s)' % e.message)
    try:
        articles = Paginator(pid_set, 20)
        counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))
        # BUGFIX: without a paginator there is nothing to iterate; falling
        # through used to raise NameError on 'articles' below
        return
    # process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(1, "Skipping %s because pid does not exist" % article.pid)
                    counts['skipped'] += 1
                    continue
                self.output(0, "Processing %s" % article.pid)
                mods = article.descMetadata.content
                # NLM content is only present on some articles; treat the
                # rest as having no PMC identifiers
                nlm = article.contentMetadata.content if article.contentMetadata.exists else None
                identifiers = []
                # PMC info: both the PMC id and its public access URL
                if nlm:
                    pmc = nlm.docid
                    pmc_id = 'PMC%s' % pmc
                    access_url = pmc_access_url(pmc)
                    identifiers.extend([pmc_id, access_url])
                if mods.ark_uri:
                    identifiers.append(mods.ark_uri)
                identifiers.append(article.pid)
                article.dc.content.identifier_list = identifiers
                # remove dc.relation entirely
                article.dc.content.relation_list = []
                # save article (skipped in --noact mode)
                if not options['noact']:
                    article.save()
                    self.output(1, "SAVED")
            except Exception as e:
                self.output(0, "Error processing pid: %s : %s " % (article.pid, e.message))
                counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % counts['total'])
    self.stdout.write("Skipped: %s\n" % counts['skipped'])
    self.stdout.write("Errors: %s\n" % counts['errors'])
def handle(self, *pids, **options):
    '''Point each Volume's unqualified ARK at its readux landing page and
    its PDF-qualified ARK at the volume PDF, honoring --dry-run.'''
    dry_run = options.get('dry_run', False)
    verbosity = int(options.get('verbosity', self.v_normal))
    repo = Repository()
    try:
        pidman = DjangoPidmanRestClient()
    except Exception as err:
        # pid manager settings are missing from localsettings
        raise CommandError(err)

    # explicit pid arguments limit the run; otherwise process every Volume
    if pids:
        volumes = [repo.get_object(pid, type=Volume) for pid in pids]
    else:
        volumes = repo.get_objects_with_cmodel(Volume.VOLUME_CONTENT_MODEL,
                                               type=Volume)

    stats = defaultdict(int)
    chatty = verbosity >= self.v_normal
    for vol in volumes:
        if not vol.exists:
            if chatty:
                self.stdout.write('%s does not exist or is not accessible' % vol.pid)
            stats['skipped'] += 1
            continue
        stats['objs'] += 1

        identifier = vol.dc.content.identifier
        if not is_ark(identifier):
            continue
        noid = parse_ark(identifier)['noid']
        try:
            ark_info = pidman.get_ark(noid)
        except Exception as err:
            # the ARK is missing from the configured pid manager
            # (expected only in dev/QA)
            if chatty:
                if '404: NOT FOUND' in str(err):
                    msg = 'not found'
                    self.stdout.write('Error retriving ARK information for %s: Not Found' % vol.pid)
                else:
                    self.stdout.write('Error retriving ARK information for %s' % vol.pid)
            stats['skipped'] += 1
            continue

        # unqualified ark resolves to the readux volume landing page
        if not dry_run:
            pidman.update_ark_target(noid, target_uri=self.volume_url(vol),
                                     active=True)
        # the PDF-qualified target is (re)written whether or not it exists;
        # a dry run still counts this as "would have updated"
        stats['updated'] += 1
        if not dry_run:
            # FIXME: catch possible exceptions here?
            pidman.update_ark_target(noid, 'PDF',
                                     target_uri=self.pdf_url(vol),
                                     active=True)

    # output summary; the doubled %% survives the first format pass so the
    # second pass can fill in plural/dry-run phrasing
    if chatty:
        summary = 'Processed %(objs)d object%%s; skipped %(skipped)d,%%s updated %(updated)d' % stats
        summary = summary % ('s' if stats['objs'] != 1 else '',
                             ' would have' if dry_run else '')
        self.stdout.write(summary)
class Command(BaseCommand):
    '''Fetches `~openemory.publication.models.Article` objects from Fedora
    and fixes the DC and MODS checksumes:
     1. Replaces '\r' with '' in abstract field.
     2. Save object. Note: this will make a new version of the mods and copy some fields to the DC

    If PIDs are provided in the arguments, that list of pids will be used instead of searching Fedora.
    '''
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option(
            '--noact', '-n',
            action='store_true',
            default=False,
            help='Reports the pid and total number of Articles that would be processed but does not really do anything.'),
        make_option(
            '--username',
            action='store',
            help='Username of fedora user to connect as'),
        make_option(
            '--password',
            action='store',
            help='Password for fedora user, password= will prompt for password'),
        make_option(
            '--host',
            action='store',
            default=settings.FEDORA_ROOT,
            help='Hostname of fedora instance. Defaults to localsettings:FEDORA_ROOT.'),
    )

    def handle(self, *args, **options):
        """Strip carriage returns from abstracts to repair invalid checksums.

        Only records whose abstract contains '\\r' AND whose DC checksum is
        currently invalid are touched; everything else is counted as skipped.
        """
        self.verbosity = int(
            options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1
        # counters
        counts = defaultdict(int)
        # check required options
        if not options['username']:
            raise CommandError('Username is required')
        else:
            if not options['password'] or options['password'] == '':
                options['password'] = getpass()
        # connection to repository
        self.repo = Repository(options['host'], username=options['username'], \
                               password=options['password'],)
        # if pids specified, use that list; otherwise search Fedora
        if len(args) != 0:
            article_set = self.get_pids(args)
        else:
            article_set = self.repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, type=Article)

        # process all Articles
        for a in article_set:
            try:
                self.output(1, "Processing %s" % a.pid)
                if (a.descMetadata.content.abstract is not None) \
                        and (a.descMetadata.content.abstract.text) \
                        and ('\r' in a.descMetadata.content.abstract.text) \
                        and (not a.dc.validate_checksum()):
                    a.descMetadata.content.abstract.text = \
                        a.descMetadata.content.abstract.text.replace('\r', '')
                    # save article
                    try:
                        if not options['noact']:
                            a.save("Removing backslash-r to fix checksums")
                    except Exception as e:
                        self.output(
                            0, "Error processing pid: %s : %s " %
                            (a.pid, e.message))
                        counts['errors'] += 1
                    else:
                        # BUGFIX: previously incremented unconditionally, so a
                        # failed save was counted as both an error and a fix
                        counts['fixed'] += 1
                else:
                    self.output(1, "Skipping %s" % a.pid)
                    counts['skip'] += 1
            except Exception as e:
                self.output(1, "Error on %s: %s" % (a.pid, e.message))

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Fixed: %s\n" % counts['fixed'])
        self.stdout.write("Skipped: %s\n" % counts['skip'])
        self.stdout.write("Errors: %s\n" % counts['errors'])

    def get_pids(self, pids):
        '''Generator over the requested pids, yielding only objects whose
        primary content model matches ARTICLE_CONTENT_MODEL.'''
        for p in pids:
            obj = self.repo.get_object(pid=p, type=Article)
            if str(obj.get_models()[0]) == Article.ARTICLE_CONTENT_MODEL:
                yield obj

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)