def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    if options['article'] and options['book']:
        raise CommandError('Cannot use both parameters')
    if not options['article'] and not options['book']:
        raise CommandError('Use at least one parameter')
    if options['article']:
        cmodel = Publication.ARTICLE_CONTENT_MODEL
    if options['book']:
        cmodel = Publication.BOOK_CONTENT_MODEL

    # connection to repository
    self.repo = ManagementRepository()
    pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

    try:
        publications = Paginator(pid_set, 100)
    except Exception as e:
        self.output(0, "Error paginating items: %s" % e.message)

    # process all publications
    for p in publications.page_range:
        try:
            objs = publications.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page %s: %s" % (p, e.message))
            continue
        for publication in objs:
            try:
                if not publication.exists:
                    self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                    continue
                if not publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                    publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                    publication.save()
            except Exception as e:
                self.output(0, "Error processing pid %s: %s" % (publication.pid, e.message))
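# handle() above calls self.output(); a minimal sketch of the verbosity-aware
# helper it assumes, matching the version defined on the other commands in
# this module:
def output(self, v, msg):
    '''simple function to handle logging output based on verbosity'''
    if self.verbosity >= v:
        self.stdout.write("%s\n" % msg)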
def handle(self, *args, **options):
    self.options = options
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # counters
    self.counts = defaultdict(int)

    # duplicates list
    self.duplicates = {}

    # set the name of the duplication report
    self.reportsdirectory = settings.REPORTS_DIR
    self.reportname = "replaces-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

    # connection to repository
    self.repo = ManagementRepository()

    # get last run time and set new one
    time_zone = pytz.timezone('US/Eastern')
    last_run = LastRun.objects.get(name='Convert Symp to OE')
    date = last_run.start_time

    self.output(1, '%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
    date = time_zone.localize(date)
    date = date.astimezone(pytz.utc)
    date_str = date.strftime("%Y-%m-%dT%H:%M:%S")
    self.output(1, '%s UTC' % date_str)

    try:
        # raise an error unless exactly one of --replace / --ignore is set
        if self.options['replace'] is self.options['ignore']:
            raise Exception("no actions set. Specify --replace or --ignore")
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
        else:
            raise Exception("no pids specified")
    except Exception as e:
        raise Exception("Error getting pids: %s" % e.message)

    self.counts['total'] = len(pids)

    for pid in pids:
        try:
            self.output(1, "\nProcessing %s" % pid)
            # load first as Article because that is the most likely type
            obj = self.repo.get_object(pid=pid)
            if not obj.exists:
                self.output(1, "Skipping because %s does not exist" % pid)
                continue
            ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')
            if not ds:
                self.output(1, "Skipping %s because SYMPLECTIC-ATOM ds does not exist" % pid)
                continue
            ds_mod = ds.last_modified().strftime("%Y-%m-%dT%H:%M:%S")

            # for property, value in vars(ds).iteritems():
            #     msg = "%s: %s" % (property, value)
            #     self.output(1, msg)

            # WHEN OVERWRITING ORIGINALS WITH A DUPLICATE:
            # 1. Make sure the object content model has a from_symp() function
            # 2. Add to the content_types dict
            # 3. Add an elif block (see a few lines below)
            # 4. Add a line in the summary section

            # choose content type
            content_types = {'Article': 'journal article', 'Book': 'book',
                             'Chapter': 'chapter', 'Conference': 'conference',
                             'Poster': 'poster', 'Report': 'report',
                             'Presentation': 'presentation'}
            obj_types = ds.content.node.xpath('atom:category/@label',
                                              namespaces={'atom': 'http://www.w3.org/2005/Atom'})
            if obj_types[1] in content_types.values():
                logging.info("Processing %s as Publication" % pid)
                obj = self.repo.get_object(pid=pid, type=Publication)
            else:
                logging.info("Skipping %s because of invalid content type" % pid)
                continue

            obj.from_symp()

            # get a list of predicates
            properties = []
            for p in list(obj.rels_ext.content.predicates()):
                properties.append(str(p))

            # process only if the rels-ext has the "replaces" tag, which indicates duplicates
            replaces_tag = "http://purl.org/dc/terms/replaces"
            if replaces_tag in properties:
                # get the pubs object
                # pubs_id = obj.sympAtom.content.serialize().split('<pubs:id>')[1].split('</pubs:id>')[0]
                pubs_id = obj.sympAtom.content.pubs_id
                pubs_id = "pubs:%s" % pubs_id
                self.output(1, "Pub ID: %s" % pubs_id)
                pubs_obj = self.repo.get_object(pid=pubs_id)
                self.counts['Publication'] += 1

                original_pid = obj.rels_ext.content.serialize().split('<dcterms:replaces rdf:resource="')[1].split('"')[0]
                original_obj = self.repo.get_object(pid=original_pid, type=Publication)
                original_obj.from_symp()
                if not original_obj.exists:
                    self.output(1, "Skipping because %s does not exist" % original_obj)
                    self.counts['skipped'] += 1
                    continue
                if pid not in original_obj.rels_ext.content.serialize():
                    self.output(1, "Skipping because %s does not contain %s" % (original_obj, pid))
                    self.counts['skipped'] += 1
                    continue
                self.output(1, "Original pid: %s\n Duplicate pid: %s" % (original_pid, pid))

                # REPLACE ORIGINAL WITH DUPLICATE
                if self.options['replace']:
                    original_obj.sympAtom.content = obj.sympAtom.content
                    # replace PDF
                    mime = None
                    mime_ds_list = [i for i in obj.ds_list
                                    if obj.ds_list[i].mimeType in obj.allowed_mime_types.values()]
                    if mime_ds_list:
                        # sort by datastream timestamp and keep the most recent
                        new_dict = {}
                        for mime in mime_ds_list:
                            new_dict[mime] = obj.getDatastreamObject(mime).last_modified()
                        sorted_mimes = sorted(new_dict.items(), key=lambda x: x[1])
                        mime = sorted_mimes[-1][0]  # most recent
                        original_obj.pdf.content = obj.getDatastreamObject(mime).content

                # IGNORE DUPLICATE
                elif self.options['ignore']:
                    self.reportname = "ignore-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")
                    # add to duplicate dict for report
                    self.duplicates[pid.replace('info:fedora/', '')] = original_pid.replace('info:fedora/', '')
                    # update pubs object to point hasCurrent and hasVisible attributes to the original_pid
                    sympns = Namespace('info:symplectic/symplectic-elements:def/model#')
                    pubs_obj.rels_ext.content.bind('symp', sympns)
                    has_current = (URIRef("info:fedora/" + pubs_obj.pid),
                                   URIRef('info:symplectic/symplectic-elements:def/model#hasCurrent'),
                                   URIRef(original_pid))
                    has_visible = (URIRef("info:fedora/" + pubs_id),
                                   URIRef('info:symplectic/symplectic-elements:def/model#hasVisible'),
                                   URIRef(original_pid))
                    # hasCurrent
                    pubs_obj.rels_ext.content.remove(has_current)
                    pubs_obj.rels_ext.content.set(has_current)
                    # hasVisible
                    pubs_obj.rels_ext.content.remove(has_visible)
                    pubs_obj.rels_ext.content.set(has_visible)
                    # close pubs rels_ext object
                    pubs_obj.rels_ext.content.close()

                # SAVE OBJECTS UNLESS NOACT OPTION
                if not options['noact']:
                    original_obj.save()
                    pubs_obj.save()
                    self.counts['saved'] += 1

            # if not a duplicate
            else:
                self.output(1, "Skipping because %s is not a duplicate" % pid)
                self.counts['skipped'] += 1
                continue

        except (KeyboardInterrupt, SystemExit):
            if self.counts['saved'] > 0:
                self.write_report(self.duplicates, error="interrupt")
            raise
        except Exception as e:
            self.output(1, "Error processing %s: %s" % (pid, e.message))
            self.output(1, obj.rels_ext.content.serialize(pretty=True))
            self.counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Converted: %s\n" % self.counts['saved'])
    if self.counts['saved'] > 0:
        self.write_report(self.duplicates)
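# A hedged illustration of the four-step checklist in handle() above: adding
# support for a hypothetical 'Dataset' type (the name and label below are
# illustrative, not part of this codebase) would mean giving its content model
# a from_symp() method, then extending the dict that drives type detection:
#
#     content_types['Dataset'] = 'dataset'
#
# plus the corresponding branch in the command and a 'Dataset' line in the
# summary section.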
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # counters
    self.counts = defaultdict(int)

    # connection to repository
    repo = ManagementRepository()

    # Symplectic-Elements setup
    self.session = requests.Session()
    self.session.auth = (settings.SYMPLECTIC_USER, settings.SYMPLECTIC_PASSWORD)
    self.session.verify = False
    self.session.stream = True
    self.session.headers.update({'Content-Type': 'text/xml'})

    self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publications")
    self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "publication/records/manual")
    self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL, "relationships")

    try:
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Publication) for p in pids]
        else:
            # search for Articles
            pid_set = repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        articles = Paginator(pid_set, 20)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: %s" % e.message)

    # process all articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page %s: %s" % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                # duplicate detection against Elements is currently disabled:
                # if not article.exists:
                #     self.output(1, "Skipping %s because pid does not exist" % article.pid)
                #     self.counts['skipped'] += 1
                #     continue
                # title = article.descMetadata.content.title_info.title if (article.descMetadata.content.title_info and article.descMetadata.content.title_info.title) else None
                # if title is None or title == '':
                #     self.output(1, "Skipping %s because OE Title does not exist" % article.pid)
                #     self.counts['skipped'] += 1
                #     continue
                # if not article.is_published:
                #     self.output(1, "Skipping %s because pid is not published" % article.pid)
                #     self.counts['skipped'] += 1
                #     continue
                #
                # # try to detect article by PMC
                # if article.pmcid and not options['force']:
                #     response = self.session.get(self.pub_query_url, params={'query': 'external-identifiers.pmc="PMC%s"' % article.pmcid, 'detail': 'full'})
                #     entries = load_xmlobject_from_string(response.raw.read(), OESympImportPublication).entries
                #     self.output(2, "Query for PMC Match: GET %s %s" % (response.url, response.status_code))
                #     if response.status_code == 200:
                #         if len(entries) >= 1:
                #             self.output(1, "Skipping %s because PMC PMC%s already exists" % (article.pid, article.pmcid))
                #             self.counts['skipped'] += 1
                #             if options['rel']:
                #                 symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                #                 self.process_relations(entries[0].source_id, relations, options)
                #                 sleep(1)
                #             continue
                #     else:
                #         self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                #         self.counts['skipped'] += 1
                #         continue
                #
                # # try to detect article by Title if it does not have PMC
                # if not options['force']:
                #     response = self.session.get(self.pub_query_url, params={'query': 'title~"%s"' % title, 'detail': 'full'})
                #     entries = load_xmlobject_from_string(response.raw.read(), OESympImportPublication).entries
                #     # account for multiple results
                #     titles = [e.title for e in entries]
                #     self.output(2, "Query for Title Match: GET %s %s" % (response.url, response.status_code))
                #     if response.status_code == 200:
                #         found = False
                #         for t in titles:
                #             success, percent = percent_match(title, t, 90)
                #             self.output(1, "Percent Title Match '%s' '%s' %s " % (title, t, percent))
                #             if success:
                #                 found = True
                #         if found:
                #             self.output(1, "Skipping %s because Title \"%s\" already exists" % (article.pid, title))
                #             self.counts['skipped'] += 1
                #             # update relations if rel is set
                #             if options['rel']:
                #                 symp_pub, relations = article.as_symp(source=entries[0].source, source_id=entries[0].source_id)
                #                 self.process_relations(entries[0].source_id, relations, options)
                #                 sleep(1)
                #             continue
                #     else:
                #         self.output(1, "Skipping %s because trouble with request %s %s" % (article.pid, response.status_code, entries[0].title))
                #         self.counts['skipped'] += 1
                #         continue

                # process article and relations
                symp_pub, relations = article.as_symp()
                self.process_article(article.pid, symp_pub, options)
                self.process_relations(article.pid, relations, options)
                sleep(1)
            except Exception as e:
                self.output(0, "Error processing pid %s: %s" % (article.pid, e.message))
                import traceback
                traceback.print_exc()
                self.counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
    self.stdout.write("Articles Processed: %s\n" % self.counts['articles_processed'])
    self.stdout.write("Relations Processed: %s\n" % self.counts['relations_processed'])
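# The commented-out title matching above calls percent_match(title, candidate,
# threshold) and unpacks (success, percent). That helper lives elsewhere in
# this codebase; a minimal sketch consistent with the usage shown, assuming a
# difflib-style similarity ratio (the real implementation may differ):
import difflib

def percent_match(title, candidate, threshold):
    '''Return (success, percent) where success is True if the two strings
    are at least threshold percent similar.'''
    percent = difflib.SequenceMatcher(None, title.lower(), candidate.lower()).ratio() * 100
    return percent >= threshold, percent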
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # connection to repository
    self.repo = ManagementRepository()
    pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

    # csv.writer objects have no close() method, so keep a handle on the file
    csvfile = open("publications_csv.csv", 'wb')
    writer = csv.writer(csvfile)
    writer.writerow([
        smart_str(u"PID"),
        smart_str(u"Title"),
        smart_str(u"Withdrawn"),
        smart_str(u"Authors"),
        smart_str(u"Journal Title"),
        smart_str(u"Publisher"),
        smart_str(u"Version"),
        smart_str(u"Final Published Link"),
        smart_str(u"DOI"),
        smart_str(u"Subjects"),
        smart_str(u"Funding Group"),
        smart_str(u"CC License"),
        smart_str(u"Copyright Statement"),
        smart_str(u"Admin Note"),
        smart_str(u"Date Reviewed"),
        smart_str(u"Rights Research Date"),
        smart_str(u"PMC"),
        smart_str(u"PUBSID"),
        smart_str(u"File Deposited"),
    ])

    try:
        articles = Paginator(pid_set, 100)
    except Exception as e:
        self.output(0, "Error paginating items: %s" % e.message)

    # process all articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page %s: %s" % (p, e.message))
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(0, "Skipping %s because pid does not exist" % article.pid)
                    continue
                mods = article.descMetadata.content
                symp = article.sympAtom.content
                authors = []
                subjects = []
                funders = []
                for author in mods.authors:
                    authors.append('%s %s' % (author.given_name, author.family_name))
                for subject in mods.subjects:
                    subjects.append(subject.topic)
                for funder in mods.funders:
                    funders.append(funder.name)

                writer.writerow([
                    smart_str(article.pid if article.pid else ''),
                    smart_str(article.label if article.label else ''),
                    smart_str(article.is_withdrawn),
                    smart_str(",".join(authors)),
                    smart_str(mods.journal.title if mods.journal else ''),
                    smart_str(mods.journal.publisher if mods.journal else ''),
                    smart_str(mods.version if mods.version else ''),
                    smart_str(mods.final_version.url if mods.final_version else ''),
                    smart_str(mods.final_version.doi if mods.final_version else ''),
                    smart_str(",".join(subjects)),
                    smart_str(",".join(funders)),
                    smart_str(mods.license.text if mods.license else ''),
                    smart_str(mods.copyright.text if mods.copyright else ''),
                    smart_str(mods.admin_note.text if mods.admin_note else ''),
                    smart_str(article.provenance.content.date_reviewed if article.provenance else ''),
                    smart_str(mods.rights_research_date if mods.rights_research_date else ''),
                    smart_str(article.pmcid if article.pmcid else ''),
                    smart_str(symp.pubs_id if symp else ''),
                    smart_str("Yes" if article.pdf.exists else 'No'),
                ])
            except Exception as e:
                self.output(0, "Error processing pid %s: %s" % (article.pid, e.message))
                # self.counts['errors'] += 1

    csvfile.close()
class Command(BaseCommand):
    '''This command runs through all the articles and exports their metadata
    to a CSV report (publications_csv.csv).
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n', action='store_true', default=False,
                    help='Reports what would be done, but does not make any changes.'),
    )

    def handle(self, *args, **options):
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        # csv.writer objects have no close() method, so keep a handle on the file
        csvfile = open("publications_csv.csv", 'wb')
        writer = csv.writer(csvfile)
        writer.writerow([
            smart_str(u"PID"),
            smart_str(u"Title"),
            smart_str(u"Withdrawn"),
            smart_str(u"Authors"),
            smart_str(u"Journal Title"),
            smart_str(u"Publisher"),
            smart_str(u"Version"),
            smart_str(u"Final Published Link"),
            smart_str(u"DOI"),
            smart_str(u"Subjects"),
            smart_str(u"Funding Group"),
            smart_str(u"CC License"),
            smart_str(u"Copyright Statement"),
            smart_str(u"Admin Note"),
            smart_str(u"Date Reviewed"),
            smart_str(u"Rights Research Date"),
            smart_str(u"PMC"),
            smart_str(u"PUBSID"),
            smart_str(u"File Deposited"),
        ])

        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: %s" % e.message)

        # process all articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page %s: %s" % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    mods = article.descMetadata.content
                    symp = article.sympAtom.content
                    authors = []
                    subjects = []
                    funders = []
                    for author in mods.authors:
                        authors.append('%s %s' % (author.given_name, author.family_name))
                    for subject in mods.subjects:
                        subjects.append(subject.topic)
                    for funder in mods.funders:
                        funders.append(funder.name)

                    writer.writerow([
                        smart_str(article.pid if article.pid else ''),
                        smart_str(article.label if article.label else ''),
                        smart_str(article.is_withdrawn),
                        smart_str(",".join(authors)),
                        smart_str(mods.journal.title if mods.journal else ''),
                        smart_str(mods.journal.publisher if mods.journal else ''),
                        smart_str(mods.version if mods.version else ''),
                        smart_str(mods.final_version.url if mods.final_version else ''),
                        smart_str(mods.final_version.doi if mods.final_version else ''),
                        smart_str(",".join(subjects)),
                        smart_str(",".join(funders)),
                        smart_str(mods.license.text if mods.license else ''),
                        smart_str(mods.copyright.text if mods.copyright else ''),
                        smart_str(mods.admin_note.text if mods.admin_note else ''),
                        smart_str(article.provenance.content.date_reviewed if article.provenance else ''),
                        smart_str(mods.rights_research_date if mods.rights_research_date else ''),
                        smart_str(article.pmcid if article.pmcid else ''),
                        smart_str(symp.pubs_id if symp else ''),
                        smart_str("Yes" if article.pdf.exists else 'No'),
                    ])
                except Exception as e:
                    self.output(0, "Error processing pid %s: %s" % (article.pid, e.message))
                    # self.counts['errors'] += 1

        csvfile.close()

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
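# A quick spot-check of the generated report, reading it back with the
# standard library; the file name matches the one the command writes above:
import csv

with open("publications_csv.csv", 'rb') as report:
    for row in csv.DictReader(report):
        print row["PID"], row["Title"], row["File Deposited"]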
def handle(self, *args, **options):
    self.options = options
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # counters
    self.counts = defaultdict(int)

    # duplicates list
    self.duplicates = {}

    # set the name of the duplication report
    self.reportsdirectory = settings.REPORTS_DIR
    self.reportname = "replaces-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

    # connection to repository
    self.repo = ManagementRepository()

    # get last run time and set new one
    time_zone = pytz.timezone('US/Eastern')
    last_run = LastRun.objects.get(name='Convert Symp to OE')
    date = last_run.start_time

    self.output(1, '%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
    date = time_zone.localize(date)
    date = date.astimezone(pytz.utc)
    date_str = date.strftime("%Y-%m-%dT%H:%M:%S")
    self.output(1, '%s UTC' % date_str)

    try:
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
        else:
            raise Exception("no pids specified")
    except Exception as e:
        raise Exception("Error getting pids: %s" % e.message)

    self.counts['total'] = len(pids)

    try:
        self.output(1, "\nProcessing %s" % pids[0])
        # load first as Article because that is the most likely type
        obj = self.repo.get_object(pid=pids[0])
        if not obj.exists:
            self.output(1, "Skipping because %s does not exist" % pids[0])
            raise Exception("%s does not exist" % pids[0])

        # choose content type
        content_types = {'Article': 'journal article'}
        ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')
        obj_types = ds.content.node.xpath('atom:category/@label',
                                          namespaces={'atom': 'http://www.w3.org/2005/Atom'})
        if obj_types[1] in content_types.values():
            logging.info("Processing %s as Publication" % pids[0])
            obj = self.repo.get_object(pid=pids[0], type=Publication)
        else:
            logging.info("Skipping %s because of invalid content type" % pids[0])
            raise Exception("invalid content type for %s" % pids[0])

        # get the pubs object
        # pubs_id = obj.sympAtom.content.serialize().split('<pubs:id>')[1].split('</pubs:id>')[0]
        pubs_id = "pubs:%s" % pids[1]
        self.output(1, "Pub ID: %s" % pubs_id)

        # ingest the new pubs_id object; the fragments below need the leading
        # spaces so the concatenated attributes stay separated
        foxml = ('<?xml version="1.0" encoding="UTF-8"?>'
                 '<foxml:digitalObject VERSION="1.1" PID="' + pubs_id + '"'
                 ' xmlns:foxml="info:fedora/fedora-system:def/foxml#"'
                 ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
                 ' xsi:schemaLocation="info:fedora/fedora-system:def/foxml# http://www.fedora.info/definitions/1/0/foxml1-1.xsd">'
                 '<foxml:objectProperties>'
                 '<foxml:property NAME="info:fedora/fedora-system:def/model#state" VALUE="Active"/>'
                 '</foxml:objectProperties>'
                 '</foxml:digitalObject>')
        self.repo.ingest(text=foxml)
        pubs_obj = self.repo.get_object(pid=pubs_id)
        pubs_obj.dc.content.identifier_list.append(pubs_id)
        original_obj = self.repo.get_object(pid=pids[0], type=Publication)
        # pubs_dc = '<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"><dc:identifier>' + pubs_id + '</dc:identifier></oai_dc:dc>'
        # pubs_obj.dc.content = pubs_dc

        # update pubs object to point hasCurrent and hasVisible attributes to the original pid
        sympns = Namespace('info:symplectic/symplectic-elements:def/model#')
        pubs_obj.rels_ext.content.bind('symp', sympns)
        has_current = (URIRef("info:fedora/" + pubs_obj.pid),
                       URIRef('info:symplectic/symplectic-elements:def/model#hasCurrent'),
                       URIRef("info:fedora/" + original_obj.pid))
        has_visible = (URIRef("info:fedora/" + pubs_obj.pid),
                       URIRef('info:symplectic/symplectic-elements:def/model#hasVisible'),
                       URIRef("info:fedora/" + original_obj.pid))
        # hasCurrent
        pubs_obj.rels_ext.content.set(has_current)
        # hasVisible
        pubs_obj.rels_ext.content.set(has_visible)
        # close pubs rels_ext object
        pubs_obj.rels_ext.content.close()

        # push the original publication and its relations back to Elements
        symp_pub, relations = original_obj.as_symp()
        self.process_article(original_obj.pid, symp_pub, options)
        self.process_relations(original_obj.pid, relations, options)

        # SAVE OBJECTS UNLESS NOACT OPTION
        if not options['noact']:
            original_obj.save()
            pubs_obj.save()
            self.counts['saved'] += 1
    except Exception as e:
        self.output(1, "Error processing %s: %s" % (pids[0], e.message))
        self.counts['errors'] += 1
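# For reference, the FOXML string assembled above serializes to a minimal
# stub object like this (the PID value is illustrative):
#
# <?xml version="1.0" encoding="UTF-8"?>
# <foxml:digitalObject VERSION="1.1" PID="pubs:12345"
#     xmlns:foxml="info:fedora/fedora-system:def/foxml#"
#     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#     xsi:schemaLocation="info:fedora/fedora-system:def/foxml# http://www.fedora.info/definitions/1/0/foxml1-1.xsd">
#   <foxml:objectProperties>
#     <foxml:property NAME="info:fedora/fedora-system:def/model#state" VALUE="Active"/>
#   </foxml:objectProperties>
# </foxml:digitalObject>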
def handle(self, *args, **options):
    self.options = options
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    year = date.today().year
    quarter = year_quarter(date.today().month)  # get the quarter: 1, 2, 3, 4

    # counters
    self.counts = defaultdict(int)

    # duplicates list (used by the interrupt report)
    self.duplicates = {}

    # set the name of the merge report
    self.reportsdirectory = settings.REPORTS_DIR
    self.reportname = "merge-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

    # connection to repository
    self.repo = ManagementRepository()

    try:
        # if pids specified, use that list
        if len(args) == 2:
            pids = list(args)
        else:
            raise Exception("specify two pids")
    except Exception as e:
        raise Exception("Error getting pids: %s" % e.message)

    self.counts['total'] = len(pids)

    for idx, pid in enumerate(pids):
        try:
            if idx == 0:
                self.output(1, "\nProcessing Elements PID %s" % pid)
                # load first as Article because that is the most likely type
                element_obj = self.repo.get_object(pid=pid, type=Publication)
                element_stats = ArticleStatistics.objects.filter(pid=pid)
                if element_stats:
                    element_stats.delete()
                if not element_obj.exists:
                    self.output(1, "Skipping because %s does not exist" % pid)
                    continue
            elif idx == 1:
                self.output(1, "\nProcessing Old PID %s" % pid)
                original_obj = self.repo.get_object(pid=pid, type=Publication)
                if not original_obj.exists:
                    self.output(1, "Skipping because %s does not exist" % pid)
                    continue
                original_stats = ArticleStatistics.objects.filter(pid=pid)
                if not original_stats:
                    # wrap in a list so the copy loop below can iterate
                    original_stats = [ArticleStatistics.objects.create(pid=pid, year=year, quarter=quarter)]
        except (KeyboardInterrupt, SystemExit):
            if self.counts['saved'] > 0:
                self.write_report(self.duplicates, error="interrupt")
            raise
        except Exception as e:
            self.output(1, "Error processing %s: %s" % (pid, e.message))
            self.output(1, element_obj.rels_ext.content.serialize(pretty=True))
            self.counts['errors'] += 1

    # copy content from the old object onto the Elements object
    element_obj.descMetadata.content = original_obj.descMetadata.content
    element_obj.provenance.content = original_obj.provenance.content
    element_obj.dc.content = original_obj.dc.content
    if original_obj.pdf.content:
        element_obj.pdf.content = original_obj.pdf.content
    original_obj.state = 'I'
    element_obj.provenance.content.init_object(element_obj.pid, 'pid')
    element_obj.provenance.content.merged(original_obj.pid, element_obj.pid)

    # move statistics to the Elements pid
    ArticleStatistics.objects.filter(pid=element_obj.pid).delete()
    for stat in original_stats:
        ArticleStatistics.objects.create(pid=element_obj.pid, year=stat.year, quarter=stat.quarter,
                                         num_downloads=stat.num_downloads, num_views=stat.num_views)

    coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])
    element_obj.collection = coll
    element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel,
                                      URIRef(Publication.ARTICLE_CONTENT_MODEL)))
    element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel,
                                      URIRef(Publication.PUBLICATION_CONTENT_MODEL)))

    # SAVE OBJECTS UNLESS NOACT OPTION
    if not options['noact']:
        element_obj.save()
        original_obj.save()
        self.counts['saved'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Converted: %s\n" % self.counts['saved'])
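# handle() above relies on a year_quarter() helper that maps a month to its
# calendar quarter (per the inline comment "get the quarter: 1, 2, 3, 4").
# A minimal sketch, assuming plain calendar quarters; the project's own helper
# may differ:
def year_quarter(month):
    '''Return 1-4 for the calendar quarter containing the given month.'''
    return (month - 1) // 3 + 1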
class Command(BaseCommand):
    '''Provides merge/ignore options for duplicate objects created by the
    Elements connector, for manual duplicate management. This alters the pubs
    object that the original and duplicate share.
    '''
    args = "[pid pid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n', action='store_true', default=False,
                    help='Reports the pid and total number of objects that would be processed but does not really do anything.'),
        make_option('--ignore', '-i', action='store_true', default=False,
                    help='Changes the pub object to disregard the duplicate pids.'),
        make_option('--merge', '-m', action='store_true', default=False,
                    help='Keeps the changes from the duplicate pids by copying ATOM-FEED to original.'),
    )

    def handle(self, *args, **options):
        self.options = options
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        year = date.today().year
        quarter = year_quarter(date.today().month)  # get the quarter: 1, 2, 3, 4

        # counters
        self.counts = defaultdict(int)

        # duplicates list (used by the interrupt report)
        self.duplicates = {}

        # set the name of the merge report
        self.reportsdirectory = settings.REPORTS_DIR
        self.reportname = "merge-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

        # connection to repository
        self.repo = ManagementRepository()

        try:
            # if pids specified, use that list
            if len(args) == 2:
                pids = list(args)
            else:
                raise Exception("specify two pids")
        except Exception as e:
            raise Exception("Error getting pids: %s" % e.message)

        self.counts['total'] = len(pids)

        for idx, pid in enumerate(pids):
            try:
                if idx == 0:
                    self.output(1, "\nProcessing Elements PID %s" % pid)
                    # load first as Article because that is the most likely type
                    element_obj = self.repo.get_object(pid=pid, type=Publication)
                    element_stats = ArticleStatistics.objects.filter(pid=pid)
                    if element_stats:
                        element_stats.delete()
                    if not element_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        continue
                elif idx == 1:
                    self.output(1, "\nProcessing Old PID %s" % pid)
                    original_obj = self.repo.get_object(pid=pid, type=Publication)
                    if not original_obj.exists:
                        self.output(1, "Skipping because %s does not exist" % pid)
                        continue
                    original_stats = ArticleStatistics.objects.filter(pid=pid)
                    if not original_stats:
                        # wrap in a list so the copy loop below can iterate
                        original_stats = [ArticleStatistics.objects.create(pid=pid, year=year, quarter=quarter)]
            except (KeyboardInterrupt, SystemExit):
                if self.counts['saved'] > 0:
                    self.write_report(self.duplicates, error="interrupt")
                raise
            except Exception as e:
                self.output(1, "Error processing %s: %s" % (pid, e.message))
                self.output(1, element_obj.rels_ext.content.serialize(pretty=True))
                self.counts['errors'] += 1

        # copy content from the old object onto the Elements object
        element_obj.descMetadata.content = original_obj.descMetadata.content
        element_obj.provenance.content = original_obj.provenance.content
        element_obj.dc.content = original_obj.dc.content
        if original_obj.pdf.content:
            element_obj.pdf.content = original_obj.pdf.content
        original_obj.state = 'I'
        element_obj.provenance.content.init_object(element_obj.pid, 'pid')
        element_obj.provenance.content.merged(original_obj.pid, element_obj.pid)

        # move statistics to the Elements pid
        ArticleStatistics.objects.filter(pid=element_obj.pid).delete()
        for stat in original_stats:
            ArticleStatistics.objects.create(pid=element_obj.pid, year=stat.year, quarter=stat.quarter,
                                             num_downloads=stat.num_downloads, num_views=stat.num_views)

        coll = self.repo.get_object(pid=settings.PID_ALIASES['oe-collection'])
        element_obj.collection = coll
        element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel,
                                          URIRef(Publication.ARTICLE_CONTENT_MODEL)))
        element_obj.rels_ext.content.add((element_obj.uriref, relsextns.hasModel,
                                          URIRef(Publication.PUBLICATION_CONTENT_MODEL)))

        # SAVE OBJECTS UNLESS NOACT OPTION
        if not options['noact']:
            element_obj.save()
            original_obj.save()
            self.counts['saved'] += 1

        # summarize what was done
        self.stdout.write("\n\n")
        self.stdout.write("Total number selected: %s\n" % self.counts['total'])
        self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
        self.stdout.write("Errors: %s\n" % self.counts['errors'])
        self.stdout.write("Converted: %s\n" % self.counts['saved'])

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
class Command(BaseCommand):
    '''This command runs through all the articles and makes sure that journal
    titles and publishers match against Sherpa RoMEO.
    '''
    args = "[netid netid ...]"
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n', action='store_true', default=False,
                    help='Reports what would be changed but does not save anything.'),
    )

    def handle(self, *args, **options):
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

        try:
            articles = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: %s" % e.message)

        # process all articles
        for p in articles.page_range:
            try:
                objs = articles.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page %s: %s" % (p, e.message))
                continue
            for article in objs:
                try:
                    if not article.exists:
                        self.output(0, "Skipping %s because pid does not exist" % article.pid)
                        continue
                    mods = article.descMetadata.content
                    if mods.journal is not None and mods.journal.title is not None:
                        try:
                            journals = romeo.search_journal_title(mods.journal.title, type='starts') if mods.journal.title else []
                            suggestions = [journal_suggestion_data(journal) for journal in journals]
                            if mods.journal.title.lower() in map(str.lower, JOURNAL_LIST):
                                mods.journal.title = suggestions[0]['value']
                                print "JOURNAL"
                                print mods.journal.title
                                article.save()
                            else:
                                continue
                        except:
                            suggestions = []
                        # if mods.journal.publisher is not None:
                        #     try:
                        #         publishers = romeo.search_publisher_name(mods.journal.publisher, versions='all')
                        #         suggestions = [publisher_suggestion_data(pub) for pub in publishers]
                        #         mods.journal.publisher = suggestions[0]['value']
                        #         print "PUBLISHER"
                        #         print mods.journal.publisher
                        #     except:
                        #         suggestions = []
                    else:
                        continue
                except Exception as e:
                    self.output(0, "Error processing pid %s: %s" % (article.pid, e.message))
                    # self.counts['errors'] += 1

    def output(self, v, msg):
        '''simple function to handle logging output based on verbosity'''
        if self.verbosity >= v:
            self.stdout.write("%s\n" % msg)
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # connection to repository
    self.repo = ManagementRepository()
    pid_set = self.repo.get_objects_with_cmodel(Publication.ARTICLE_CONTENT_MODEL, type=Publication)

    try:
        articles = Paginator(pid_set, 100)
    except Exception as e:
        self.output(0, "Error paginating items: %s" % e.message)

    # process all articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            # print error and go to next iteration of loop
            self.output(0, "Error getting page %s: %s" % (p, e.message))
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(0, "Skipping %s because pid does not exist" % article.pid)
                    continue
                mods = article.descMetadata.content
                if mods.journal is not None and mods.journal.title is not None:
                    try:
                        journals = romeo.search_journal_title(mods.journal.title, type='starts') if mods.journal.title else []
                        suggestions = [journal_suggestion_data(journal) for journal in journals]
                        if mods.journal.title.lower() in map(str.lower, JOURNAL_LIST):
                            mods.journal.title = suggestions[0]['value']
                            print "JOURNAL"
                            print mods.journal.title
                            article.save()
                        else:
                            continue
                    except:
                        suggestions = []
                    # if mods.journal.publisher is not None:
                    #     try:
                    #         publishers = romeo.search_publisher_name(mods.journal.publisher, versions='all')
                    #         suggestions = [publisher_suggestion_data(pub) for pub in publishers]
                    #         mods.journal.publisher = suggestions[0]['value']
                    #         print "PUBLISHER"
                    #         print mods.journal.publisher
                    #     except:
                    #         suggestions = []
                else:
                    continue
            except Exception as e:
                self.output(0, "Error processing pid %s: %s" % (article.pid, e.message))
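# journal_suggestion_data() above is consumed via suggestions[0]['value'],
# which is written back as the journal title. A minimal sketch consistent with
# that usage; the attribute name on the RoMEO journal object is an assumption,
# and the real helper likely carries more fields:
def journal_suggestion_data(journal):
    '''Shape a Sherpa RoMEO journal result into a title suggestion dict.'''
    return {'value': journal.title}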
def handle(self, *args, **options):
    self.options = options
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    # counters
    self.counts = defaultdict(int)

    # duplicates list
    self.duplicates = {}

    # error list
    self.errors = {}

    # set the name of the duplication report
    self.reportsdirectory = settings.REPORTS_DIR
    self.reportname = "duplicates-report-%s.txt" % strftime("%Y-%m-%dT%H-%M-%S")

    # connection to repository
    self.repo = ManagementRepository()

    # get last run time and set new one
    time_zone = pytz.timezone('US/Eastern')
    if not options['date']:
        last_run = LastRun.objects.get(name='Convert Symp to OE')
        date = last_run.start_time
    else:
        try:
            date = datetime.strptime(options['date'], '%Y-%m-%dT%H:%M:%S')
        except:
            raise CommandError("Could not parse date")

    if options['date'] and len(args) != 0:
        raise CommandError('Cannot use date option with list of pids')

    if (not options['date']) and (len(args) == 0) and (not options['noact']) and (not options['force']):
        last_run.start_time = datetime.now()
        last_run.save()

    logging.info('%s EST' % date.strftime("%Y-%m-%dT%H:%M:%S"))
    date = time_zone.localize(date)
    date = date.astimezone(pytz.utc)
    date_str = date.strftime("%Y-%m-%dT%H:%M:%S")
    logging.info('%s UTC' % date_str)

    try:
        # if pids specified, use that list
        if len(args) != 0:
            pids = list(args)
        else:
            query = """SELECT ?pid WHERE {
                ?pid <info:fedora/fedora-system:def/view#disseminates> ?ds.
                ?pid <info:fedora/fedora-system:def/model#createdDate> ?created.
                FILTER (
                    regex(str(?ds), 'SYMPLECTIC-ATOM') &&
                    ?created >= xsd:dateTime('%sZ')
                )
            }""" % date_str
            pids = [o['pid'] for o in self.repo.risearch.sparql_query(query)]
    except Exception as e:
        raise Exception("Error getting pids: %s" % e.message)

    self.counts['total'] = len(pids)

    for pid in pids:
        try:
            logging.info("Processing %s" % pid)
            # load first as Publication because that is the most likely type
            obj = self.repo.get_object(pid=pid)
            if not obj.exists:
                logging.warning("Skipping because %s does not exist" % pid)
                continue
            ds = obj.getDatastreamObject('SYMPLECTIC-ATOM')
            if not ds:
                logging.warning("Skipping %s because SYMPLECTIC-ATOM ds does not exist" % pid)
                continue
            ds_mod = ds.last_modified().strftime("%Y-%m-%dT%H:%M:%S")
            if date_str and ds_mod < date_str and (not options['force']):
                logging.warning("Skipping %s because SYMPLECTIC-ATOM ds not modified since last run %s " % (pid, ds_mod))
                self.counts['skipped'] += 1
                continue

            license = obj.getDatastreamObject('SYMPLECTIC-LICENCE')
            if not license.content:
                logging.warning("Skipping %s because SYMPLECTIC-LICENCE ds has no content" % pid)
                self.counts['skipped'] += 1
                payload = {"text": "No Assent Publication.\n pid: %s" % pid}
                requests.post(settings.SLACK_TOKEN, data=json.dumps(payload))
                continue

            # WHEN ADDING NEW CONTENT TYPES:
            # 1. Make sure the object content model has a from_symp() function
            # 2. Add to the content_types dict
            # 3. Add an elif block (see a few lines below)
            # 4. Add a line in the summary section of this script

            # choose content type
            content_types = {'Article': 'journal article', 'Book': 'book',
                             'Chapter': 'chapter', 'Conference': 'conference',
                             'Poster': 'poster', 'Report': 'report',
                             'Presentation': 'presentation'}
            obj_types = ds.content.node.xpath('atom:category/@label',
                                              namespaces={'atom': 'http://www.w3.org/2005/Atom'})
            if obj_types[1] in content_types.values():
                logging.info("Processing %s as Publication" % pid)
                obj = self.repo.get_object(pid=pid, type=Publication)
            else:
                logging.info("Skipping %s because of invalid content type" % pid)
                continue

            obj.from_symp()

            # get a list of predicates
            properties = []
            for p in list(obj.rels_ext.content.predicates()):
                properties.append(str(p))

            # track the best datastream to convert; initialized here so the
            # noact block below can safely test it in the duplicate branch too
            mime = None

            # skip if the rels-ext has the "replaces" tag, which indicates duplicates
            replaces_tag = "http://purl.org/dc/terms/replaces"
            if replaces_tag in properties:
                self.counts['duplicates'] += 1
                # get the pid of the original object this one replaces
                replaces_pid = obj.rels_ext.content.serialize().split('<dcterms:replaces rdf:resource="')[1].split('"')[0]
                # add to duplicate dict
                self.duplicates[pid.replace('info:fedora/', '')] = replaces_pid.replace('info:fedora/', '')
                if not obj.is_withdrawn:
                    try:
                        user = User.objects.get(username=u'oebot')
                    except ObjectDoesNotExist:
                        user = User.objects.get_or_create(username=u'bob', password=u'bobspassword')[0]
                        user.first_name = "Import"
                        user.last_name = "Process"
                        user.save()
                    reason = "Duplicate."
                    self.counts['withdrawn'] += 1
                    obj.provenance.content.init_object(obj.pid, 'pid')
                    obj.provenance.content.withdrawn(user, reason)
                    obj.state = 'I'
                    logging.info("Withdrew duplicate pid: %s" % obj.pid)
            else:
                self.counts['pdf'] += 1
                # convert the attached PDF file to be used with OE
                # filter datastreams for the mime types allowed by genre
                mime_ds_list = None
                genre = obj.descMetadata.content.genre
                print genre
                if genre in ("Article", "Book", "Chapter"):
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_types.values()]
                elif genre == "Conference":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_conference.values()]
                elif genre == "Report":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_report.values()]
                elif genre == "Poster":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_poster.values()]
                elif genre == "Presentation":
                    mime_ds_list = [i for i in obj.ds_list if obj.ds_list[i].mimeType in obj.allowed_mime_presentation.values()]
                else:
                    logging.info("Skipping because mime type is not allowed")
                    continue
                if mime_ds_list:
                    # sort by datastream timestamp and keep the most recent
                    new_dict = {}
                    for mime in mime_ds_list:
                        new_dict[mime] = obj.getDatastreamObject(mime).last_modified()
                    sorted_mimes = sorted(new_dict.items(), key=lambda x: x[1])
                    mime = sorted_mimes[-1][0]  # most recent

            if not options['noact']:
                obj.save()
                # obj.index_data()
                if mime:
                    mime_type = obj.ds_list[mime].mimeType
                    print mime_type
                    self.repo.api.addDatastream(pid=obj.pid, dsID='content',
                                                dsLabel='%s' % mime_type,
                                                mimeType=mime_type,
                                                logMessage='added %s content from %s' % (mime_type, mime),
                                                controlGroup='M',
                                                versionable=True,
                                                content=obj.getDatastreamObject(mime).content)
                    logging.info("Converting %s to %s Content" % (mime, mime_type))
                    self.counts[mime_type] += 1
                self.counts['Publication'] += 1

        except (KeyboardInterrupt, SystemExit):
            if self.counts['duplicates'] > 0:
                self.write_dup_report(self.duplicates, error="interrupt")
            raise
        except Exception as e:
            logging.error("Error processing %s: %s" % (pid, e.message))
            logging.error(obj.rels_ext.content.serialize(pretty=True))
            self.counts['errors'] += 1
            self.errors[pid] = e.message

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Duplicates: %s\n" % self.counts['duplicates'])
    self.stdout.write("Withdrew: %s\n" % self.counts['withdrawn'])
    self.stdout.write("PDFs converted: %s\n" % self.counts['pdf'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Publications converted: %s\n" % self.counts['Publication'])
    if self.counts['duplicates'] > 0 or self.counts['errors'] > 0:
        self.write_dup_report(self.duplicates, self.errors)
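# The --date option above must parse with the '%Y-%m-%dT%H:%M:%S' format that
# handle() passes to datetime.strptime; the value below is illustrative:
from datetime import datetime

datetime.strptime('2015-06-01T00:00:00', '%Y-%m-%dT%H:%M:%S')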
class Command(BaseCommand):
    '''This command runs through all articles or books and adds the generic
    Publication content model to any object that lacks it.
    '''
    args = ""
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--article', '-a', action='store_true', default=False,
                    help='Cleans up content models for articles.'),
        make_option('--book', '-b', action='store_true', default=False,
                    help='Cleans up content models for books.'),
        make_option('--force', '-f', action='store_true', default=False,
                    help='Updates even if SYMPLECTIC-ATOM has not been modified since last run.'),
    )

    def handle(self, *args, **options):
        self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
        self.v_normal = 1

        if options['article'] and options['book']:
            raise CommandError('Cannot use both parameters')
        if not options['article'] and not options['book']:
            raise CommandError('Use at least one parameter')
        if options['article']:
            cmodel = Publication.ARTICLE_CONTENT_MODEL
        if options['book']:
            cmodel = Publication.BOOK_CONTENT_MODEL

        # connection to repository
        self.repo = ManagementRepository()
        pid_set = self.repo.get_objects_with_cmodel(cmodel, type=Publication)

        try:
            publications = Paginator(pid_set, 100)
        except Exception as e:
            self.output(0, "Error paginating items: %s" % e.message)

        # process all publications
        for p in publications.page_range:
            try:
                objs = publications.page(p).object_list
            except Exception as e:
                # print error and go to next iteration of loop
                self.output(0, "Error getting page %s: %s" % (p, e.message))
                continue
            for publication in objs:
                try:
                    if not publication.exists:
                        self.output(0, "Skipping %s because pid does not exist" % publication.pid)
                        continue
                    if not publication.has_model(Publication.PUBLICATION_CONTENT_MODEL):
                        publication.add_relationship(relsextns.hasModel, Publication.PUBLICATION_CONTENT_MODEL)
                        publication.save()
                except Exception as e:
                    self.output(0, "Error processing pid %s: %s" % (publication.pid, e.message))
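# Example invocations (the manage.py command name is illustrative; the checks
# in handle() require exactly one of --article/--book):
#
#   python manage.py cleanup_cmodels --article
#   python manage.py cleanup_cmodels --book --force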