for restriction in xml.findall( './postprints/postrestrictions/postrestriction'): addRestriction(restriction, 'postprint', publisher) for restriction in xml.findall( './pdfversion/pdfrestrictions/pdfrestriction'): addRestriction(restriction, 'pdfversion', publisher) for condition in xml.findall('./conditions/condition'): if condition.text: c = PublisherCondition(publisher=publisher, text=condition.text.strip()) c.save() # Update the publisher status publisher.oa_status = publisher.classify_oa_status() publisher.save(update_fields=['oa_status']) for link in xml.findall('./copyrightlinks/copyrightlink'): text = None url = None texts = link.findall('./copyrightlinktext') if texts: text = nstrip(texts[0].text) urls = link.findall('./copyrightlinkurl') if urls: url = nstrip(urls[0].text) if url and text: cplink = PublisherCopyrightLink(text=text, url=url, publisher=publisher)
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.

    If the data from RoMEO is more fresh than what we have
    in cache, we update our model: the publisher's fields are
    overwritten and its restrictions, conditions and copyright
    links are deleted and rebuilt from the XML.

    :param romeo_xml_description: the <publisher> XML element
        returned by RoMEO
    :returns: the saved Publisher instance (the cached one, untouched,
        when it is at least as recent as the RoMEO data)
    :raises MetadataSourceException: if RoMEO omits the publisher id,
        the publisher name, or (when a refresh is needed) any of the
        preprint / postprint / pdf archiving policies
    """
    xml = romeo_xml_description

    def required_text(path, error_message):
        # Stripped text of the first node at `path`; a missing node or a
        # node without text is reported as a RoMEO metadata error.
        node = xml.find(path)
        if node is None or node.text is None:
            raise MetadataSourceException(error_message)
        return node.text.strip()

    def optional_text(path):
        # nstrip()-ed text of the first node at `path`, or None when the
        # node is absent.
        node = xml.find(path)
        if node is None:
            return None
        return nstrip(node.text)

    romeo_id = xml.attrib.get('id')
    if romeo_id is None:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')
    romeo_parent_id = xml.attrib.get('parentid')

    missing_name = "RoMEO did not provide the publisher's name."
    raw_name = required_text('./name', missing_name)
    try:
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(missing_name)

    alias = optional_text('./alias')
    if alias:
        try:
            alias = fromstring(kill_html(sanitize_html(alias))).text
        except (KeyError, IndexError):
            # Best effort: keep the stripped raw alias if cleaning fails.
            pass

    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (as publishers imported from doaj
    # all get the same id, so we have to use the name too).
    if re.match(r'\d+\Z', romeo_id):
        # Purely numeric ids are unambiguous. The pattern is anchored at
        # the end so that an id merely *starting* with digits is not
        # mistaken for a numeric one.
        matches = Publisher.objects.filter(romeo_id=romeo_id)
    elif alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)

    existing = matches[0] if matches else None

    # The cached record is reused only when both dates are known and the
    # cache is at least as recent. An unknown date on either side means
    # freshness cannot be established, so the record is refreshed.
    # NOTE(review): assumes _get_romeo_date may return None when RoMEO
    # gives no usable date -- confirm; comparing a datetime with None
    # would raise TypeError.
    if (existing is not None
            and existing.last_updated is not None
            and last_update is not None
            and existing.last_updated >= last_update):
        return existing

    # Otherwise, create it (or refresh the stale record)
    url = optional_text('./homeurl')
    preprint = required_text(
        './preprints/prearchiving',
        'RoMEO did not provide the preprint policy.')
    postprint = required_text(
        './postprints/postarchiving',
        'RoMEO did not provide the postprint policy.')
    pdfversion = required_text(
        './pdfversion/pdfarchiving',
        'RoMEO did not provide the pdf archiving policy.')

    publisher = existing if existing is not None else Publisher()
    publisher.name = name
    publisher.alias = alias
    publisher.url = url
    publisher.preprint = preprint
    publisher.postprint = postprint
    publisher.pdfversion = pdfversion
    publisher.romeo_id = romeo_id
    publisher.romeo_parent_id = romeo_parent_id
    # Placeholder status: the real one is computed below, once the
    # restrictions and conditions have been attached.
    publisher.oa_status = 'UNK'
    publisher.last_updated = last_update
    publisher.save()

    if existing is not None:
        # Drop the details of the previous version before re-importing.
        publisher.publishercopyrightlink_set.all().delete()
        publisher.publisherrestrictiondetail_set.all().delete()
        publisher.publishercondition_set.all().delete()

    # Add the conditions, restrictions, and copyright
    restriction_paths = (
        ('preprint', './preprints/prerestrictions/prerestriction'),
        ('postprint', './postprints/postrestrictions/postrestriction'),
        ('pdfversion', './pdfversion/pdfrestrictions/pdfrestriction'),
    )
    for applies_to, path in restriction_paths:
        for restriction in xml.findall(path):
            self.add_restriction(restriction, applies_to, publisher)

    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            PublisherCondition(
                publisher=publisher, text=condition.text.strip()).save()

    # Update the publisher status
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])

    # TODO: if the OA status has changed, then we should update the
    # journals and papers accordingly with the adequate task

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text_node = link.find('./copyrightlinktext')
        url_node = link.find('./copyrightlinkurl')
        link_text = nstrip(text_node.text) if text_node is not None else None
        link_url = nstrip(url_node.text) if url_node is not None else None
        # Links lacking either the label or the target are skipped.
        if link_url and link_text:
            # The URL is cut to 1024 characters (presumably the column's
            # max length -- confirm against the model).
            PublisherCopyrightLink(
                text=link_text, url=link_url[:1024],
                publisher=publisher).save()

    return publisher