def fetch(self, orcid_id, instance=settings.ORCID_BASE_DOMAIN):
    """
    Fetches the profile by id using the public API.

    :param orcid_id: the ORCID identifier to fetch
    :param instance: the domain name of the instance to use
        (orcid.org or sandbox.orcid.org)
    """
    if instance not in ['orcid.org', 'sandbox.orcid.org']:
        raise ValueError('Unexpected instance')
    try:
        headers = {'Accept': 'application/orcid+json'}
        profile_req = requests.get('http://pub.%s/v1.2/%s/orcid-profile' %
                                   (instance, orcid_id), headers=headers)
        parsed = profile_req.json()
        if parsed.get('orcid-profile') is None:
            # TEMPORARY: also check from the sandbox
            if instance == 'orcid.org':
                return self.fetch(orcid_id, instance='sandbox.orcid.org')
            raise ValueError
        self.json = parsed
    except (requests.exceptions.HTTPError, ValueError):
        raise MetadataSourceException('The ORCiD %s could not be found'
                                      % orcid_id)
    except TypeError:
        raise MetadataSourceException(
            'The ORCiD %s returned invalid JSON.' % orcid_id)
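# For reference, the raw call that fetch() performs, shown as a standalone
# sketch (the ORCID iD below is a made-up example, not from the original
# source):
r = requests.get('http://pub.orcid.org/v1.2/0000-0002-1825-0097/orcid-profile',
                 headers={'Accept': 'application/orcid+json'})
print(r.json().get('orcid-profile') is not None)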
def urlopen_retry(url, **kwargs):  # data, timeout, retries, delay, backoff
    data = kwargs.get('data', None)
    timeout = kwargs.get('timeout', 10)
    retries = kwargs.get('retries', 3)
    delay = kwargs.get('delay', 5)
    backoff = kwargs.get('backoff', 2)
    headers = kwargs.get('headers', {})
    try:
        r = requests.get(url,
                         params=data,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=True)
        return r.text
    except requests.exceptions.Timeout as e:
        if retries <= 0:
            raise MetadataSourceException('Timeout: ' + str(e))
    except requests.exceptions.ConnectionError as e:
        if retries <= 0:
            raise MetadataSourceException('Connection error: ' + str(e))
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException('Request error: ' + str(e))

    print("Retrying in " + str(delay) + " seconds...")
    print("URL: " + url)
    sleep(delay)
    return urlopen_retry(url,
                         data=data,
                         timeout=timeout,
                         retries=retries - 1,
                         delay=delay * backoff,
                         backoff=backoff)
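# Illustrative usage sketch (the URL is an example, not from the original
# source): fetch a page with the defaults, i.e. up to 3 retries separated
# by delays of 5, 10 and 20 seconds.
html = urlopen_retry('https://example.org/',
                     timeout=10, retries=3, delay=5, backoff=2)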
def perform_romeo_query(self, search_terms):
    search_terms = search_terms.copy()
    if self.api_key:
        search_terms['ak'] = self.api_key

    # Perform the query
    try:
        req = requests.get(self.base_url, params=search_terms, timeout=20)
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException('Error while querying RoMEO.\n' +
                                      'URL was: ' + self.base_url + '\n' +
                                      'Parameters were: ' + str(search_terms) + '\n' +
                                      'Error is: ' + str(e))

    # Parse it
    try:
        parser = ET.XMLParser(encoding='ISO-8859-1')
        root = ET.parse(BytesIO(req.content), parser)
    except ET.ParseError as e:
        raise MetadataSourceException('RoMEO returned an invalid XML response.\n' +
                                      'URL was: ' + self.base_url + '\n' +
                                      'Parameters were: ' + str(search_terms) + '\n' +
                                      'Error is: ' + str(e))

    return root
def perform_romeo_query(search_terms):
    search_terms = search_terms.copy()
    if ROMEO_API_KEY:
        search_terms['ak'] = ROMEO_API_KEY
    base_url = 'http://' + ROMEO_API_DOMAIN + '/romeo/api29.php'

    # Perform the query
    try:
        response = urlopen_retry(base_url, data=search_terms).encode('utf-8')
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException('Error while querying RoMEO.\n' +
                                      'URL was: ' + base_url + '\n' +
                                      'Parameters were: ' + str(search_terms) + '\n' +
                                      'Error is: ' + str(e))

    # Parse it
    try:
        parser = ET.XMLParser(encoding='utf-8')
        root = ET.parse(BytesIO(response), parser)
    except ET.ParseError as e:
        # The response is bytes, so dump it in binary mode for debugging.
        with open('/tmp/romeo_response.xml', 'wb') as f:
            f.write(response)
            f.write(b'\n')
        raise MetadataSourceException(
            'RoMEO returned an invalid XML response, dumped at /tmp/romeo_response.xml\n' +
            'URL was: ' + base_url + '\n' +
            'Parameters were: ' + str(search_terms) + '\n' +
            'Error is: ' + str(e))

    return root
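# Illustrative usage sketch (the ISSN is a made-up example): the RoMEO API
# takes its search terms as GET parameters and answers with an XML tree.
root = perform_romeo_query({'issn': '1234-5678'})
for journal in root.findall('./journals/journal'):
    print(journal.findtext('./jtitle'))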
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException(
            'RoMEO did not provide a publisher id.\n' +
            'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.\n' +
            'URL was: ' + request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass
def fetch_dois_by_batch(doi_list):
    """
    Fetch a list of DOIs by batch (useful when refreshing the list of
    publications of a given researcher, as the records have most likely
    been already cached before by the proxy)
    """
    def results_list_to_dict(results):
        dct = {}
        for item in results:
            if item and 'DOI' in item:
                dct[item['DOI']] = item
        return dct

    if len(doi_list) == 0:
        return []
    elif len(doi_list) > nb_dois_per_batch:
        first_dois = fetch_dois_by_batch(doi_list[:nb_dois_per_batch])
        last_dois = fetch_dois_by_batch(doi_list[nb_dois_per_batch:])
        return first_dois + last_dois

    # Given how we are joining the DOIs, they cannot contain commas
    params = {
        'filter': ','.join(['doi:' + doi for doi in doi_list if ',' not in doi]),
        'mailto': settings.CROSSREF_MAILTO
    }

    req = None
    try:
        # First we fetch dois by batch from CrossRef. That's fast, but only
        # works for CrossRef DOIs
        req = make_crossref_call('/works', params=params)
        req.raise_for_status()
        results = req.json()['message'].get('items', [])
        dct = results_list_to_dict(results)

        # Some DOIs might not be in the results list, because they are
        # issued by other organizations. We fetch them using our proxy
        # (cached content negotiation)
        missing_dois = list(set(doi_list) - set(dct.keys()))
        if missing_dois:
            req = requests.post(
                'https://' + settings.DOI_PROXY_DOMAIN + '/batch',
                {'dois': json.dumps(missing_dois)})
            req.raise_for_status()
            missing_dois_dct = results_list_to_dict(req.json())
            dct.update(missing_dois_dct)

        result = [dct.get(doi) for doi in doi_list]
        return result
    except RequestException as e:
        raise MetadataSourceException('Connecting to the DOI proxy at ' +
                                      req.url + ' failed: ' + str(e))
    except ValueError as e:
        raise MetadataSourceException(
            'Invalid JSON returned by the DOI proxy: ' + str(e))
    except KeyError:
        return []
    except requests.exceptions.RequestException as e:
        raise MetadataSourceException(
            'Failed to retrieve batch metadata from the proxy: ' + str(e))
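# Illustrative usage sketch (both DOIs are made-up examples): results come
# back in the same order as the input list, with None for any DOI that
# could not be resolved by CrossRef or the proxy.
dois = ['10.1000/182', '10.5555/12345678']
records = fetch_dois_by_batch(dois)
for doi, record in zip(dois, records):
    print(doi, record is not None)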
def search_for_dois_incrementally(
        self, query, filters={}, max_batches=max_crossref_batches_per_researcher):
    """
    Searches for DOIs for the given query and yields their metadata as it
    finds them.

    :param query: the search query to pass to CrossRef
    :param filters: filters as specified by the REST API
    :param max_batches: maximum number of queries to send to CrossRef
    """
    params = {}
    if query:
        params['query'] = query
    if filters:
        params['filter'] = ','.join(k + ":" + v for k, v in filters.items())

    count = 0
    rows = 20
    offset = 0
    while not max_batches or count < max_batches:
        url = 'http://api.crossref.org/works'
        params['rows'] = rows
        params['offset'] = offset
        try:
            r = requests.get(url, params=params)
            print("CROSSREF: " + r.url)
            js = r.json()
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'URL was: %s\nParameters were: %s\nJSON parser error was: %s' %
                (url, urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nUnable to open the URL: ' +
                url + '\nError was: ' + str(e))
        offset += rows
        count += 1
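# Illustrative usage sketch (`source` stands for an instance of the class
# defining the method above; the query and filter values are examples):
for item in source.search_for_dois_incrementally(
        'quantum', filters={'type': 'journal-article'}, max_batches=2):
    print(item.get('DOI'))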
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary containing at least one of the
    fields 'issn' or 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or
    # in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'URL was: ' + request)
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass
def fetch_all_records(self, filters=None, cursor="*"):
    """
    Fetches all Crossref records from their API, starting at a given date.

    :param filters: filters as specified by the REST API (as a dictionary)
    :param cursor: the initial cursor where to start the fetching
        (useful to resume failed ingestions)
    """
    if filters is None:
        filters = {}
    params = {}
    if filters:
        params['filter'] = ','.join(k + ":" + v
                                    for k, v in list(filters.items()))

    rows = 100
    next_cursor = cursor
    while next_cursor:
        params['rows'] = rows
        params['cursor'] = next_cursor
        params['mailto'] = settings.CROSSREF_MAILTO

        try:
            r = make_crossref_call('/works', params=params)
            r.raise_for_status()
            js = r.json()
            if js['status'] == 'failed':
                raise MetadataSourceException(
                    'Querying Crossref with {} failed.'.format(r.url))
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
            next_cursor = jpath('message/next-cursor', js)
            logger.info(
                "Next cursor: {}".format(next_cursor))  # to ease recovery
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'Parameters were: %s\nJSON parser error was: %s' %
                (urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nError was: ' + str(e))
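# Illustrative usage sketch (`api` stands for an instance of the class
# defining the method above; the filter value is an example): resume a
# failed ingestion from a previously logged cursor.
for record in api.fetch_all_records(
        filters={'from-update-date': '2019-01-01'},
        cursor='*'):
    print(record.get('DOI'))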
def request_retry(url, **kwargs):
    """
    Retries a request, with throttling and exponential back-off.

    :param url: the URL to fetch
    :param data: the GET parameters
    :param headers: the HTTP headers
    :param timeout: the number of seconds to wait before declaring that an
        individual request timed out (default 10)
    :param retries: the number of times to retry a query (default 3)
    :param delay: the minimum delay between requests (default 5)
    :param backoff: the multiple used when raising the delay after an
        unsuccessful query (default 2)
    """
    data = kwargs.get('data', None)
    timeout = kwargs.get('timeout', 10)
    retries = kwargs.get('retries', 3)
    delay = kwargs.get('delay', 5)
    backoff = kwargs.get('backoff', 2)
    headers = kwargs.get('headers', {})
    try:
        r = requests.get(url,
                         params=data,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=True)
        r.raise_for_status()
        return r
    except requests.exceptions.Timeout as e:
        if retries <= 0:
            raise MetadataSourceException('Timeout: ' + str(e))
    except requests.exceptions.ConnectionError as e:
        if retries <= 0:
            raise MetadataSourceException('Connection error: ' + str(e))
    except requests.exceptions.RequestException as e:
        if retries <= 0:
            raise MetadataSourceException('Request error: ' + str(e))

    logger.info("Retrying in " + str(delay) + " seconds with url " + url)
    sleep(delay)
    return request_retry(url,
                         data=data,
                         timeout=timeout,
                         retries=retries - 1,
                         delay=delay * backoff,
                         backoff=backoff)
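# Illustrative usage sketch (the URL is an example): unlike urlopen_retry,
# request_retry returns the full Response and raises on HTTP error codes,
# so the caller can parse JSON or inspect headers itself.
response = request_retry('https://api.crossref.org/works',
                         data={'rows': 1})
print(response.status_code)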
def fetch(self):
    """
    Fetches the profile by id using the public API.
    This only fetches the summaries, subsequent requests will be made for
    works.
    """
    try:
        parsed = self.request_element('')
        if parsed.get('orcid-identifier') is None:
            # TEMPORARY: also check from the sandbox
            if self.instance == 'orcid.org':
                self.instance = 'sandbox.orcid.org'
                return self.fetch()
            raise ValueError
        self.json = parsed
    except (requests.exceptions.HTTPError, ValueError):
        raise MetadataSourceException(
            'The ORCiD {id} could not be found'.format(id=self.id))
    except TypeError:
        raise MetadataSourceException(
            'The ORCiD {id} returned invalid JSON.'.format(id=self.id))
def fetch_zotero_by_DOI(doi):
    """
    Fetch Zotero metadata for a given DOI.
    Works only with the doi_cache proxy.
    """
    try:
        request = requests.get('http://' + DOI_PROXY_DOMAIN + '/zotero/' + doi)
        return request.json()
    except ValueError as e:
        raise MetadataSourceException(
            'Error while fetching Zotero metadata:\nInvalid JSON response.\n' +
            'Error: ' + str(e))
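# Illustrative usage sketch (the DOI is a made-up example): returns the
# Zotero-style metadata dictionary served by the doi_cache proxy.
metadata = fetch_zotero_by_DOI('10.1000/182')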
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Creates (or returns an existing) researcher from its ORCID id.

    :param profile: an :class:`OrcidProfile` object if it has already been
        fetched from the API (otherwise we will fetch it ourselves)
    :param user: a user to associate with the profile.
    :returns: a :class:`Researcher` if everything went well,
        raises MetadataSourceException otherwise
    """
    researcher = None
    if orcid is None:
        raise MetadataSourceException('Invalid ORCID id')
    try:
        researcher = Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        if profile is None:
            profile = OrcidProfile(id=orcid)
        else:
            profile = OrcidProfile(json=profile)
        name = profile.name
        homepage = profile.homepage
        email = profile.email
        researcher = Researcher.create_by_name(
            name[0], name[1],
            orcid=orcid,
            user=user,
            homepage=homepage,
            email=email)

        # Ensure that extra info is added.
        save = False
        for kw, val in [('homepage', homepage),
                        ('orcid', orcid),
                        ('email', email)]:
            if not researcher.__dict__[kw] and val:
                researcher.__dict__[kw] = val
                save = True
        if save:
            researcher.save()

        for variant in profile.other_names:
            # Use the self-similarity of the variant as its confidence score.
            confidence = name_similarity(variant, variant)
            name = Name.lookup_name(variant)
            researcher.add_name_variant(name, confidence)

    return researcher
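# Illustrative usage sketch (the ORCID iD is a made-up example): either
# returns the existing Researcher or fetches the profile and creates one.
researcher = Researcher.get_or_create_by_orcid('0000-0002-1825-0097')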
def fetch_metadata_by_DOI(doi):
    """
    Fetch the metadata for a single DOI.
    This is supported by the standard proxy, doi.org,
    as well as more advanced proxies such as doi_cache
    """
    if doi is None:
        return
    addheaders = {'Accept': 'application/citeproc+json'}
    try:
        request = 'http://' + DOI_PROXY_DOMAIN + '/' + doi
        response = urlopen_retry(request,
                                 timeout=crossref_timeout,
                                 headers=addheaders,
                                 retries=0)
        parsed = json.loads(response)
        return parsed
    except ValueError as e:
        raise MetadataSourceException(
            'Error while fetching DOI metadata:\nInvalid JSON response.\n' +
            'Error: ' + str(e))
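# Illustrative usage sketch (the DOI is a made-up example): the proxy
# answers with citeproc JSON, so the usual CSL fields are available.
record = fetch_metadata_by_DOI('10.1000/182')
if record is not None:
    print(record.get('title'), record.get('DOI'))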
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given
    ORCiD. It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some
        of them could be in free form, hence not imported)
    """
    cr_api = CrossRefAPI()

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(orcid_id=orcid_id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print(e)
        return

    # As we have fetched the profile, let's update the Researcher
    self.researcher = Researcher.get_or_create_by_orcid(
        orcid_identifier, profile.json, update=True)
    if not self.researcher:
        return

    # Reference name
    ref_name = profile.name
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Get summary publications and separate them in two classes:
    # - the ones with DOIs, that we will fetch with CrossRef
    dois_and_putcodes = []  # list of (DOI, put-code) to fetch
    # - the ones without: we will fetch ORCID's metadata about them
    #   and try to create a paper with what they provide
    put_codes = []
    for summary in profile.work_summaries:
        if summary.doi and use_doi:
            dois_and_putcodes.append((summary.doi, summary.put_code))
        else:
            put_codes.append(summary.put_code)

    # 1st attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers with DOIs found in our ORCiD profile.
        dois = [doi for doi, put_code in dois_and_putcodes]
        for idx, (success, paper_or_metadata) in enumerate(
                self.fetch_metadata_from_dois(cr_api, ref_name, orcid_id, dois)):
            if success:
                yield paper_or_metadata
            else:
                put_codes.append(dois_and_putcodes[idx][1])

    # 2nd attempt with ORCID's own crappy metadata
    works = profile.fetch_works(put_codes)
    for work in works:
        if not work:
            continue

        # If the paper is skipped due to invalid metadata.
        # We first try to reconcile it with local researcher author name.
        # Then, we consider it missed.
        if work.skipped:
            print(work.json)
            print(work.skip_reason)
            print('work skipped due to incorrect metadata (%s)' %
                  (work.skip_reason))
            ignored_papers.append(work.as_dict())
            continue

        yield self.create_paper(work)

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given
    ORCiD. It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some
        of them could be in free form, hence not imported)
    """
    crps = CrossRefPaperSource(self.ccf)

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(id=orcid_id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print(e)
        return

    # Reference name
    ref_name = profile.name
    # curl -H "Accept: application/orcid+json"
    #   'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Fetch publications (1st attempt with ORCiD data)
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:
        data_paper = ORCIDDataPaper.from_orcid_metadata(
            ref_name,
            orcid_id,
            pub,
            stop_if_dois_exists=use_doi
        )

        if data_paper.dois and use_doi:
            # We want to batch it rather than manually do it.
            dois.extend(data_paper.dois)
            continue

        # If the paper is skipped due to invalid metadata.
        # We first try to reconcile it with local researcher author name.
        # Then, we consider it missed.
        if data_paper.skipped:
            print('%s is skipped due to incorrect metadata (%s)' %
                  (data_paper, data_paper.skip_reason))
            print('Trying to reconcile it with local researcher.')
            data_paper = self.reconcile_paper(
                ref_name,
                orcid_id,
                pub,
                overrides={
                    'authors': [(self.researcher.name.first,
                                 self.researcher.name.last)]
                }
            )
            if data_paper.skipped:
                ignored_papers.append(data_paper.as_dict())
                continue

        yield self.create_paper(data_paper)

    # 2nd attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers from CrossRef
        for success, paper_or_metadata in self.fetch_crossref_incrementally(
                crps, orcid_id):
            if success:
                yield paper_or_metadata
            else:
                ignored_papers.append(paper_or_metadata)
                print('This metadata (%s) yields no paper.' %
                      (paper_or_metadata))

        # Let's grab papers with DOIs found in our ORCiD profile.
        # FIXME(RaitoBezarius): if we fail here, we should get back the pub
        # and yield it.
        for success, paper_or_metadata in self.fetch_metadata_from_dois(
                crps, ref_name, orcid_id, dois):
            if success:
                yield paper_or_metadata
            else:
                ignored_papers.append(paper_or_metadata)
                print('This metadata (%s) yields no paper.' %
                      (paper_or_metadata))

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
def fetch_orcid_records(self, id, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given
    ORCiD. It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some
        of them could be in free form, hence not imported)
    """
    crps = CrossRefPaperSource(self.ccf)

    # Cleanup iD:
    id = validate_orcid(id)
    if id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(id=id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print(e)
        return

    # Reference name
    ref_name = profile.name
    # curl -H "Accept: application/orcid+json"
    #   'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    papers = []  # list of papers created
    records_found = 0  # how many records did we successfully import from
                       # the profile?

    # Fetch publications
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:

        def j(path, default=None):
            return jpath(path, pub, default)

        # DOI
        doi = None
        for extid in j('work-external-identifiers/work-external-identifier', []):
            if extid.get('work-external-identifier-type') == 'DOI':
                doi = to_doi(jpath('work-external-identifier-id/value', extid))
        if doi:
            # If a DOI is available, create the paper using metadata from
            # CrossRef. We don't do it yet, we only store the DOI, so that
            # we can fetch them by batch later.
            dois.append(doi)
        if doi and use_doi:
            continue

        # Extract information from ORCiD

        # Title
        title = j('work-title/title/value')
        if title is None:
            print("Warning: Skipping ORCID publication: no title")
            continue

        # Type
        doctype = orcid_to_doctype(j('work-type', 'other'))

        # Contributors (ignored for now as they are very often not present)
        def get_contrib(js):
            return {
                'orcid': jpath('contributor-orcid', js),
                'name': jpath('credit-name/value', js),
            }

        contributors = [get_contrib(c)
                        for c in j('work-contributors/contributor', [])]
        author_names = [c['name'] for c in contributors
                        if c['name'] is not None]
        authors = [parse_comma_name(name) for name in author_names]
        pubdate = None
        # ORCiD internal id
        identifier = j('put-code')
        affiliations = [c['orcid'] for c in contributors]

        # Pubdate
        year = parse_int(j('publication-date/year/value'), 1970)
        month = parse_int(j('publication-date/month/value'), 1)
        day = parse_int(j('publication-date/day/value'), 1)
        pubdate = None
        try:
            pubdate = date(year=year, month=1, day=1)
            pubdate = date(year=year, month=month, day=1)
            pubdate = date(year=year, month=month, day=day)
        except ValueError:
            if pubdate is None:
                print("Invalid publication date in ORCID publication, skipping")
                continue

        # Citation type: metadata format
        citation_format = j('work-citation/work-citation-type')
        print(citation_format)
        bibtex = j('work-citation/citation')

        if bibtex is not None:
            try:
                entry = parse_bibtex(bibtex)

                if entry.get('author', []) == []:
                    print("Warning: Skipping ORCID publication: no authors.")
                    print(j('work-citation/citation'))
                if not authors:
                    authors = entry['author']
            except ValueError:
                pass

        affiliations = affiliate_author_with_orcid(
            ref_name, id, authors, initial_affiliations=affiliations)

        authors = [name_lookup_cache.lookup(author) for author in authors]

        if not authors:
            print("No authors found, skipping")
            continue

        # Create paper:
        paper = BarePaper.create(title, authors, pubdate,
                                 'VISIBLE', affiliations)
        record = BareOaiRecord(source=orcid_oai_source,
                               identifier=identifier,
                               splash_url='http://orcid.org/' + id,
                               pubtype=doctype)
        paper.add_oairecord(record)
        yield paper

    if use_doi:
        for metadata in crps.search_for_dois_incrementally(
                '', {'orcid': id}):
            try:
                paper = crps.save_doi_metadata(metadata)
                if paper:
                    yield paper
            except ValueError as e:
                print("Saving CrossRef record from ORCID failed: %s" % str(e))

        # Now we add the DOIs found in the ORCID profile.
        doi_metadata = fetch_dois(dois)
        for metadata in doi_metadata:
            try:
                authors = [convert_to_name_pair(a)
                           for a in metadata['author']]
                affiliations = affiliate_author_with_orcid(
                    ref_name, id, authors)
                paper = crps.save_doi_metadata(metadata, affiliations)
                if not paper:
                    continue
                record = BareOaiRecord(
                    source=orcid_oai_source,
                    identifier='orcid:' + id + ':' + metadata['DOI'],
                    splash_url='http://orcid.org/' + id,
                    pubtype=paper.doctype)
                paper.add_oairecord(record)
                yield paper
            except (KeyError, ValueError, TypeError):
                pass
    matches = Publisher.objects.filter(romeo_id=romeo_id,
                                       name__iexact=name,
                                       alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.
    search_terms should be a dictionary containing at least one of the
    fields 'issn' or 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or
    # in the API
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'Terms were: ' + str(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the
    # model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using
    # the <romeopub>
    publisher_desc = publishers[0]

    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result
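# Illustrative usage sketch (the journal title is an example): this version
# no longer retries by itself, so a caller wanting a fuzzier match passes
# matching_mode explicitly.
journal = fetch_journal({'jtitle': 'Annals of Botany'})
if journal is None:
    journal = fetch_journal({'jtitle': 'Annals of Botany'},
                            matching_mode='contains')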
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.\n' +
            'URL was: ' + request)

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.\n' +
            'URL was: ' + request)

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.\n' +
            'URL was: ' + request)
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.

    If the data from RoMEO is fresher than what we have in cache,
    we update our model.
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    romeo_parent_id = None
    try:
        romeo_parent_id = xml.attrib['parentid']
    except KeyError:
        pass

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (publishers imported from doaj all
    # get the same id), so we have to use the name too.
    matches = None
    if re.match(r'\d+', romeo_id):  # numeric ids are unambiguous
        matches = Publisher.objects.filter(romeo_id=romeo_id)
    elif alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        first_match = matches[0]
        if (first_match.last_updated is not None and
                first_match.last_updated >= last_update):
            return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')

    # Compute OA status of the publisher
    status = 'UNK'

    if not matches:
        publisher = Publisher()
    else:
        publisher = matches[0]
    publisher.name = name
    publisher.alias = alias
    publisher.url = url
    publisher.preprint = preprint
    publisher.postprint = postprint
    publisher.pdfversion = pdfversion
    publisher.romeo_id = romeo_id
    publisher.romeo_parent_id = romeo_parent_id
    publisher.oa_status = status
    publisher.last_updated = last_update
    publisher.save()

    if matches:
        publisher.publishercopyrightlink_set.all().delete()
        publisher.publisherrestrictiondetail_set.all().delete()
        publisher.publishercondition_set.all().delete()

    # Add the conditions, restrictions, and copyright
    for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
        self.add_restriction(restriction, 'preprint', publisher)

    for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
        self.add_restriction(restriction, 'postprint', publisher)

    for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
        self.add_restriction(restriction, 'pdfversion', publisher)

    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])
    # TODO: if the OA status has changed, then we should update the
    # journals and papers accordingly with the adequate task

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        if url and text:
            cplink = PublisherCopyrightLink(
                text=text, url=url[:1024], publisher=publisher)
            cplink.save()

    return publisher