def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Return the researcher registered under ``orcid``, creating it from
    the ORCID profile when no such researcher exists yet.

    :param orcid: the ORCID identifier used for the lookup
    :param profile: parsed JSON of the profile when it is already
                    available (it is handed to ``OrcidProfile(json=...)``);
                    when ``None`` the profile is built from ``orcid``
    :param user: user passed on to ``Researcher.create_by_name``
    :returns: the existing or the newly created researcher
    """
    try:
        # Known researcher: return it untouched, the profile is not read.
        return Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        pass

    if profile is None:
        profile = OrcidProfile(id=orcid)
    else:
        profile = OrcidProfile(json=profile)

    full_name = profile.name
    homepage = profile.homepage
    email = profile.email
    researcher = Researcher.create_by_name(
        full_name[0], full_name[1],
        orcid=orcid, user=user, homepage=homepage, email=email)

    # Ensure that extra info is added: fill in the fields that are still
    # empty on the researcher, and save only if something changed.
    fields = researcher.__dict__
    dirty = False
    extras = (('homepage', homepage), ('orcid', orcid), ('email', email))
    for field, value in extras:
        if not fields[field] and value:
            fields[field] = value
            dirty = True
    if dirty:
        researcher.save()

    for variant in profile.other_names:
        # NOTE(review): the variant is compared with itself here, not with
        # the profile's main name -- confirm this is intended.
        confidence = name_similarity(variant, variant)
        variant_name = Name.lookup_name(variant)
        researcher.add_name_variant(variant_name, confidence)

    return researcher
def test_institution(self):
    # (ORCID iD, expected ``institution`` value), checked in this order.
    expectations = [
        ('0000-0002-0022-2290',
         {'name': 'Ecole Normale Superieure',
          'identifier': None,
          'country': 'FR'}),
        ('0000-0002-5654-4053',
         {'country': 'FR',
          'identifier': None,
          'name': "École nationale supérieure de céramique industrielle"}),
    ]
    for orcid_id, institution in expectations:
        self.assertEqual(
            OrcidProfile(orcid_id=orcid_id).institution, institution)
def test_institution(self):
    # The ``institution`` of each profile must equal the expected dict.
    first = OrcidProfile(orcid_id='0000-0002-0022-2290')
    self.assertEqual(first.institution, {
        'name': 'Ecole Normale Superieure',
        'identifier': None,
        'country': 'FR',
    })

    second = OrcidProfile(orcid_id='0000-0002-5654-4053')
    self.assertEqual(second.institution, {
        'country': 'FR',
        'identifier': None,
        'name': "Polytech'Rambouillet",
    })
def bulk_import(self, directory, fetch_papers=True, use_doi=False):
    """
    Bulk-imports ORCID profiles from a dump
    (warning: this still uses our DOI cache).
    The directory should contain json versions
    of orcid profiles, as in the official ORCID
    dump.

    :param directory: root of the dump; it is walked recursively and every
                      file found is treated as one JSON profile
    :param fetch_papers: when true, the records of each profile are fetched
                         with ``fetch_orcid_records`` and saved with
                         ``save_paper``
    :param use_doi: forwarded to ``fetch_orcid_records``
    """
    for root, _, fnames in os.walk(directory):
        for fname in fnames:
            try:
                # Keep the file open only while parsing it, so the handle
                # is released before the database and network work below.
                with open(os.path.join(root, fname), 'r') as f:
                    profile = json.load(f)

                orcid = profile['orcid-profile']['orcid-identifier']['path']
                r = Researcher.get_or_create_by_orcid(
                    orcid, profile, update=True)
                if fetch_papers:
                    papers = self.fetch_orcid_records(
                        orcid,
                        profile=OrcidProfile(json=profile),
                        use_doi=use_doi)
                    for p in papers:
                        self.save_paper(p, r)
            except (ValueError, KeyError):
                # Malformed JSON or missing keys: report the file, move on.
                logger.warning("Invalid profile: %s", fname)
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Creates (or returns an existing) researcher from its ORCID id.

    :param orcid: the ORCID id to look up; must not be ``None``
    :param profile: the parsed JSON of the profile if it has already
                    been fetched (it is wrapped with
                    ``OrcidProfile(json=...)``); otherwise the profile
                    is built from the ORCID id
    :param user: an user to associate with the profile.
    :returns: a :class:`Researcher`
    :raises MetadataSourceException: when ``orcid`` is ``None``
    """
    if orcid is None:
        raise MetadataSourceException('Invalid ORCID id')

    try:
        # An already registered researcher is returned untouched.
        return Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        pass

    if profile is None:
        profile = OrcidProfile(id=orcid)
    else:
        profile = OrcidProfile(json=profile)

    main_name = profile.name
    homepage = profile.homepage
    email = profile.email
    researcher = Researcher.create_by_name(
        main_name[0], main_name[1],
        orcid=orcid, user=user, homepage=homepage, email=email)

    # Ensure that extra info is added: only empty fields are filled in,
    # and the researcher is saved once if any of them changed.
    state = researcher.__dict__
    updated = False
    for attribute, value in (('homepage', homepage),
                             ('orcid', orcid),
                             ('email', email)):
        if not state[attribute] and value:
            state[attribute] = value
            updated = True
    if updated:
        researcher.save()

    for variant in profile.other_names:
        # NOTE(review): the variant is compared with itself here, not with
        # the profile's main name -- confirm this is intended.
        confidence = name_similarity(variant, variant)
        variant_name = Name.lookup_name(variant)
        researcher.add_name_variant(variant_name, confidence)

    return researcher
def test_search(self):
    # for this one we use the production database
    # because test profiles on the sandbox
    # tend to get deleted quite often
    results = list(OrcidProfile.search_by_name('John', 'Doe'))

    # Every hit needs a non-empty ORCID and at least one name part.
    well_formed = all(
        len(hit['orcid']) and (len(hit['first']) or len(hit['last']))
        for hit in results)
    self.assertTrue(well_formed)

    names_returned = [(hit['first'], hit['last']) for hit in results]
    self.assertTrue(('John', 'Doe') in names_returned)
def test_sandbox(self):
    # The profile's name must be the expected (first, last) pair.
    profile = OrcidProfile(orcid_id='0000-0002-5654-4053')
    self.assertEqual(profile.name, ('Peter', 'Lieth'))
def test_wrong_instance(self):
    # Building a profile with this ``instance`` value must raise ValueError.
    self.assertRaises(
        ValueError,
        OrcidProfile, '0000-0002-2963-7764', instance='dissem.in')
def test_homepage_without_http(self):
    # The homepage is expected to come back with its ``http://`` scheme.
    profile = OrcidProfile(orcid_id='0000-0002-5710-3989')
    expected_homepage = 'http://evrard.perso.enseeiht.fr'
    self.assertEqual(profile.homepage, expected_homepage)
def test_empty_lastname(self):
    # The expected name has an empty string as its last name.
    profile = OrcidProfile(orcid_id='0000-0001-5006-3868')
    self.assertEqual(profile.name, ('Qiang', ''))
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param orcid_identifier: the ORCiD to import, checked with
                             ``validate_orcid``
    :param profile: The ORCID profile if it has already been fetched before.
                    It is used as an :class:`OrcidProfile` (its ``json``,
                    ``name``, ``work_summaries`` and ``fetch_works`` are
                    read); when ``None`` it is built from the ORCiD.
    :param use_doi: Fetch the publications by DOI when we find one
                    (recommended, but slow)
    :returns: a generator, where all the papers found are yielded.
              (some of them could be in free form, hence not imported)
    :raises MetadataSourceException: if the identifier is invalid. As this
              is a generator, it is raised when iteration starts.
    """
    cr_api = CrossRefAPI()

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(orcid_id=orcid_id)
    except MetadataSourceException:
        logger.exception("ORCID Profile Error")
        return

    # As we have fetched the profile, let's update the Researcher
    self.researcher = Researcher.get_or_create_by_orcid(
        orcid_identifier, profile.json, update=True)
    if not self.researcher:
        return

    # Reference name
    ref_name = profile.name
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Get summary publications and separate them in two classes:
    # - the ones with DOIs, that we will fetch with CrossRef
    dois_and_putcodes = []  # list of (DOIs,putcode) to fetch
    # - the ones without: we will fetch ORCID's metadata about them
    #   and try to create a paper with what they provide
    put_codes = []
    for summary in profile.work_summaries:
        if summary.doi and use_doi:
            dois_and_putcodes.append((summary.doi, summary.put_code))
        else:
            put_codes.append(summary.put_code)

    # 1st attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers with DOIs found in our ORCiD profile.
        dois = [doi for doi, _ in dois_and_putcodes]
        # NOTE(review): ``idx`` is used to find the put code of a failed
        # DOI, which assumes one result per DOI, in input order -- confirm
        # against fetch_metadata_from_dois.
        results = self.fetch_metadata_from_dois(
            cr_api, ref_name, orcid_id, dois)
        for idx, (success, paper_or_metadata) in enumerate(results):
            if success:
                yield paper_or_metadata  # We know that this is a paper
            else:
                # Fall back to ORCID's own metadata for this work.
                put_codes.append(dois_and_putcodes[idx][1])

    # 2nd attempt with ORCID's own crappy metadata
    works = profile.fetch_works(put_codes)
    for work in works:
        if not work:
            continue

        # If the paper is skipped due to invalid metadata, we consider
        # it missed and report it.
        if work.skipped:
            # Log the raw payload of the work and why it was skipped.
            logger.warning(
                "Work skipped due to incorrect metadata. \n %s \n %s",
                work.json, work.skip_reason)
            ignored_papers.append(work.as_dict())
            continue

        yield self.create_paper(work)

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        logger.warning("Total ignored papers: %d", len(ignored_papers))
def setUpClass(self):
    # Build the profiles once and store each under its attribute name.
    shared_profiles = (
        ('antonin', '0000-0002-8612-8827'),
        ('thomas', '0000-0003-0524-631X'),
        ('sergey', '0000-0003-3397-9895'),
        ('marco', '0000-0002-6561-5642'),
    )
    for attribute, orcid_id in shared_profiles:
        setattr(self, attribute, OrcidProfile(orcid_id=orcid_id))
class OrcidProfileTest(unittest.TestCase):
    """
    TODO: duplicate all these profiles to the ORCID
    sandbox to be sure they will not be modified!
    """

    @classmethod
    def setUpClass(cls):
        # Profiles shared by several tests, built once for the class.
        cls.antonin = OrcidProfile(orcid_id='0000-0002-8612-8827')
        cls.thomas = OrcidProfile(orcid_id='0000-0003-0524-631X')
        cls.sergey = OrcidProfile(orcid_id='0000-0003-3397-9895')
        cls.marco = OrcidProfile(orcid_id='0000-0002-6561-5642')

    def test_simple_name(self):
        self.assertEqual(self.antonin.name, ('Antonin', 'Delpeuch'))
        self.assertEqual(self.thomas.name, ('Thomas', 'Bourgeat'))
        self.assertEqual(self.marco.name, ('Marco', 'Diana'))

    def test_credit_name(self):
        self.assertEqual(self.sergey.name, ('Sergey M.', 'Natanzon'))
        dario = OrcidProfile(orcid_id='0000-0001-9547-293X')
        self.assertEqual(dario.name, ('Darío', 'Álvarez'))

    def test_empty_lastname(self):
        profile = OrcidProfile(orcid_id='0000-0001-5006-3868')
        self.assertEqual(profile.name, ('Qiang', ''))

    def test_other_names(self):
        expected = {
            ('Sergey', 'Natanzon'),
            ('S.', 'Natanzon'),
            ('S. M.', 'Natanzon'),
        }
        self.assertEqual(set(self.sergey.other_names), expected)

    def test_homepage_without_http(self):
        profile = OrcidProfile(orcid_id='0000-0002-5710-3989')
        self.assertEqual(profile.homepage, 'http://evrard.perso.enseeiht.fr')

    def test_iterable(self):
        for key in self.thomas:
            self.assertEqual(type(key), unicode)

    def test_attr(self):
        self.assertTrue('orcid-identifier' in self.thomas)
        identifier = self.thomas['orcid-identifier']
        self.assertEqual(type(identifier), dict)

    def test_wrong_instance(self):
        self.assertRaises(
            ValueError,
            OrcidProfile, '0000-0002-2963-7764', instance='dissem.in')

    def test_sandbox(self):
        profile = OrcidProfile(orcid_id='0000-0002-5654-4053')
        self.assertEqual(profile.name, ('Peter', 'Lieth'))

    def test_search(self):
        # for this one we use the production database
        # because test profiles on the sandbox
        # tend to get deleted quite often
        results = list(OrcidProfile.search_by_name('John', 'Doe'))
        well_formed = all(
            len(hit['orcid']) and (len(hit['first']) or len(hit['last']))
            for hit in results)
        self.assertTrue(well_formed)
        names_returned = [(hit['first'], hit['last']) for hit in results]
        self.assertTrue(('John', 'Doe') in names_returned)

    def test_institution(self):
        expectations = [
            ('0000-0002-0022-2290',
             {'name': 'Ecole Normale Superieure',
              'identifier': None,
              'country': 'FR'}),
            ('0000-0002-5654-4053',
             {'country': 'FR',
              'identifier': None,
              'name': "École nationale supérieure de céramique industrielle"}),
        ]
        for orcid_id, institution in expectations:
            self.assertEqual(
                OrcidProfile(orcid_id=orcid_id).institution, institution)

    def test_work_summaries(self):
        summaries = self.antonin.work_summaries
        dois = [s.doi for s in summaries]
        titles = [s.title for s in summaries]
        put_codes = [s.put_code for s in summaries]
        self.assertTrue('10.4204/eptcs.172.16' in dois)
        self.assertTrue(
            'Complexity of Grammar Induction for Quantum Types' in titles)
        self.assertTrue(None not in put_codes)

    def test_philipp(self):
        profile = OrcidProfile(orcid_id='0000-0001-6723-6833')
        dois = [s.doi for s in profile.work_summaries]
        self.assertTrue('10.3354/meps09890' in dois)

    def test_wrong_id_type(self):
        """
        I found this payload in an ORCID profile… looks like ORCID
        does not validate their ids against regexes
        """
        # The external id is typed "doi" but its value is a handle URL;
        # the summary is expected to expose no DOI.
        summary_json = {
            "last-modified-date": {"value": 1505077812702},
            "external-ids": {
                "external-id": [
                    {
                        "external-id-type": "doi",
                        "external-id-value": "http://hdl.handle.net/2080/2662",
                        "external-id-url": None,
                        "external-id-relationship": "SELF",
                    },
                ],
            },
            "work-summary": [
                {
                    "put-code": 36669776,
                    "created-date": {"value": 1505077812702},
                    "last-modified-date": {"value": 1505077812702},
                    "source": {
                        "source-orcid": {
                            "uri": "https://orcid.org/0000-0002-9658-1473",
                            "path": "0000-0002-9658-1473",
                            "host": "orcid.org",
                        },
                        "source-client-id": None,
                        "source-name": {"value": "Bhojaraju Gunjal"},
                    },
                    "title": {
                        "title": {
                            "value": "Open Source Solutions for Creation of ETD Archives/Repository: A Case Study of Central Library@NIT Rourkela",
                        },
                        "subtitle": None,
                        "translated-title": None,
                    },
                    "external-ids": {
                        "external-id": [
                            {
                                "external-id-type": "doi",
                                "external-id-value": "http://hdl.handle.net/2080/2662",
                                "external-id-url": None,
                                "external-id-relationship": "SELF",
                            },
                        ],
                    },
                    "type": "CONFERENCE_PAPER",
                    "publication-date": {
                        "year": {"value": "2017"},
                        "month": None,
                        "day": None,
                        "media-type": None,
                    },
                    "visibility": "PUBLIC",
                    "path": "/0000-0002-9658-1473/work/36669776",
                    "display-index": "1",
                },
            ],
        }
        summary = OrcidWorkSummary(summary_json)
        self.assertEqual(summary.doi, None)

    def test_multiple_ids(self):
        # Two external ids (an "eid" and a "doi"); the DOI is expected.
        summary_json = {
            "last-modified-date": {"value": 1506388112650},
            "external-ids": {
                "external-id": [
                    {
                        "external-id-type": "eid",
                        "external-id-value": "2-s2.0-84864877237",
                        "external-id-url": None,
                        "external-id-relationship": "SELF",
                    },
                    {
                        "external-id-type": "doi",
                        "external-id-value": "10.3354/meps09890",
                        "external-id-url": None,
                        "external-id-relationship": "SELF",
                    },
                ],
            },
            "work-summary": [
                {
                    "put-code": 19176128,
                    "created-date": {"value": 1444695659490},
                    "last-modified-date": {"value": 1506388112650},
                    "source": {
                        "source-orcid": None,
                        "source-client-id": {
                            "uri": "https://orcid.org/client/0000-0002-3054-1567",
                            "path": "0000-0002-3054-1567",
                            "host": "orcid.org",
                        },
                        "source-name": {"value": "CrossRef Metadata Search"},
                    },
                    "title": {
                        "title": {
                            "value": "Elephant seal foraging dives track prey distribution, not temperature: Comment on McIntyre et al. (2011)",
                        },
                        "subtitle": None,
                        "translated-title": None,
                    },
                    "external-ids": {
                        "external-id": [
                            {
                                "external-id-type": "doi",
                                "external-id-value": "10.3354/meps09890",
                                "external-id-url": None,
                                "external-id-relationship": "SELF",
                            },
                        ],
                    },
                    "type": "JOURNAL_ARTICLE",
                    "publication-date": {
                        "year": {"value": "2012"},
                        "month": {"value": "08"},
                        "day": {"value": "08"},
                        "media-type": None,
                    },
                    "visibility": "PUBLIC",
                    "path": "/0000-0001-6723-6833/work/19176128",
                    "display-index": "0",
                },
            ],
        }
        summary = OrcidWorkSummary(summary_json)
        self.assertEqual(summary.doi, '10.3354/meps09890')

    def test_works(self):
        put_codes = [s.put_code for s in self.antonin.work_summaries]
        works = list(self.antonin.fetch_works(put_codes))
        titles = [work.title for work in works]
        self.assertTrue(
            'Complexity of Grammar Induction for Quantum Types' in titles)
        pubtypes = [work.pubtype for work in works]
        self.assertTrue('journal-article' in pubtypes)
def test_philipp(self):
    # The expected DOI must be among the DOIs of the work summaries.
    profile = OrcidProfile(orcid_id='0000-0001-6723-6833')
    dois = [entry.doi for entry in profile.work_summaries]
    self.assertTrue('10.3354/meps09890' in dois)
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param orcid_identifier: the ORCiD to import, checked with
                             ``validate_orcid``
    :param profile: The ORCID profile if it has already been fetched before
                    (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
                    (recommended, but slow)
    :returns: a generator, where all the papers found are yielded.
              (some of them could be in free form, hence not imported)
    :raises MetadataSourceException: if the identifier is invalid. As this
              is a generator, it is raised when iteration starts.
    """
    cr_api = CrossRefAPI()

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(orcid_id=orcid_id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        # Function-call form: valid on both Python 2 and Python 3.
        print(e)
        return

    # As we have fetched the profile, let's update the Researcher
    self.researcher = Researcher.get_or_create_by_orcid(
        orcid_identifier, profile.json, update=True)
    if not self.researcher:
        return

    # Reference name
    ref_name = profile.name

    # curl -H "Accept: application/orcid+json"
    # 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Fetch publications (1st attempt with ORCiD data)
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:
        data_paper = ORCIDDataPaper.from_orcid_metadata(
            ref_name, orcid_id, pub, stop_if_dois_exists=use_doi)

        if not data_paper:
            continue

        if data_paper.dois and use_doi:
            # We want to batch it rather than manually do it.
            dois.extend(data_paper.dois)
            continue

        # If the paper is skipped due to invalid metadata.
        # We first try to reconcile it with local researcher author name.
        # Then, we consider it missed.
        if data_paper.skipped:
            data_paper = self.reconcile_paper(
                ref_name,
                orcid_id,
                pub,
                overrides={
                    'authors': [(self.researcher.name.first,
                                 self.researcher.name.last)]
                })
            if data_paper.skipped:
                print('%s is skipped due to incorrect metadata (%s)' %
                      (data_paper, data_paper.skip_reason))
                ignored_papers.append(data_paper.as_dict())
                continue

        yield self.create_paper(data_paper)

    # 2nd attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers with DOIs found in our ORCiD profile.
        # FIXME(RaitoBezarius): if we fail here, we should get back the pub
        # and yield it.
        for success, paper_or_metadata in self.fetch_metadata_from_dois(
                cr_api, ref_name, orcid_id, dois):
            if success:
                yield paper_or_metadata
            else:
                ignored_papers.append(paper_or_metadata)
                print('This metadata (%s) yields no paper.' %
                      (paper_or_metadata))

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param orcid_identifier: the ORCiD to import, checked with
                             ``validate_orcid``
    :param profile: The ORCID profile if it has already been fetched before
                    (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
                    (recommended, but slow)
    :returns: a generator, where all the papers found are yielded.
              (some of them could be in free form, hence not imported)
    :raises MetadataSourceException: if the identifier is invalid. As this
              is a generator, it is raised when iteration starts.
    """
    cr_api = CrossRefAPI()

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(orcid_id=orcid_id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        # Function-call form: valid on both Python 2 and Python 3.
        print(e)
        return

    # As we have fetched the profile, let's update the Researcher
    self.researcher = Researcher.get_or_create_by_orcid(
        orcid_identifier, profile.json, update=True)
    if not self.researcher:
        return

    # Reference name
    ref_name = profile.name
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Get summary publications and separate them in two classes:
    # - the ones with DOIs, that we will fetch with CrossRef
    dois_and_putcodes = []  # list of (DOIs,putcode) to fetch
    # - the ones without: we will fetch ORCID's metadata about them
    #   and try to create a paper with what they provide
    put_codes = []
    for summary in profile.work_summaries:
        if summary.doi and use_doi:
            dois_and_putcodes.append((summary.doi, summary.put_code))
        else:
            put_codes.append(summary.put_code)

    # 1st attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers with DOIs found in our ORCiD profile.
        dois = [doi for doi, _ in dois_and_putcodes]
        # NOTE(review): ``idx`` is used to find the put code of a failed
        # DOI, which assumes one result per DOI, in input order -- confirm
        # against fetch_metadata_from_dois.
        results = self.fetch_metadata_from_dois(
            cr_api, ref_name, orcid_id, dois)
        for idx, (success, paper_or_metadata) in enumerate(results):
            if success:
                yield paper_or_metadata
            else:
                # Fall back to ORCID's own metadata for this work.
                put_codes.append(dois_and_putcodes[idx][1])

    # 2nd attempt with ORCID's own crappy metadata
    works = profile.fetch_works(put_codes)
    for work in works:
        if not work:
            continue

        # If the paper is skipped due to invalid metadata, we consider
        # it missed and report it.
        if work.skipped:
            print(work.json)
            print(work.skip_reason)
            print('work skipped due to incorrect metadata (%s)' %
                  (work.skip_reason))
            ignored_papers.append(work.as_dict())
            continue

        yield self.create_paper(work)

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
def test_credit_name(self):
    # Check the names of the shared profile and of one fetched here.
    self.assertEqual(self.sergey.name, ('Sergey M.', 'Natanzon'))
    dario = OrcidProfile(orcid_id='0000-0001-9547-293X')
    self.assertEqual(dario.name, ('Darío', 'Álvarez'))