def test_fetch_journal(self):
    """
    Fetching a journal by ISSN must return a Journal with the expected
    fields, leave the search terms untouched, and make the journal
    retrievable through Journal.find.
    """
    search_terms = {'issn': '0001-3455'}
    terms_before = dict(search_terms)

    fetched = self.api.fetch_journal(search_terms)

    self.assertIsInstance(fetched, Journal)
    self.assertEqual(fetched.issn, search_terms['issn'])
    # Sadly RoMEO does not provide ESSNs via this API endpoint
    self.assertEqual(fetched.essn, None)
    self.assertEqual(
        fetched.publisher.last_updated,
        dateutil.parser.parse('2013-03-11T11:27:37Z'),
    )
    # The dictionary handed to the API must not have been modified
    self.assertEqual(search_terms, terms_before)

    # The same journal must now be found by a model lookup
    stored = Journal.find(issn=search_terms['issn'])
    self.assertEqual(stored, fetched)
def test_fix_buggy_romeo_ids(self):
    """
    A long time ago, the SHERPA API returned "DOAJ" or "journal" as
    publisher id for some journals, so these ids have to be updated.

    The existing publisher must be kept (same primary key) and only get
    its romeo_id corrected; the journal must not change publisher.
    """
    buggy_publisher = Publisher(
        romeo_id='DOAJ',
        preprint='can',
        postprint='can',
        pdfversion='can',
    )
    buggy_publisher.save()
    Journal(
        issn='0013-9696',
        title='Greek Review of Social Research',
        publisher=buggy_publisher,
    ).save()

    # mocked separately as a different endpoint is used
    dump_url = 'http://www.sherpa.ac.uk/downloads/journal-title-issns.php?format=tsv'
    with requests_mock.mock() as http_mocker, \
            patch.object(Journal, 'change_publisher') as mock_change_publisher:
        http_mocker.get(dump_url, content=self.journals_dump_response)

        self.api.fetch_all_journals()

        refreshed_publisher = Journal.objects.get(issn='0013-9696').publisher
        mock_change_publisher.assert_not_called()
        self.assertEqual(refreshed_publisher.pk, buggy_publisher.pk)
        self.assertEqual(refreshed_publisher.romeo_id, '2201')
def _get_oairecord_data(cls, data):
    """
    Collect the fields of an OAI record from citeproc metadata.

    :param data: citeproc metadata
    :returns: a dict holding the record fields (see the keys below),
              ready to be passed on to build the bare record
    :raises: CiteprocError
    """
    doi = cls._get_doi(data)
    splash_url = doi_to_url(doi)

    # Crossref delivers license information under the singular key
    # 'license'. The formerly used key 'licenses' is still honoured as a
    # fallback so that callers providing it keep working.
    licenses = data.get('license') or data.get('licenses', [])
    pdf_url = cls._get_pdf_url(doi, licenses, splash_url)

    journal_title = cls._get_container(data)
    issn = cls._get_issn(data)
    journal = Journal.find(issn=issn, title=journal_title)

    # The publisher name is cut to 512 characters
    publisher_name = data.get('publisher', '')[:512]
    publisher = cls._get_publisher(publisher_name, journal)

    bare_oairecord_data = {
        'doi': doi,
        'description': cls._get_abstract(data),
        'identifier': doi_to_crossref_identifier(doi),
        'issn': issn,
        'issue': data.get('issue', ''),
        'journal': journal,
        'journal_title': journal_title,
        'pages': data.get('page', ''),
        'pdf_url': pdf_url,
        'pubdate': cls._get_pubdate(data),
        'publisher': publisher,
        'publisher_name': publisher_name,
        'pubtype': cls._get_pubtype(data),
        'source': OaiSource.objects.get(identifier='crossref'),
        'splash_url': splash_url,
        'volume': data.get('volume', ''),
    }
    return bare_oairecord_data
def test_find(self):
    """
    Journal.find must match titles case-insensitively, match an ISSN or
    ESSN against both fields, and return None if nothing matches.
    """
    essn_only = Journal(
        title='Journal of Synthetic Disillusion',
        issn=None,
        essn='1234-0707',
        publisher=self.publisher,
    )
    essn_only.save()
    issn_only = Journal(
        title='Slackline Review',
        issn='4353-2894',
        essn=None,
        publisher=self.publisher,
    )
    issn_only.save()

    for lookup_title in ('Slackline Review', 'slackline review'):
        self.assertEqual(Journal.find(title=lookup_title), issn_only)

    # We look for ISSN and ESSN in both fields, because they could easily be swapped!
    for number, expected in (('1234-0707', essn_only), ('4353-2894', issn_only)):
        self.assertEqual(Journal.find(issn=number), expected)
        self.assertEqual(Journal.find(essn=number), expected)

    self.assertEqual(Journal.find(title='nonsense'), None)
def _create_publication(paper, metadata): if not metadata: return if not metadata.get('container-title'): return doi = to_doi(metadata.get('DOI', None)) title = metadata['container-title'] if isinstance(title, list): title = title[0] title = title[:512] issn = metadata.get('ISSN', None) if issn and isinstance(issn, list): issn = issn[0] # TODO pass all the ISSN to the RoMEO interface volume = metadata.get('volume', None) pages = metadata.get('page', None) issue = metadata.get('issue', None) date_dict = metadata.get('issued', dict()) pubdate = None if 'date-parts' in date_dict: dateparts = date_dict.get('date-parts')[0] pubdate = date_from_dateparts(dateparts) # for instance it outputs dates like 2014-2-3 publisher_name = metadata.get('publisher', None) if publisher_name: publisher_name = publisher_name[:512] pubtype = metadata.get('type', 'unknown') pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype) splash_url = doi_to_url(doi) # PDF availability pdf_url = None licenses = set([(license or {}).get('URL') for license in metadata.get('license', [])]) doi_prefix = doi.split('/')[0] if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)): pdf_url = splash_url # Lookup journal journal = Journal.find(issn=issn, title=title) publisher = None if journal: publisher = journal.publisher AliasPublisher.increment(publisher_name, journal.publisher) else: publisher = Publisher.find(publisher_name) barepub = BareOaiRecord( paper=paper, journal_title=title, issue=issue, volume=volume, pubdate=pubdate, pages=pages, doi=doi, pubtype=pubtype, publisher_name=publisher_name, journal=journal, publisher=publisher, pdf_url=pdf_url, splash_url=splash_url, source=OaiSource.objects.get(identifier='crossref'), identifier=doi_to_crossref_identifier(doi)) rec = paper.add_oairecord(barepub) paper.update_availability() return paper, rec
class TestCiteproc():
    """
    Groups the tests of the Citeproc class
    """

    test_class = Citeproc

    @pytest.mark.parametrize('url, expected', is_oai_license_params)
    def test_is_oa_license(self, url, expected):
        """
        The classification of a license URL must match the expectation
        """
        assert self.test_class.is_oa_license(url) == expected

    @pytest.mark.usefixtures('db')
    def test_to_paper(self, container_title, title, citeproc):
        """
        Paper and OAI record must be created with the citeproc values
        """
        paper = self.test_class.to_paper(citeproc)
        # Ensure that paper is in database (i.e. created)
        assert paper.pk >= 1
        # Check paper fields
        for paper_author, citeproc_author in zip(paper.authors_list,
                                                 citeproc['author']):
            assert paper_author['name']['first'] == citeproc_author['given']
            assert paper_author['name']['last'] == citeproc_author['family']
            assert (paper_author['affiliation'] ==
                    citeproc_author['affiliation'][0]['name'])
            assert paper_author['orcid'] == citeproc_author['ORCID']
        assert paper.pubdate == date(*citeproc['issued']['date-parts'][0])
        assert paper.title == title
        # Ensure that oairecord is in database (i.e. created)
        record = OaiRecord.objects.get(about=paper)
        # Check oairecord fields
        assert record.doi == citeproc['DOI']
        assert record.identifier == doi_to_crossref_identifier(citeproc['DOI'])
        assert record.issue == citeproc['issue']
        assert record.journal_title == container_title
        assert record.pages == citeproc['page']
        assert record.pubdate == date(*citeproc['issued']['date-parts'][0])
        assert record.publisher_name == citeproc['publisher']
        assert record.source == OaiSource.objects.get(identifier='crossref')
        assert record.splash_url == doi_to_url(citeproc['DOI'])
        assert record.volume == citeproc['volume']

    @pytest.mark.parametrize(
        'mock_function',
        ['_get_oairecord_data', '_get_paper_data'],
    )
    def test_to_paper_invalid_data(self, monkeypatch, mock_function, citeproc):
        """
        A CiteprocError raised while extracting the data (e.g. because the
        metadata is corrupted or something is missing) must be passed on
        """
        def always_fail(*args, **kwargs):
            raise CiteprocError

        monkeypatch.setattr(self.test_class, mock_function, always_fail)
        with pytest.raises(CiteprocError):
            self.test_class.to_paper(citeproc)

    def test_to_paper_no_data(self):
        """
        Without data, CiteprocError must be raised
        """
        with pytest.raises(CiteprocError):
            self.test_class.to_paper(None)

    @pytest.mark.parametrize('name, expected', convert_to_name_pair_list)
    def test_convert_to_name_pair(self, name, expected):
        """
        The name must be converted into the expected pair
        """
        assert self.test_class._convert_to_name_pair(name) == expected

    @pytest.mark.parametrize(
        'author_elem, expected',
        [
            (dict(), None),
            ({'affiliation': [{'name': 'Porto'}]}, 'Porto'),
            (
                {'affiliation': [{'name': 'Porto'}, {'name': 'Lissabon'}]},
                'Porto',
            ),
        ],
    )
    def test_get_affiliation(self, author_elem, expected):
        """
        The first affiliation is returned, None if there is no affiliation
        """
        assert self.test_class._get_affiliation(author_elem) == expected

    def test_get_abstract(self, citeproc):
        """
        The abstract must be taken from the metadata
        """
        assert self.test_class._get_abstract(citeproc) == citeproc['abstract']

    def test_get_abstact_missing(self, citeproc):
        """
        Without abstract, an empty string is expected
        """
        del citeproc['abstract']
        assert self.test_class._get_abstract(citeproc) == ''

    def test_get_abstract_escaping(self, citeproc):
        """
        Markup such as jats tags must be removed from the abstract
        """
        # We wrap the current abstract into some jats
        plain_abstract = citeproc['abstract']
        citeproc['abstract'] = r'<jats:p>{}<\/jats:p>'.format(plain_abstract)
        assert self.test_class._get_abstract(citeproc) == plain_abstract

    def test_get_affiliations(self, affiliations, citeproc):
        """
        The result must be as long as citeproc['author'] and equal the
        list of affiliations
        """
        result = self.test_class._get_affiliations(citeproc)
        assert len(result) == len(citeproc.get('author'))
        assert result == affiliations

    def test_get_affiliations_no_authors(self, citeproc):
        """
        Without authors, CiteprocAuthorError must be raised
        """
        del citeproc['author']
        with pytest.raises(CiteprocAuthorError):
            self.test_class._get_affiliations(citeproc)

    def test_get_authors(self, citeproc):
        """
        The authors must be returned as a list of BareName objects
        """
        result = self.test_class._get_authors(citeproc)
        assert isinstance(result, list)
        assert all(isinstance(entry, BareName) for entry in result)

    def test_get_authors_empty_list(self, citeproc):
        """
        An empty author list must raise CiteprocAuthorError
        """
        citeproc['author'] = []
        with pytest.raises(CiteprocAuthorError):
            self.test_class._get_authors(citeproc)

    def test_get_authors_no_list(self, citeproc):
        """
        A missing author entry must raise CiteprocAuthorError
        """
        del citeproc['author']
        with pytest.raises(CiteprocAuthorError):
            self.test_class._get_authors(citeproc)

    def test_get_authors_invalid_author(self, monkeypatch, citeproc):
        """
        If a name pair is None, CiteprocAuthorError must be raised
        """
        # We mock the function and let it return None, so that name_pairs is a list of None
        monkeypatch.setattr(self.test_class, '_convert_to_name_pair',
                            lambda x: None)
        with pytest.raises(CiteprocAuthorError):
            self.test_class._get_authors(citeproc)

    def test_get_container(self, container_title, citeproc):
        """
        The container title must be returned
        """
        assert self.test_class._get_container(citeproc) == container_title

    def test_get_container_missing(self):
        """
        A missing container title must raise CiteprocContainerTitleError
        """
        with pytest.raises(CiteprocContainerTitleError):
            self.test_class._get_container(dict())

    def test_get_doi(self, citeproc):
        """
        The DOI must be returned
        """
        assert self.test_class._get_doi(citeproc) == citeproc['DOI']

    def test_get_doi_invalid(self):
        """
        An invalid DOI must raise CiteprocDOIError
        """
        with pytest.raises(CiteprocDOIError):
            self.test_class._get_doi({'DOI': 'spanish inquisition'})

    def test_get_doi_missing(self):
        """
        A missing DOI must raise CiteprocDOIError
        """
        with pytest.raises(CiteprocDOIError):
            self.test_class._get_doi(dict())

    @pytest.mark.parametrize(
        'issn, expected',
        [
            ('1234-5675', '1234-5675'),
            (['1234-5675', ], '1234-5675'),
            ([], ''),
        ],
    )
    def test_get_issn(self, citeproc, issn, expected):
        """
        The ISSN or '' must be returned
        """
        citeproc['ISSN'] = issn
        assert self.test_class._get_issn(citeproc) == expected

    def test_get_issn_missing(self, citeproc):
        """
        Without ISSN, '' must be returned
        """
        del citeproc['ISSN']
        assert self.test_class._get_issn(citeproc) == ''

    @pytest.mark.usefixtures('mock_alias_publisher_increment',
                             'mock_journal_find', 'mock_publisher_find')
    @pytest.mark.parametrize('journal', [Journal(publisher=Publisher()), None])
    def test_get_oairecord_data(self, db, monkeypatch, container_title, issn,
                                citeproc, journal):
        """
        The assertions on the result are relatively lax, as the called
        functions have tests of their own
        """
        monkeypatch.setattr(Journal, 'find', lambda issn, title: journal)
        result = self.test_class._get_oairecord_data(citeproc)
        assert result['doi'] == citeproc['DOI']
        assert result['description'] == citeproc['abstract']
        assert result['identifier'] == doi_to_crossref_identifier(citeproc['DOI'])
        assert result['issn'] == issn
        assert result['issue'] == citeproc['issue']
        assert result['journal'] == journal
        assert result['journal_title'] == container_title
        assert result['pages'] == citeproc['page']
        assert result['pdf_url'] == ''  # Is not OA
        assert result['pubdate'] == date(*citeproc['issued']['date-parts'][0])
        assert result['publisher_name'] == citeproc['publisher']
        assert result['pubtype'] == citeproc['type']
        assert result['source'] == OaiSource.objects.get(identifier='crossref')
        assert result['splash_url'] == doi_to_url(citeproc['DOI'])
        assert result['volume'] == citeproc['volume']

    @pytest.mark.usefixtures('mock_journal_find', 'mock_publisher_find')
    def test_get_oairecord_data_missing(self, monkeypatch, container_title,
                                        issn, citeproc):
        """
        Fields read with a direct get call may be missing and are then
        expected to be empty strings
        """
        for removed_key in ['abstract', 'issue', 'publisher', 'page', 'volume']:
            del citeproc[removed_key]
        result = self.test_class._get_oairecord_data(citeproc)
        for empty_key in ['description', 'issue', 'publisher_name', 'pages',
                          'volume']:
            assert result[empty_key] == ''

    @pytest.mark.parametrize(
        'orcid, expected',
        [
            ({'ORCID': '0000-0001-8187-9704'}, '0000-0001-8187-9704'),
            ({'ORCID': '0000-0001-8187-9705'}, None),
            ({}, None),
        ],
    )
    def test_get_orcid(self, orcid, expected):
        """
        The ORCID must be valid, otherwise None is expected
        """
        assert self.test_class._get_orcid(orcid) == expected

    def test_get_orcids(self, orcids, citeproc):
        """
        The result must be as long as citeproc['author'] and equal the
        list of ORCIDs
        """
        result = self.test_class._get_orcids(citeproc)
        assert len(result) == len(citeproc.get('author'))
        assert result == orcids

    def test_get_orcid_no_authors(self, citeproc):
        """
        Without authors, CiteprocAuthorError must be raised
        """
        del citeproc['author']
        with pytest.raises(CiteprocAuthorError):
            self.test_class._get_orcids(citeproc)

    def test_get_paper_data(self, affiliations, orcids, title, citeproc):
        """
        The assertions on the result are relatively lax, as the called
        functions have tests of their own
        """
        result = self.test_class._get_paper_data(citeproc)
        assert result['affiliations'] == affiliations
        for author_name in result['author_names']:
            assert isinstance(author_name, BareName)
        assert result['orcids'] == orcids
        assert result['pubdate'] == date(*citeproc['issued']['date-parts'][0])
        assert result['title'] == title

    @pytest.mark.parametrize('doi', [True, False])
    @pytest.mark.parametrize('license', [True, False])
    def test_get_pdf_url(self, monkeypatch, doi, license):
        """
        The URL is expected if the DOI check or the license check
        succeeds, otherwise an empty string
        """
        monkeypatch.setattr(self.test_class, '_is_oa_by_doi', lambda x: doi)
        monkeypatch.setattr(self.test_class, '_is_oa_by_license',
                            lambda x: license)
        url = 'https://repository.dissem.in/entry/3242/document.pdf'
        result = self.test_class._get_pdf_url(doi, license, url)
        if doi or license:
            assert result == url
        else:
            assert result == ''

    def test_get_pubdate_issued(self, citeproc):
        """
        The issued date takes precedence over created and deposited
        """
        citeproc['created'] = {'date-parts': [[2019, 10, 11]]}
        citeproc['deposited'] = {'date-parts': [[2019, 10, 12]]}
        expected = date(*citeproc['issued']['date-parts'][0])
        assert self.test_class._get_pubdate(citeproc) == expected

    def test_get_pubdate_created(self, citeproc):
        """
        Without issued date, the created date is taken
        """
        del citeproc['issued']
        citeproc['created'] = {'date-parts': [[2019, 10, 11]]}
        citeproc['deposited'] = {'date-parts': [[2019, 10, 12]]}
        expected = date(*citeproc['created']['date-parts'][0])
        assert self.test_class._get_pubdate(citeproc) == expected

    def test_get_pubdate_deposited(self, citeproc):
        """
        Without issued and created date, the deposited date is taken
        """
        del citeproc['issued']
        citeproc['deposited'] = {'date-parts': [[2019, 10, 12]]}
        expected = date(*citeproc['deposited']['date-parts'][0])
        assert self.test_class._get_pubdate(citeproc) == expected

    def test_get_pubdate_no_date(self, citeproc):
        """
        Without any date, CiteprocDateError must be raised
        """
        del citeproc['issued']
        with pytest.raises(CiteprocDateError):
            self.test_class._get_pubdate(citeproc)

    def test_get_pubdate_received_none(self, monkeypatch):
        """
        If no valid date can be parsed, CiteprocDateError must be raised
        """
        monkeypatch.setattr(self.test_class, '_parse_date', lambda x: None)
        with pytest.raises(CiteprocDateError):
            self.test_class._get_pubdate(dict())

    @pytest.mark.usefixtures('mock_alias_publisher_increment')
    def test_get_publisher_by_journal(self):
        """
        The publisher of the journal must be returned
        """
        expected_publisher = Publisher()
        journal = Journal(publisher=expected_publisher)
        found = self.test_class._get_publisher('p_name', journal)
        assert found == expected_publisher

    def test_get_publisher_by_name(self, monkeypatch):
        """
        Without journal, the publisher found by Publisher.find is returned
        """
        expected_publisher = Publisher()
        monkeypatch.setattr(Publisher, 'find', lambda x: expected_publisher)
        found = self.test_class._get_publisher('p_name', None)
        assert found == expected_publisher

    def test_get_pubtype(self):
        """
        A known publication type must be returned unchanged
        """
        pubtype = 'book'
        assert self.test_class._get_pubtype({'type': pubtype}) == pubtype

    def test_get_pubtype_strange(self):
        """
        An unknown publication type must become 'other'
        """
        strange = {'type': 'spanish inquisition'}
        assert self.test_class._get_pubtype(strange) == 'other'

    def test_get_pubtype_missing(self):
        """
        A missing publication type must raise CiteprocPubtypeError
        """
        with pytest.raises(CiteprocPubtypeError):
            self.test_class._get_pubtype(dict())

    def test_get_title(self, citeproc):
        """
        The title must be returned, limited to 1024 characters
        """
        result = self.test_class._get_title(citeproc)
        assert result == citeproc['title'][:1024]
        assert len(result) <= 1024

    def test_get_title_length(self, citeproc):
        """
        The title must not be longer than 1024 chars
        """
        citeproc['title'] = 'x' * 2000
        result = self.test_class._get_title(citeproc)
        assert result == citeproc['title'][:1024]
        assert len(result) <= 1024

    def test_get_title_length_with_unicode(self, citeproc):
        """
        A title of 1024 en dashes is expected to be cut to 341 characters
        """
        citeproc['title'] = '–' * 1024
        result = self.test_class._get_title(citeproc)
        assert result == citeproc['title'][:341]
        assert len(result) <= 1024

    def test_get_title_no_title(self, citeproc):
        """
        The title is mandatory
        """
        del citeproc['title']
        with pytest.raises(CiteprocTitleError):
            self.test_class._get_title(citeproc)

    def test_get_title_emtpy_string(self, citeproc):
        """
        An empty title must raise CiteprocTitleError
        """
        citeproc['title'] = ''
        with pytest.raises(CiteprocTitleError):
            self.test_class._get_title(citeproc)

    @pytest.mark.parametrize(
        'doi, expected',
        [
            ('10.2195/spam', True),
            ('10.15122/spam', False),
        ],
    )
    def test_is_oa_by_doi(self, doi, expected):
        """
        The DOI check must give the expected boolean
        """
        assert self.test_class._is_oa_by_doi(doi) == expected

    @pytest.mark.parametrize(
        'licenses, expected',
        [
            ([{'URL': 'creativecommons.org/licenses/'}], True),
            ([{'URL': 'https://dissem.in/not_free'}], False),
            ([{}], False),
            ([], False),
        ],
    )
    def test_is_oa_by_license(self, licenses, expected):
        """
        The license check must give the expected boolean
        """
        assert self.test_class._is_oa_by_license(licenses) == expected

    @pytest.mark.parametrize(
        'data, expected',
        [
            ({'date-parts': [[2019, 10, 10]]}, date(2019, 10, 10)),
            ({'raw': '2019-10-10'}, date(2019, 10, 10)),
            (None, None),
            ({'spam': 'ham'}, None),
        ],
    )
    def test_parse_date(self, data, expected):
        """
        A valid date or None must be returned
        """
        assert self.test_class._parse_date(data) == expected

    @pytest.mark.parametrize(
        'date_parts, expected',
        [
            ([2019, ], date(2019, 1, 1)),
            ([2019, 10, ], date(2019, 10, 1)),
            ([2019, 10, 10], date(2019, 10, 10)),
        ],
    )
    def test_parse_date_parts(self, date_parts, expected):
        """
        The list of date parts must be parsed into a date
        """
        assert self.test_class._parse_date_parts(date_parts) == expected
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO.

    The journal is first looked up in the local model; RoMEO is only
    queried if that lookup fails. A journal found via RoMEO is saved
    together with its publisher.

    :param search_terms: a dictionary containing at least one of these
                         fields: ``issn``, ``jtitle``. It is not modified.
    :param matching_mode: sent to RoMEO as ``qtype`` unless it is 'exact'
    :returns: a Journal object, or None if a search term is longer than
              256 characters, RoMEO returns no journal, or RoMEO returns
              no publisher for it
    :raises ValueError: if ``search_terms`` contains other fields
    :raises MetadataSourceException: if RoMEO returns a journal without title
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(terms))

    # Make the title HTML-safe before searching for it in the database or in
    # the API. The title is passed as 'jtitle' ('title' is not an allowed
    # field, so checking for it never cleaned anything).
    if 'jtitle' in terms:
        terms['jtitle'] = kill_html(terms['jtitle'])

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        # str() instead of unicode(): works on Python 2 and 3 and matches
        # the ValueError message above
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'Terms were: ' + str(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional: without an <issn> element it stays None
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]
    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result