def test_get_nested(nested_json):
    """Check ``get_nested`` lookups at several depths of the fixture dict."""
    # Each entry pairs the key path handed to get_nested with the value the
    # fixture is expected to yield for it.
    expected_by_path = (
        (('a1',), 'example_a1'),
        (('a', 'b'), 'example_b'),
        (('a', 'b1', 'c'), 'example_c'),
        (('a', 'b2'), ''),
    )
    for key_path, expected in expected_by_path:
        assert get_nested(nested_json, *key_path) == expected
def parse(self, response):
    """Parse an APS JSON file into HEP records.

    Yields one loaded item for every entry of the ``data`` list in the JSON
    body of ``response``. After all articles have been processed, yields a
    ``Request`` for the next page when the response has a ``Link`` header
    that contains a ``rel="next"`` relation.

    :param response: response whose body is the APS JSON document.
    """
    aps_response = json.loads(response.body_as_unicode())

    for article in aps_response['data']:
        record = HEPLoader(item=HEPRecord(), response=response)

        dois = get_nested(article, 'identifiers', 'doi')
        record.add_value('dois', dois)

        # Article types without an entry in the mapping fall back to 'other'.
        journal_doctype = self.article_type_mapping.get(
            article.get('articleType'), 'other')
        if journal_doctype == 'other':
            logger.warning(
                'Journal_doctype is %s for article %s. '
                'Do we need other mapping for this?',
                journal_doctype, dois)
        record.add_value('journal_doctype', journal_doctype)

        page_nr = article.get('numPages')
        if page_nr is not None:
            record.add_value('page_nr', page_nr)

        arxiv = get_nested(
            article, 'identifiers', 'arxiv').replace('arXiv:', '')
        if not arxiv:
            logger.warning('No arxiv eprints found for article %s.', dois)
        else:
            record.add_value('arxiv_eprints', {'value': arxiv})

        record.add_value('abstract',
                         get_nested(article, 'abstract', 'value'))
        record.add_value('title', get_nested(article, 'title', 'value'))

        authors, collaborations = self._get_authors_and_collab(
            article, dois)
        record.add_value('authors', authors)
        record.add_value('collaborations', collaborations)

        record.add_value('journal_title',
                         get_nested(article, 'journal', 'name'))
        record.add_value('journal_issue',
                         get_nested(article, 'issue', 'number'))
        record.add_value('journal_volume',
                         get_nested(article, 'volume', 'number'))

        # The first four characters of the date string are used as the year.
        published_date = article['date']
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)

        record.add_value('field_categories', [
            {
                'term': term.get('label'),
                'scheme': 'APS',
                'source': '',
            }
            for term in get_nested(
                article, 'classificationSchemes', 'subjectAreas')
        ])

        copyright_holders = get_nested(
            article, 'rights', 'copyrightHolders')
        if copyright_holders:
            record.add_value('copyright_holder',
                             copyright_holders[0]['name'])

        record.add_value(
            'copyright_year',
            str(get_nested(article, 'rights', 'copyrightYear')))
        record.add_value('copyright_statement',
                         get_nested(article, 'rights', 'rightsStatement'))

        # Guard the lookup: indexing an empty result would raise inside this
        # generator and drop the remaining articles and the next-page request.
        licenses = get_nested(article, 'rights', 'licenses')
        if licenses:
            record.add_value(
                'license', get_license(license_url=licenses[0]['url']))
        else:
            logger.warning('No license found for article %s.', dois)

        record.add_value('collections', ['HEP', 'Citeable', 'Published'])
        yield record.load_item()

    # Pagination support. Will yield until no more "next" pages are found
    if 'Link' in response.headers:
        links = link_header.parse(response.headers['Link'])
        next_links = links.links_by_attr_pairs([('rel', 'next')])
        if next_links:
            yield Request(next_links[0].href)