def test_task_update_record(self):
    with patch('adsmp.tasks.task_index_records.apply_async') as next_task, \
            patch('adsmp.app.ADSMasterPipelineCelery.request_aff_augment') as augment:
        tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S'))
        self.assertFalse(next_task.called)
        self.assertTrue(augment.called)

    with patch('adsmp.solr_updater.delete_by_bibcodes',
               return_value=[('2015ApJ...815..133S',), ()]) as solr_delete, \
            patch('adsmp.app.ADSMasterPipelineCelery.request_aff_augment') as augment, \
            patch.object(self.app, 'metrics_delete_by_bibcode', return_value=True) as metrics_delete:
        tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S', status='deleted'))
        self.assertTrue(solr_delete.called)
        self.assertTrue(metrics_delete.called)
        self.assertFalse(augment.called)
def test_task_update_record(self):
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        self.assertFalse(next_task.called)
        tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S'))
        self.assertTrue(next_task.called)
        self.assertEqual(next_task.call_args[0], ('2015ApJ...815..133S',))

    with patch('adsmp.solr_updater.delete_by_bibcodes',
               return_value=[('2015ApJ...815..133S',), ()]) as solr_delete:
        tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S', status='deleted'))
        self.assertEqual(next_task.call_args[0], ('2015ApJ...815..133S',))
        self.assertTrue(solr_delete.called)
def test_task_update_record_delete(self):
    for x, cls in (('fulltext', FulltextUpdate), ('orcid_claims', OrcidClaims)):
        self.app.update_storage('bibcode', x, {'foo': 'bar'})
        self.assertEqual(self.app.get_record('bibcode')[x]['foo'], 'bar')
        with patch('adsmp.tasks.task_index_records.delay') as next_task:
            tasks.task_update_record(cls(bibcode='bibcode', status='deleted'))
            self.assertEqual(self.app.get_record('bibcode')[x], None)
            self.assertTrue(self.app.get_record('bibcode'))

    recs = NonBibRecordList()
    recs.nonbib_records.extend([NonBibRecord(bibcode='bibcode', status='deleted').data])
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        tasks.task_update_record(recs)
        self.assertEqual(self.app.get_record('bibcode')['metrics'], None)
        self.assertTrue(self.app.get_record('bibcode'))

    with patch('adsmp.tasks.task_delete_documents') as next_task:
        tasks.task_update_record(DenormalizedRecord(bibcode='bibcode', status='deleted'))
        self.assertTrue(next_task.called)
        self.assertEqual(next_task.call_args[0], ('bibcode',))
def test_task_index_links(self):
    """verify data is sent to links microservice update endpoint"""
    r = Mock()
    r.status_code = 200
    # just make sure we have the entry in a database
    tasks.task_update_record(DenormalizedRecord(bibcode='linkstest'))
    with patch.object(self.app, 'get_record',
                      return_value={'bibcode': 'linkstest',
                                    'nonbib_data': {'data_links_rows': [{'baz': 0}]},
                                    'bib_data_updated': get_date(),
                                    'nonbib_data_updated': get_date(),
                                    'processed': get_date('2025')}), \
            patch('requests.put', return_value=r, new_callable=CopyingMock) as p:
        tasks.task_index_records(['linkstest'], update_solr=False, update_metrics=False,
                                 update_links=True, force=True)
        p.assert_called_with('http://localhost:8080/update',
                             data=json.dumps([{'bibcode': 'linkstest',
                                               'data_links_rows': [{'baz': 0}]}]),
                             headers={'Authorization': 'Bearer api_token'})
    rec = self.app.get_record(bibcode='linkstest')
    self.assertEqual(rec['datalinks_checksum'], '0x80e85169')
    self.assertEqual(rec['solr_checksum'], None)
    self.assertEqual(rec['metrics_checksum'], None)
def serialize(self, record, **kwargs):
    if record:
        rec = DenormalizedRecord(**record)
        task_output_results.delay(rec)
    else:
        print("Null record, not sending to master pipeline")
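# --- usage sketch (not part of the source) --------------------------------
# Assuming DenormalizedRecord is imported from adsmsg, the dict-to-protobuf
# step above can be exercised standalone; the record fields here are
# illustrative, not required by the method.
record = {'bibcode': '2015ApJ...815..133S',
          'title': ['Ultraviolet Radiative Transfer Modeling of Nearby Galaxies']}
rec = DenormalizedRecord(**record)  # same construction serialize() performs
# An empty dict ({}) would instead hit the else branch and be skipped.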
def test_serialization(self):
    abstract = "This is a dummy abstract."
    author = "Foe, J."
    author_count = 1
    denormalized_record = DenormalizedRecord()
    denormalized_record.data.abstract = abstract
    denormalized_record.data.author.append(author)
    denormalized_record.data.author_count = author_count

    data = denormalized_record.serialize()
    self.assertEqual(data, '\n\x19{0}:\x07{1}@\x01'.format(abstract, author))

    data_str = str(denormalized_record)
    self.assertEqual(data_str,
                     'abstract: "{0}"\nauthor: "{1}"\nauthor_count: {2}\n'.format(
                         abstract, author, author_count))
    self.assertNotEqual(data, data_str)

    recovered_record = DenormalizedRecord.deserializer(data)
    self.assertTrue(recovered_record.is_valid())
    self.assertEqual(recovered_record.data.abstract, abstract)
    self.assertEqual(recovered_record.data.author[0], author)
    self.assertEqual(recovered_record.data.author_count, author_count)
def task_output_results(msg):
    """
    This worker forwards results to the outside exchange (typically the
    ADSMasterPipeline) to be incorporated into storage.

    :param msg: contains the bibliographic metadata

            {'bibcode': '....',
             'authors': [....],
             'title': '....',
             ...}

    :return: no return
    """
    logger.debug('Will forward this record: %s', msg)
    rec = DenormalizedRecord(**msg)
    app.forward_message(rec)
    app.update_processed_timestamp(rec.bibcode)
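# --- invocation sketch (not part of the source) ----------------------------
# In production the msg arrives via Celery, but the task body can be called
# directly. Only illustrative fields are shown; 'bibcode' is the one field
# the task itself relies on (for update_processed_timestamp).
msg = {
    'bibcode': '2015ApJ...815..133S',
    'title': ['Ultraviolet Radiative Transfer Modeling of Nearby Galaxies'],
    'author': ['Shinn, Jong-Ho', 'Seon, Kwang-Il'],
}
task_output_results(msg)  # forwards the record and stamps its processed time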
def build_record(app, citation_change, parsed_metadata, citations):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception("Only records with a bibcode can be forwarded to master")
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', u"")
    title = parsed_metadata.get('title', u"")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', [u'-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    source = parsed_metadata.get('source', u"Unknown")
    version = parsed_metadata.get('version', u"")
    doctype = parsed_metadata.get('doctype', u"software")
    # Clean abstract and title
    abstract = u''.join(BeautifulSoup(abstract, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    title = u''.join(BeautifulSoup(title, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(itertools.chain.from_iterable(
        zip(["0/" + a for a in normalized_authors],
            ["1/" + a[0] + "/" + a[1] for a in zip(normalized_authors, authors)])))
    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': u'',
        'aff': [u"-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': [u'zndo'],
        'bibstem_facet': u'zndo',
        'copyright': [],
        'comment': [],
        'database': [u'general', u'astronomy'],
        'entry_date': date2solrstamp(citation_change.timestamp.ToDatetime()),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes in advance? This is based on ADSImportPipeline SolrAdapter
        'date': (citation_change.timestamp.ToDatetime() + datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'doctype': doctype,
        'doctype_facet_hier': [u"0/Non-Article", u"1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': [u'-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else u'',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else u'',
        'links_data': [u'{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],  # TODO: How is it different from nonbib?
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': [u"PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': [u"-"] * n_keywords,
        'keyword_schema': [u"-"] * n_keywords,
        'property': [u"ESOURCE", u"NONARTICLE", u"NOT REFEREED", u"PUB_OPENACCESS", u"OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the
        # field does not exist; and if this key does not exist in the
        # dict/protobuf, the message will be treated as new/update by
        # MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active
    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, status)
    return record, nonbib_record
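# --- standalone check (not part of the source) -----------------------------
# The author_facet_hier interleaving above is easy to misread; this is the
# same zip/chain pattern run on its own (full author names are illustrative).
import itertools

normalized_authors = ["Blanco-Cuaresma, S", "Soubiran, C"]
authors = ["Blanco-Cuaresma, Sergi", "Soubiran, Caroline"]
author_facet_hier = list(itertools.chain.from_iterable(
    zip(["0/" + a for a in normalized_authors],
        ["1/" + a[0] + "/" + a[1] for a in zip(normalized_authors, authors)])))
# ['0/Blanco-Cuaresma, S', '1/Blanco-Cuaresma, S/Blanco-Cuaresma, Sergi',
#  '0/Soubiran, C', '1/Soubiran, C/Soubiran, Caroline']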
def test_full_record(self):
    """This is here also as documentation."""
    solr_record = {
        'abstract': u'abstract abstract',
        'ack': u'J.H.S. is grateful to Yujin Yang',
        'aff': [u'aff1', u'aff2'],
        'alternate_bibcode': [u'2015arXiv151103789S'],
        'arxiv_class': [u'Astrophysics - Astrophysics of Galaxies'],
        'author': [u'Shinn, Jong-Ho', u'Seon, Kwang-Il'],
        'author_count': 2,
        'author_facet': [u'Shinn, J', u'Seon, K'],
        'author_facet_hier': [u'0/Shinn, J', u'1/Shinn, J/Shinn, Jong-Ho',
                              u'0/Seon, K', u'1/Seon, K/Seon, Kwang-Il'],
        'author_norm': [u'Shinn, J', u'Seon, K'],
        'bibcode': u'2015ApJ...815..133S',
        'bibstem': [u'ApJ', u'ApJ...815'],
        'bibstem_facet': u'ApJ',
        'body': u'body body',
        'citation_count': 0,
        'citation_count_norm': .2,
        'data_count': 20,
        'database': [u'astronomy'],
        'date': u'2015-12-01T00:00:00.000000Z',
        'doctype': u'article',
        'doctype_facet_hier': [u'0/Article', u'1/Article/Journal Article'],
        'doi': [u'10.1088/0004-637X/815/2/133'],
        'eid': u'133',
        'email': [u'*****@*****.**', u'-'],
        'entry_date': u'2015-12-01T00:00:00.000000Z',
        'esources': [u'AUTHOR_PUB', u'PUB_HTML'],
        'first_author': u'Shinn, Jong-Ho',
        'first_author_facet_hier': [u'0/Shinn, J', u'1/Shinn, J/Shinn, Jong-Ho'],
        'first_author_norm': u'Shinn, J',
        'fulltext_mtime': u'2019-12-01T00:00:00.000000Z',
        'identifier': [u'1511.03789', u'10.1088/0004-637X/815/2/133', u'2015arXiv151103789S'],
        'issue': u'2',
        'keyword': [u'dust', u'extinction', u'galaxies: halos'],
        'keyword_facet': [u'dust', u'ism dust extinction'],
        'keyword_norm': [u'dust', u'-'],
        'keyword_schema': [u'Astronomy', u'Astronomy'],
        'links_data': [u'{"access": "", "instances": "7", "title": "", "type": "simbad", "url": "http://$SIMBAD$/simbo.pl?bibcode=2015ApJ...815..133S"}',
                       u'{"access": "open", "instances": "", "title": "", "type": "pdf", "url": "http://stacks.iop.org/0004-637X/815/133/pdf"}'],
        'metadata_mtime': u'2019-12-01T00:00:00.000000Z',
        'metrics_mtime': u'2019-12-01T00:00:00.000000Z',
        'nedid': [4, 5, 6],
        'nedtype': [u'foo', u'bar', u'baz'],
        'ned_object_facet_hier': [u'0/foo', u'1/foo/star'],
        'nonbib_mtime': u'2019-12-01T00:00:00.000000Z',
        'origin': [u'Elsevier', u'ADS metadata'],
        'orcid_mtime': u'2019-12-01T00:00:00.000000Z',
        'page': [u'133'],
        'page_count': 15,
        'page_range': u'133-148',
        'property': [u'OPENACCESS', u'REFEREED'],
        'pub': u'The Astrophysical Journal',
        'pub_raw': u'The Astrophysical Journal, Volume 815, Issue 2, article id. 133, <NUMPAGES>14</NUMPAGES> pp. (2015).',
        'pubdate': u'2015-12-00',
        'pubnote': [u'33 pages, 7 figures, 5 tables, ApJ in press; doi:10.1088/0004-637X/815/2/133'],
        'read_count': 10,
        'reference': [u'1941ApJ....93...70H', u'1966ApJ...145..811P'],
        'simbid': [1, 2, 3],
        'title': [u'Ultraviolet Radiative Transfer Modeling of Nearby Galaxies'],
        'volume': u'815',
        'year': u'2015',
        'series': u'series name here'}
    r = DenormalizedRecord(**solr_record)
    # protobuf actually removes zero values, which is imho cool....
    expected = {}
    expected.update(solr_record)
    expected.pop('citation_count')
    self.maxDiff = None
    self.assertEqual(expected, r.toJSON())
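# --- illustration (not part of the source) ---------------------------------
# The citation_count pop above works because proto3 omits scalar fields that
# still hold their default value when mapping to JSON. A minimal sketch of
# the same effect, assuming toJSON() follows the standard proto3 JSON mapping:
r = DenormalizedRecord(bibcode=u'2015ApJ...815..133S', citation_count=0, read_count=10)
data = r.toJSON()
assert 'citation_count' not in data  # 0 is the proto3 default, so it is dropped
assert data['read_count'] == 10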
def test_is_valid(self):
    denormalized_record = DenormalizedRecord()
    self.assertTrue(denormalized_record.is_valid())
def build_record(app, citation_change, parsed_metadata, citations, db_versions, entry_date=None):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception("Only records with a bibcode can be forwarded to master")
    if entry_date is None:
        entry_date = citation_change.timestamp.ToDatetime()
    # Check if the DOI points to a concept record or to a specific version
    if parsed_metadata.get('version_of', None) not in (None, "", []):
        is_release = True
    else:
        is_release = False
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', "")
    title = parsed_metadata.get('title', "")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', ['-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    try:
        solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d")
                     + datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        try:
            # If only a year is specified, assume January 1st of that year.
            logger.warn("Publication date does not conform to Y-m-d format. Assuming only year is specified.")
            pubdate = pubdate + "-01-01"
            solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d")
                         + datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        except ValueError:
            # If the above fails, just set it to the current date. Running
            # maintenance_metadata could fix the bad publication date in the
            # future if it is updated upstream.
            logger.warn("Cannot parse publication date. Setting to current datetime.")
            solr_date = date2solrstamp(entry_date)
    source = parsed_metadata.get('source', "Unknown")
    version = parsed_metadata.get('version', "")
    doctype = parsed_metadata.get('doctype', "software")
    # Clean abstract and title
    abstract = ''.join(BeautifulSoup(abstract, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    title = ''.join(BeautifulSoup(title, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(itertools.chain.from_iterable(
        zip(["0/" + a for a in normalized_authors],
            ["1/" + a[0] + "/" + a[1] for a in zip(normalized_authors, authors)])))
    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': '',
        'aff': ["-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': ['zndo'],
        'bibstem_facet': 'zndo',
        'copyright': [],
        'comment': [],
        'database': ['general', 'astronomy'],
        'entry_date': date2solrstamp(entry_date),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes in advance? This is based on ADSImportPipeline SolrAdapter
        'date': solr_date,
        'doctype': doctype,
        'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': ['-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else '',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else '',
        'links_data': ['{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],  # TODO: How is it different from nonbib?
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': ["PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': ["-"] * n_keywords,
        'keyword_schema': ["-"] * n_keywords,
        'property': ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    if version is None:  # Concept DOIs may not contain a version
        del record_dict['version']
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the
        # field does not exist; and if this key does not exist in the
        # dict/protobuf, the message will be treated as new/update by
        # MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active
    if db_versions not in [{"": ""}, {}, None]:
        record_dict['property'].append('ASSOCIATED')
    if is_release:
        record_dict['property'].append('RELEASE')
    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status)
    return record, nonbib_record
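# --- standalone check (not part of the source) -----------------------------
# The pubdate fallback cascade above, mirrored as a small helper so it can be
# verified in isolation (illustrative only; logging and date2solrstamp are
# omitted, and `fallback` stands in for the entry-date stamp):
import datetime

def to_solr_date(pubdate, fallback):
    # Try the full Y-m-d form first, then a year-only value padded to Jan 1st.
    for candidate in (pubdate, pubdate + "-01-01"):
        try:
            return (datetime.datetime.strptime(candidate, "%Y-%m-%d")
                    + datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        except ValueError:
            continue
    return fallback  # unparseable: fall back to the entry-date stamp

to_solr_date("2015", "now")        # -> '2015-01-01T00:30:00.000000Z'
to_solr_date("2015-12-01", "now")  # -> '2015-12-01T00:30:00.000000Z'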