Example #1
    def test_task_update_record(self):
        with patch('adsmp.tasks.task_index_records.apply_async') as next_task, \
             patch('adsmp.app.ADSMasterPipelineCelery.request_aff_augment') as augment:
            tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S'))
            self.assertFalse(next_task.called)
            self.assertTrue(augment.called)

        with patch('adsmp.solr_updater.delete_by_bibcodes',
                   return_value=[('2015ApJ...815..133S',), ()]) as solr_delete, \
             patch('adsmp.app.ADSMasterPipelineCelery.request_aff_augment') as augment, \
             patch.object(self.app, 'metrics_delete_by_bibcode', return_value=True) as metrics_delete:
            tasks.task_update_record(DenormalizedRecord(bibcode='2015ApJ...815..133S',
                                                        status='deleted'))
            self.assertTrue(solr_delete.called)
            self.assertTrue(metrics_delete.called)
            self.assertFalse(augment.called)
Example #2
    def test_task_update_record(self):
        with patch('adsmp.tasks.task_index_records.delay') as next_task:
            self.assertFalse(next_task.called)
            tasks.task_update_record(
                DenormalizedRecord(bibcode='2015ApJ...815..133S'))
            self.assertTrue(next_task.called)
            self.assertEqual(next_task.call_args[0], ('2015ApJ...815..133S',))

        with patch('adsmp.solr_updater.delete_by_bibcodes',
                   return_value=[('2015ApJ...815..133S',), ()]) as solr_delete:
            tasks.task_update_record(
                DenormalizedRecord(bibcode='2015ApJ...815..133S',
                                   status='deleted'))
            self.assertEqual(next_task.call_args[0], ('2015ApJ...815..133S',))
            self.assertTrue(solr_delete.called)
Example #3
    def test_task_update_record_delete(self):

        for x, cls in (('fulltext', FulltextUpdate),
                       ('orcid_claims', OrcidClaims)):
            self.app.update_storage('bibcode', x, {'foo': 'bar'})
            self.assertEqual(self.app.get_record('bibcode')[x]['foo'], 'bar')
            with patch('adsmp.tasks.task_index_records.delay') as next_task:
                tasks.task_update_record(
                    cls(bibcode='bibcode', status='deleted'))
                self.assertIsNone(self.app.get_record('bibcode')[x])
                self.assertTrue(self.app.get_record('bibcode'))

        recs = NonBibRecordList()
        recs.nonbib_records.extend(
            [NonBibRecord(bibcode='bibcode', status='deleted').data])
        with patch('adsmp.tasks.task_index_records.delay') as next_task:
            tasks.task_update_record(recs)
            self.assertIsNone(self.app.get_record('bibcode')['metrics'])
            self.assertTrue(self.app.get_record('bibcode'))

        with patch('adsmp.tasks.task_delete_documents') as next_task:
            tasks.task_update_record(
                DenormalizedRecord(bibcode='bibcode', status='deleted'))
            self.assertTrue(next_task.called)
            self.assertEqual(next_task.call_args[0], ('bibcode',))
Example #4
    def test_task_index_links(self):
        """verify data is sent to links microservice update endpoint"""
        r = Mock()
        r.status_code = 200

        # just make sure we have the entry in a database
        tasks.task_update_record(DenormalizedRecord(bibcode='linkstest'))

        with patch.object(self.app, 'get_record', return_value={'bibcode': 'linkstest',
                                                                'nonbib_data': {'data_links_rows': [{'baz': 0}]},
                                                                'bib_data_updated': get_date(),
                                                                'nonbib_data_updated': get_date(),
                                                                'processed': get_date('2025')}), \
             patch('requests.put', return_value=r, new_callable=CopyingMock) as p:
            tasks.task_index_records(['linkstest'],
                                     update_solr=False,
                                     update_metrics=False,
                                     update_links=True,
                                     force=True)
            p.assert_called_with('http://localhost:8080/update',
                                 data=json.dumps([{'bibcode': 'linkstest',
                                                   'data_links_rows': [{'baz': 0}]}]),
                                 headers={'Authorization': 'Bearer api_token'})

        rec = self.app.get_record(bibcode='linkstest')
        self.assertEqual(rec['datalinks_checksum'], '0x80e85169')
        self.assertIsNone(rec['solr_checksum'])
        self.assertIsNone(rec['metrics_checksum'])
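Note: CopyingMock is referenced in the test above but not defined in the snippet. A minimal sketch, assuming the standard recipe from the unittest.mock documentation (the actual test suite may define it differently):

# Sketch of CopyingMock per the unittest.mock docs recipe (assumption:
# the test suite uses an equivalent definition).
from copy import deepcopy
from unittest.mock import MagicMock

class CopyingMock(MagicMock):
    def __call__(self, *args, **kwargs):
        # Deep-copy the arguments at call time so assert_called_with
        # compares against the values as passed, not as later mutated.
        args = deepcopy(args)
        kwargs = deepcopy(kwargs)
        return super(CopyingMock, self).__call__(*args, **kwargs)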
Example #5
    def serialize(self, record, **kwargs):
        if len(record) > 0:
            rec = DenormalizedRecord(**record)
            task_output_results.delay(rec)
        else:
            print("Null record, not sending to master pipeline")
Example #6
    def test_serialization(self):
        abstract = "This is a dummy abstract."
        author = "Foe, J."
        author_count = 1

        denormalized_record = DenormalizedRecord()
        denormalized_record.data.abstract = abstract
        denormalized_record.data.author.append(author)
        denormalized_record.data.author_count = author_count
        data = denormalized_record.serialize()
        self.assertEqual(data, '\n\x19{0}:\x07{1}@\x01'.format(abstract, author))
        data_str = str(denormalized_record)
        self.assertEqual(data_str, 'abstract: "{0}"\nauthor: "{1}"\nauthor_count: {2}\n'.format(abstract, author, author_count))
        self.assertNotEqual(data, data_str)

        recovered_bibrecord = DenormalizedRecord.deserializer(data)
        self.assertTrue(recovered_bibrecord.is_valid())
        self.assertEqual(recovered_bibrecord.data.abstract, abstract)
        self.assertEqual(recovered_bibrecord.data.author[0], author)
        self.assertEqual(recovered_bibrecord.data.author_count, author_count)
Example #7
def task_output_results(msg):
    """
    This worker will forward results to the outside
    exchange (typically an ADSMasterPipeline) to be
    incorporated into the storage.
    
    :param msg: contains the bibliographic metadata
            
            {'bibcode': '....',
             'authors': [....],
             'title': '.....',
             .....
            }
    :return: no return
    """
    logger.debug('Will forward this record: %s', msg)
    rec = DenormalizedRecord(**msg)
    app.forward_message(rec)
    app.update_processed_timestamp(rec.bibcode)
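A hedged sketch of a message this task could receive. The keys must be valid DenormalizedRecord fields; Examples #6 and #9 show that the protobuf uses a repeated 'author' field and a list-valued 'title', so the docstring's 'authors' key is illustrative only:

# Hypothetical message, using field names confirmed by Examples #6 and #9.
msg = {
    'bibcode': '2015ApJ...815..133S',
    'author': ['Shinn, Jong-Ho', 'Seon, Kwang-Il'],
    'title': ['Ultraviolet Radiative Transfer Modeling of Nearby Galaxies'],
}
task_output_results.delay(msg)  # enqueued asynchronously, as in Example #5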
Example #8
def build_record(app, citation_change, parsed_metadata, citations):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', u"")
    title = parsed_metadata.get('title', u"")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', [u'-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    source = parsed_metadata.get('source', u"Unknown")
    version = parsed_metadata.get('version', u"")
    doctype = parsed_metadata.get('doctype', u"software")
    # Clean abstract and title
    abstract = u''.join(BeautifulSoup(abstract, features="lxml")
                        .findAll(text=True)).replace('\n', ' ').replace('\r', '')
    title = u''.join(BeautifulSoup(title, features="lxml")
                     .findAll(text=True)).replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(
        itertools.chain.from_iterable(
            zip(["0/" + a for a in normalized_authors], [
                "1/" + a[0] + "/" + a[1]
                for a in zip(normalized_authors, authors)
            ])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': u'',
        'aff': [u"-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': [u'zndo'],
        'bibstem_facet': u'zndo',
        'copyright': [],
        'comment': [],
        'database': [u'general', u'astronomy'],
        'entry_date': date2solrstamp(citation_change.timestamp.ToDatetime()),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes in advance? This is
        # based on ADSImportPipeline SolrAdapter.
        'date': (citation_change.timestamp.ToDatetime() +
                 datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'doctype': doctype,
        'doctype_facet_hier': [u"0/Non-Article", u"1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': [u'-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else u'',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else u'',
        # TODO: How is links_data different from nonbib?
        'links_data': [u'{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': [u"PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': [u"-"] * n_keywords,
        'keyword_schema': [u"-"] * n_keywords,
        'property': [u"ESOURCE", u"NONARTICLE", u"NOT REFEREED", u"PUB_OPENACCESS", u"OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the field does not exist
        # and if this key does not exist in the dict/protobuf, the message will be
        # treated as new/update by MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active
    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, status)
    return record, nonbib_record
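A standalone sketch of the author_facet_hier construction used above, with hypothetical sample names, showing how zip plus itertools.chain.from_iterable interleaves the two facet levels:

import itertools

normalized_authors = ["Blanco-Cuaresma, S", "Soubiran, C"]  # sample data
authors = ["Blanco-Cuaresma, Sergi", "Soubiran, Caroline"]  # sample data

author_facet_hier = list(itertools.chain.from_iterable(
    zip(["0/" + a for a in normalized_authors],
        ["1/" + n + "/" + f for n, f in zip(normalized_authors, authors)])))
# ['0/Blanco-Cuaresma, S', '1/Blanco-Cuaresma, S/Blanco-Cuaresma, Sergi',
#  '0/Soubiran, C', '1/Soubiran, C/Soubiran, Caroline']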
Example #9
    def test_full_record(self):
        """This is here also as a documentation."""
        solr_record = {
             'abstract': u'abstract abstract',
             'ack': u'J.H.S. is grateful to Yujin Yang',
             'aff': [u'aff1', u'aff2'],
             'alternate_bibcode': [u'2015arXiv151103789S'],
             'arxiv_class': [u'Astrophysics - Astrophysics of Galaxies'],
             'author': [u'Shinn, Jong-Ho', u'Seon, Kwang-Il'],
             'author_count': 2,
             'author_facet': [u'Shinn, J', u'Seon, K'],
             'author_facet_hier': [u'0/Shinn, J',
                  u'1/Shinn, J/Shinn, Jong-Ho',
                  u'0/Seon, K',
                  u'1/Seon, K/Seon, Kwang-Il'],
             'author_norm': [u'Shinn, J', u'Seon, K'],
             'bibcode': u'2015ApJ...815..133S',
             'bibstem': [u'ApJ', u'ApJ...815'],
             'bibstem_facet': u'ApJ',
             'body': u"body body",
             'citation_count': 0,
             'citation_count_norm': .2,
             'data_count': 20,
             'database': [u'astronomy'],
             'date': u'2015-12-01T00:00:00.000000Z',
             'doctype': u'article',
             'doctype_facet_hier': [u'0/Article', u'1/Article/Journal Article'],
             'doi': [u'10.1088/0004-637X/815/2/133'],
             'eid': u'133',
             'email': [u'*****@*****.**', u'-'],
             'entry_date': u'2015-12-01T00:00:00.000000Z',
             'esources': [u'AUTHOR_PUB', u'PUB_HTML'],
             'first_author': u'Shinn, Jong-Ho',
             'first_author_facet_hier': [u'0/Shinn, J', u'1/Shinn, J/Shinn, Jong-Ho'],
             'first_author_norm': u'Shinn, J',
             'fulltext_mtime': u'2019-12-01T00:00:00.000000Z',
             'identifier': [u'1511.03789',
                            u'10.1088/0004-637X/815/2/133',
                            u'2015arXiv151103789S'],
             'issue': u'2',
             'keyword': [u'dust', u'extinction', u'galaxies: halos'],
             'keyword_facet': [u'dust', u'ism dust extinction'],
             'keyword_norm': [u'dust', u'-'],
             'keyword_schema': [u'Astronomy', u'Astronomy'],
             'links_data': [u'{"access": "", "instances": "7", "title": "", "type": "simbad", "url": "http://$SIMBAD$/simbo.pl?bibcode=2015ApJ...815..133S"}',
              u'{"access": "open", "instances": "", "title": "", "type": "pdf", "url": "http://stacks.iop.org/0004-637X/815/133/pdf"}'],
             'metadata_mtime': u'2019-12-01T00:00:00.000000Z',
             'metrics_mtime': u'2019-12-01T00:00:00.000000Z',
             'nedid': [4, 5, 6],
             'nedtype': [u'foo', u'bar', u'baz'],
             'ned_object_facet_hier': [u'0/foo', u'1/foo/star'],
             'nonbib_mtime': u'2019-12-01T00:00:00.000000Z',
             'origin': [u'Elsevier', u'ADS metadata'],
             'orcid_mtime': u'2019-12-01T00:00:00.000000Z',
             'page': [u'133'],
             'page_count': 15,
             'page_range': u'133-148',
             'property': [u'OPENACCESS', u'REFEREED'],
             'pub': u'The Astrophysical Journal',
             'pub_raw': u'The Astrophysical Journal, Volume 815, Issue 2, article id. 133, <NUMPAGES>14</NUMPAGES> pp. (2015).',
             'pubdate': u'2015-12-00',
             'pubnote': [u'33 pages, 7 figures, 5 tables, ApJ in press; doi:10.1088/0004-637X/815/2/133'],
             'read_count': 10,
             'reference': [u'1941ApJ....93...70H', u'1966ApJ...145..811P'],
             'simbid': [1, 2, 3],
             'title': [u'Ultraviolet Radiative Transfer Modeling of Nearby Galaxies'],
             'volume': u'815',
             'year': u'2015',
             'series': u'series name here'}

        r = DenormalizedRecord(**solr_record)
        
        # protobuf actually removes zero values, which is imho cool....
        expected = {}
        expected.update(solr_record)
        expected.pop('citation_count')
        self.maxDiff = None
        self.assertEqual(expected, r.toJSON())
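A minimal sketch of the zero-value behavior the comment refers to: proto3 omits fields left at their default value, which is why citation_count (0 in the record above) is popped from the expected dict:

# Assuming adsmsg's DenormalizedRecord; toJSON() drops proto3 defaults.
r = DenormalizedRecord(bibcode=u'2015ApJ...815..133S',
                       citation_count=0, read_count=10)
r.toJSON()  # roughly {'bibcode': '2015ApJ...815..133S', 'read_count': 10}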
Example #10
    def test_is_valid(self):
        denormalized_record = DenormalizedRecord()
        self.assertTrue(denormalized_record.is_valid())
Example #11
def build_record(app,
                 citation_change,
                 parsed_metadata,
                 citations,
                 db_versions,
                 entry_date=None):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    if entry_date is None:
        entry_date = citation_change.timestamp.ToDatetime()
    # Check if the DOI points to a concept record or to a specific version
    is_release = parsed_metadata.get('version_of') not in (None, "", [])
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', "")
    title = parsed_metadata.get('title', "")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', ['-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    try:
        solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") +
                     datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        try:
            # If only a year is specified, assume January 1st of that year.
            logger.warning("Publication date does not conform to Y-m-d format. "
                           "Assuming only the year is specified.")
            pubdate = pubdate + "-01-01"
            solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") +
                         datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        except ValueError:
            # If that also fails, fall back to the current date. Running
            # maintenance_metadata can fix a bad publication date later if it
            # is updated upstream.
            logger.warning("Cannot parse publication date. Setting to current datetime.")
            solr_date = date2solrstamp(entry_date)

    source = parsed_metadata.get('source', "Unknown")
    version = parsed_metadata.get('version', "")
    doctype = parsed_metadata.get('doctype', "software")
    # Clean abstract and title
    abstract = ''.join(BeautifulSoup(abstract, features="lxml")
                       .findAll(text=True)).replace('\n', ' ').replace('\r', '')
    title = ''.join(BeautifulSoup(title, features="lxml")
                    .findAll(text=True)).replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(
        itertools.chain.from_iterable(
            zip(["0/" + a for a in normalized_authors], [
                "1/" + a[0] + "/" + a[1]
                for a in zip(normalized_authors, authors)
            ])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': '',
        'aff': ["-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': ['zndo'],
        'bibstem_facet': 'zndo',
        'copyright': [],
        'comment': [],
        'database': ['general', 'astronomy'],
        'entry_date': date2solrstamp(entry_date),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes in advance? This is
        # based on ADSImportPipeline SolrAdapter.
        'date': solr_date,
        'doctype': doctype,
        'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': ['-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else '',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else '',
        # TODO: How is links_data different from nonbib?
        'links_data': ['{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': ["PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': ["-"] * n_keywords,
        'keyword_schema': ["-"] * n_keywords,
        'property': ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    if version is None:  # Concept DOIs may not contain version
        del record_dict['version']
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the field does not exist
        # and if this key does not exist in the dict/protobuf, the message will be
        # treated as new/update by MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active
    if db_versions not in [{"": ""}, {}, None]:
        record_dict['property'].append('ASSOCIATED')
    if is_release:
        record_dict['property'].append('RELEASE')

    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record,
                                         db_versions, status)
    return record, nonbib_record
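The pubdate fallback chain above can also be read as a standalone helper. A hypothetical sketch, not part of the pipeline (the real code uses date2solrstamp(entry_date) as the final fallback):

import datetime

def to_solr_date(pubdate, fallback_date):
    """Try full Y-m-d first, then year-only ('2015' -> '2015-01-01'),
    then fall back to the supplied datetime."""
    fmt = '%Y-%m-%dT%H:%M:%S.%fZ'
    for candidate in (pubdate, pubdate + "-01-01"):
        try:
            parsed = datetime.datetime.strptime(candidate, "%Y-%m-%d")
            return (parsed + datetime.timedelta(minutes=30)).strftime(fmt)
        except ValueError:
            continue
    return fallback_date.strftime(fmt)

to_solr_date("2015-12-01", datetime.datetime.now())  # '2015-12-01T00:30:00.000000Z'
to_solr_date("2015", datetime.datetime.now())        # '2015-01-01T00:30:00.000000Z'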