Example #1
0
def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X.
    
    :param: since - RFC889 formatted string
    :type: str
    
    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-fetching orcidids updated since: {0}'.format(
        from_date.isoformat()))

    # then get all new/old orcidids from orcid-service
    orcidids = set(updater.get_all_touched_profiles(app,
                                                    from_date.isoformat()))
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': False
            })
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': False
            })

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
Example #2
0
def run():
    offended_authors = []
    orcidids = set()

    # go through the rows in order and count those that had all rows removed
    with app.session_scope() as session:
        for r in session.query(ClaimsLog).distinct(
                ClaimsLog.orcidid).yield_per(100):
            orcidids.add(r.orcidid)

        print 'collected', len(orcidids), 'orcidids'

        j = 0
        for orcid in orcidids:
            j += 1
            i = removed = others = 0

            if j % 100 == 0:
                print 'processing', j, 'authors, found so far', len(
                    offended_authors)

            for r in session.query(ClaimsLog).filter(
                    ClaimsLog.orcidid == orcid).order_by(
                        ClaimsLog.id.desc()).yield_per(1000):
                if r.status == '#full-import':  # that concludes the batch
                    if removed == i and i > 0:
                        offended_authors.append(r.orcidid)
                    break

                if r.status == 'removed':
                    removed += 1
                else:
                    others += 1

                i += 1

    print 'found', len(offended_authors), 'instances of all-removed profiles'

    if 'submit' in sys.argv:
        for x in offended_authors:
            tasks.task_index_orcid_profile({'orcidid': x})
Example #3
0
    def test_task_index_orcid_profile(self):
        """Run ``tasks.task_index_orcid_profile`` with its collaborators mocked.

        While ``retrieve_orcid`` reports status 'blacklisted' no call to
        ``tasks.task_match_claim.delay`` may happen. Once the status is None,
        the four non-'#full-import' rows returned by the mocked
        ``insert_claims`` must each reach ``delay``, in the same order.
        """
        orcidid = '0000-0003-3041-2092'

        def day():
            # a fresh value on every use, one call per fixture entry
            return utils.get_date('2017-01-01')

        def stored_profile(status):
            # value handed back by the mocked app.retrieve_orcid;
            # only the status differs between the two scenarios
            return {
                'status': status,
                'name': u'Stern, D K',
                'facts': {
                    u'author': [u'Stern, D', u'Stern, D K', u'Stern, Daniel'],
                    u'orcid_name': [u'Stern, Daniel'],
                    u'author_norm': [u'Stern, D'],
                    u'name': u'Stern, D K'
                },
                'orcidid': u'0000-0003-2686-9241',
                'id': 1,
                'account_id': None,
                'updated': day()
            }

        def claim_row(status, bibcode, created, provenance):
            # one entry of the list returned by the mocked app.insert_claims
            return {
                'status': status,
                'bibcode': bibcode,
                'created': created,
                'provenance': provenance,
                'orcidid': orcidid,
                'id': None
            }

        with patch.object(self.app, 'retrieve_orcid') as retrieve_orcid, \
                patch.object(tasks.requests, 'get') as http_get, \
                patch.object(self.app, 'get_claims') as get_claims, \
                patch.object(self.app, 'insert_claims') as insert_claims, \
                patch.object(tasks.task_index_orcid_profile, 'apply_async'), \
                patch.object(tasks.task_match_claim, 'delay') as match_delay:

            # canned HTTP response for tasks.requests.get
            payload = {
                'bibcode': {
                    'status': 'some status',
                    'title': 'some title'
                }
            }
            response = PropertyMock()
            response.text = str(payload)
            response.json = lambda: payload
            response.status_code = 200
            http_get.return_value = response

            # the three dictionaries returned together by app.get_claims
            detailed = {
                'bibcode1': ('Bibcode1', day(), 'provenance',
                             ['id1', 'id2'], ['Stern, D K', 'author two']),
                'bibcode2': ('Bibcode2', day(), 'provenance',
                             ['id1', 'id2'], ['author one', 'Stern, D K']),
                'bibcode3': ('Bibcode3', day(), 'provenance',
                             ['id1', 'id2'], ['Stern, D K', 'author two']),
            }
            stored = {
                'bibcode1': ('Bibcode1', day()),
                # we have, but orcid no more
                'bibcode4': ('Bibcode4', day()),
            }
            remainder = {
                'bibcode2': ('Bibcode2', day()),
            }
            get_claims.return_value = (detailed, stored, remainder)

            insert_claims.return_value = [
                claim_row(u'#full-import', u'',
                          '2017-05-26T21:29:22.726506+00:00',
                          u'OrcidImporter'),
                claim_row(u'claimed', 'Bibcode2',
                          '2017-01-01T00:00:00+00:00', u'provenance'),
                claim_row(u'claimed', 'Bibcode3',
                          '2017-01-01T00:00:00+00:00', u'provenance'),
                claim_row(u'removed', 'Bibcode4',
                          '2017-05-26T21:29:22.728368+00:00',
                          u'OrcidImporter'),
                claim_row(u'unchanged', 'Bibcode1',
                          '2017-01-01T00:00:00+00:00', u'OrcidImporter'),
            ]

            self.assertFalse(match_delay.called)

            # check authors can be skipped
            retrieve_orcid.return_value = stored_profile('blacklisted')
            tasks.task_index_orcid_profile({'orcidid': orcidid})
            self.assertFalse(match_delay.called)

            # same profile without a status: claims must now be processed
            retrieve_orcid.return_value = stored_profile(None)
            tasks.task_index_orcid_profile({'orcidid': orcidid})

            self.assertTrue(match_delay.called)
            self.assertEqual(match_delay.call_count, 4)

            forwarded = [('Bibcode2', u'claimed'),
                         ('Bibcode3', u'claimed'),
                         ('Bibcode4', u'removed'),
                         ('Bibcode1', u'unchanged')]

            inserted = [(claim.bibcode, claim.status)
                        for claim in insert_claims.call_args[0][0]]
            self.assertEqual(inserted,
                             [(u'', u'#full-import')] + forwarded)

            delivered = [(call[0][0]['bibcode'], call[0][0]['status'])
                         for call in match_delay.call_args_list]
            self.assertEqual(delivered, forwarded)

            # first positional argument of the first delay() call
            first_message = match_delay.call_args_list[0][0][0]
            self.assertEqual(
                (first_message['bibcode'], first_message['author_list']),
                ('Bibcode2', ['author one', 'Stern, D K']))
            self.assertEqual(
                (first_message['bibcode'], first_message['identifiers']),
                ('Bibcode2', ['id1', 'id2']))