def refetch_orcidids(since=None, orcid_ids=None, **kwargs): """ Gets all orcidids that were updated since time X. :param: since - RFC889 formatted string :type: str :return: no return """ if orcid_ids: for oid in orcid_ids: tasks.task_index_orcid_profile({'orcidid': oid, 'force': False}) if not since: print 'Done (just the supplied orcidids)' return logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) logger.info('Re-fetching orcidids updated since: {0}'.format( from_date.isoformat())) # then get all new/old orcidids from orcid-service orcidids = set(updater.get_all_touched_profiles(app, from_date.isoformat())) from_date = get_date() for orcidid in orcidids: try: tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': False }) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': False }) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is None: kv = KeyValue(key='last.refetch', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
def run(): offended_authors = [] orcidids = set() # go through the rows in order and count those that had all rows removed with app.session_scope() as session: for r in session.query(ClaimsLog).distinct( ClaimsLog.orcidid).yield_per(100): orcidids.add(r.orcidid) print 'collected', len(orcidids), 'orcidids' j = 0 for orcid in orcidids: j += 1 i = removed = others = 0 if j % 100 == 0: print 'processing', j, 'authors, found so far', len( offended_authors) for r in session.query(ClaimsLog).filter( ClaimsLog.orcidid == orcid).order_by( ClaimsLog.id.desc()).yield_per(1000): if r.status == '#full-import': # that concludes the batch if removed == i and i > 0: offended_authors.append(r.orcidid) break if r.status == 'removed': removed += 1 else: others += 1 i += 1 print 'found', len(offended_authors), 'instances of all-removed profiles' if 'submit' in sys.argv: for x in offended_authors: tasks.task_index_orcid_profile({'orcidid': x})
def test_task_index_orcid_profile(self):
    """
    Run tasks.task_index_orcid_profile against mocked collaborators.

    The app's retrieve_orcid/get_claims/insert_claims, tasks.requests.get,
    the task's own apply_async and task_match_claim.delay are all patched.
    The test expects that:

    * a profile whose status is 'blacklisted' triggers no
      task_match_claim.delay call;
    * a profile whose status is None triggers four delay calls, one for
      each row returned by insert_claims except the '#full-import' row,
      and that the forwarded message carries 'author_list' and
      'identifiers' for the bibcode.
    """
    requested_orcidid = '0000-0003-3041-2092'

    def claim_date():
        # every fixture entry gets its own date object, as parsed by utils
        return utils.get_date('2017-01-01')

    def orcid_profile(status):
        # author record handed back by the mocked retrieve_orcid
        return {
            'status': status,
            'name': u'Stern, D K',
            'facts': {
                u'author': [u'Stern, D', u'Stern, D K', u'Stern, Daniel'],
                u'orcid_name': [u'Stern, Daniel'],
                u'author_norm': [u'Stern, D'],
                u'name': u'Stern, D K'
            },
            'orcidid': u'0000-0003-2686-9241',
            'id': 1,
            'account_id': None,
            'updated': claim_date()
        }

    def log_row(status, bibcode, created, provenance):
        # one entry of the list returned by the mocked insert_claims
        return {
            'status': status,
            'bibcode': bibcode,
            'created': created,
            'provenance': provenance,
            'orcidid': '0000-0003-3041-2092',
            'id': None
        }

    with patch.object(self.app, 'retrieve_orcid') as retrieve_orcid, \
            patch.object(tasks.requests, 'get') as http_get, \
            patch.object(self.app, 'get_claims') as get_claims, \
            patch.object(self.app, 'insert_claims') as insert_claims, \
            patch.object(tasks.task_index_orcid_profile,
                         'apply_async') as apply_async, \
            patch.object(tasks.task_match_claim, 'delay') as next_task:

        # canned HTTP response for anything fetched through requests.get
        payload = {
            'bibcode': {
                'status': 'some status',
                'title': 'some title'
            }
        }
        response = PropertyMock()
        response.text = str(payload)
        response.json = lambda: payload
        response.status_code = 200
        http_get.return_value = response

        # NOTE(review): the roles of the three dicts are defined by
        # app.get_claims, which is not visible here -- confirm there.
        get_claims.return_value = (
            {
                'bibcode1': ('Bibcode1', claim_date(), 'provenance',
                             ['id1', 'id2'], ['Stern, D K', 'author two']),
                'bibcode2': ('Bibcode2', claim_date(), 'provenance',
                             ['id1', 'id2'], ['author one', 'Stern, D K']),
                'bibcode3': ('Bibcode3', claim_date(), 'provenance',
                             ['id1', 'id2'], ['Stern, D K', 'author two']),
            },
            {
                'bibcode1': ('Bibcode1', claim_date()),
                # we have, but orcid no more
                'bibcode4': ('Bibcode4', claim_date()),
            },
            {
                'bibcode2': ('Bibcode2', claim_date()),
            })

        insert_claims.return_value = [
            log_row(u'#full-import', u'',
                    '2017-05-26T21:29:22.726506+00:00', u'OrcidImporter'),
            log_row(u'claimed', 'Bibcode2',
                    '2017-01-01T00:00:00+00:00', u'provenance'),
            log_row(u'claimed', 'Bibcode3',
                    '2017-01-01T00:00:00+00:00', u'provenance'),
            log_row(u'removed', 'Bibcode4',
                    '2017-05-26T21:29:22.728368+00:00', u'OrcidImporter'),
            log_row(u'unchanged', 'Bibcode1',
                    '2017-01-01T00:00:00+00:00', u'OrcidImporter'),
        ]

        self.assertFalse(next_task.called)

        # check authors can be skipped
        retrieve_orcid.return_value = orcid_profile('blacklisted')
        tasks.task_index_orcid_profile({'orcidid': requested_orcidid})
        self.assertFalse(next_task.called)

        # same author without a blocking status: claims get forwarded
        retrieve_orcid.return_value = orcid_profile(None)
        tasks.task_index_orcid_profile({'orcidid': requested_orcidid})

        self.assertTrue(next_task.called)
        self.assertEqual(next_task.call_count, 4)

        inserted = insert_claims.call_args[0][0]
        self.assertEqual(
            [(claim.bibcode, claim.status) for claim in inserted],
            [(u'', u'#full-import'), ('Bibcode2', u'claimed'),
             ('Bibcode3', u'claimed'), ('Bibcode4', u'removed'),
             ('Bibcode1', u'unchanged')])

        self.assertEqual(
            [(call[0][0]['bibcode'], call[0][0]['status'])
             for call in next_task.call_args_list],
            [('Bibcode2', u'claimed'), ('Bibcode3', u'claimed'),
             ('Bibcode4', u'removed'), ('Bibcode1', u'unchanged')])

        # first positional argument of the first delay() call
        first_message = next_task.call_args_list[0][0][0]
        self.assertEqual(
            (first_message['bibcode'], first_message['author_list']),
            ('Bibcode2', ['author one', 'Stern, D K']))
        self.assertEqual(
            (first_message['bibcode'], first_message['identifiers']),
            ('Bibcode2', ['id1', 'id2']))