def refetch_orcidids(since=None, orcid_ids=None, **kwargs): """ Gets all orcidids that were updated since time X. :param: since - RFC889 formatted string :type: str :return: no return """ worker = RabbitMQWorker(params={ 'publish': 'ads.orcid.fresh-claims', 'exchange': app.config.get('EXCHANGE', 'ads-orcid') }) worker.connect(app.config.get('RABBITMQ_URL')) if orcid_ids: for oid in orcid_ids.split(','): worker.publish({'orcidid': oid, 'force': False}) if not since: print 'Done (just the supplied orcidids)' return logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat())) # then get all new/old orcidids from orcid-service orcidids = set(updater.get_all_touched_profiles(from_date.isoformat())) from_date = get_date() for orcidid in orcidids: try: worker.publish({'orcidid': orcidid, 'force': False}) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid worker.publish({'orcidid': orcidid, 'force': False}) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is None: kv = KeyValue(key='last.refetch', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
def refetch_orcidids(since=None, orcid_ids=None, **kwargs): """ Gets all orcidids that were updated since time X. :param: since - RFC889 formatted string :type: str :return: no return """ if orcid_ids: for oid in orcid_ids: tasks.task_index_orcid_profile({'orcidid': oid, 'force': False}) if not since: print 'Done (just the supplied orcidids)' return logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) logger.info('Re-fetching orcidids updated since: {0}'.format( from_date.isoformat())) # then get all new/old orcidids from orcid-service orcidids = set(updater.get_all_touched_profiles(app, from_date.isoformat())) from_date = get_date() for orcidid in orcidids: try: tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': False }) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': False }) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is None: kv = KeyValue(key='last.refetch', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
def reindex_claims(since=None, orcid_ids=None, **kwargs): """ Re-runs all claims, both from the pipeline and from the orcid-service storage. :param: since - RFC889 formatted string :type: str :return: no return """ if orcid_ids: for oid in orcid_ids: tasks.task_index_orcid_profile.delay({ 'orcidid': oid, 'force': True }) if not since: print 'Done (just the supplied orcidids)' return logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) orcidids = set() logger.info('Loading records since: {0}'.format(from_date.isoformat())) # first re-check our own database (replay the logs) with app.session_scope() as session: for author in session.query( AuthorInfo.orcidid.distinct().label('orcidid')).all(): orcidid = author.orcidid if orcidid and orcidid.strip() != "": try: changed = updater.reindex_all_claims( app, orcidid, since=from_date.isoformat(), ignore_errors=True) if len(changed): orcidids.add(orcidid) tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': True }) except: print 'Error processing: {0}'.format(orcidid) traceback.print_exc() continue if len(orcidids) % 100 == 0: print 'Done replaying {0} profiles'.format(len(orcidids)) print 'Now harvesting orcid profiles...' # then get all new/old orcidids from orcid-service all_orcids = set( updater.get_all_touched_profiles(app, from_date.isoformat())) orcidids = all_orcids.difference(orcidids) from_date = get_date() for orcidid in orcidids: try: tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': True }) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid tasks.task_index_orcid_profile.delay({ 'orcidid': orcidid, 'force': True }) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is None: kv = KeyValue(key='last.reindex', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
def reindex_claims(since=None, **kwargs): """ Re-runs all claims, both from the pipeline and from the orcid-service storage. :param: since - RFC889 formatted string :type: str :return: no return """ logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) orcidids = set() # trigger re-indexing worker = RabbitMQWorker(params={ 'publish': 'ads.orcid.fresh-claims', 'exchange': app.config.get('EXCHANGE', 'ads-orcid') }) worker.connect(app.config.get('RABBITMQ_URL')) logger.info('Loading records since: {0}'.format(from_date.isoformat())) # first re-check our own database (replay the logs) with app.session_scope() as session: for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all(): orcidid = author.orcidid if orcidid and orcidid.strip() != "": try: changed = updater.reindex_all_claims(orcidid, since=from_date.isoformat(), ignore_errors=True) if len(changed): orcidids.add(orcidid) worker.publish({'orcidid': orcidid, 'force': True}) except: print 'Error processing: {0}'.format(orcidid) traceback.print_exc() continue if len(orcidids) % 100 == 0: print 'Done replaying {0} profiles'.format(len(orcidids)) print 'Now harvesting orcid profiles...' # then get all new/old orcidids from orcid-service all_orcids = set(updater.get_all_touched_profiles(from_date.isoformat())) orcidids = all_orcids.difference(orcidids) from_date = get_date() for orcidid in orcidids: try: worker.publish({'orcidid': orcidid, 'force': True}) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid worker.publish({'orcidid': orcidid, 'force': True}) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is None: kv = KeyValue(key='last.reindex', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))