def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    log.info('Number of jobs: %i', len(jobs))
    sent_jobs = []
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        # Do not raise an exception as that will cause cron (which runs
        # this) to produce an error email.
        return sent_jobs  # i.e. []

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()

    # Record the running in harvest_status
    log.info('%i jobs sent to the gather queue to be harvested',
             len(sent_jobs))

    return sent_jobs
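
# Usage sketch (hypothetical caller, not part of the snippet above): because
# 'harvest_jobs_run' is registered as a CKAN action, a cron-driven script
# would normally reach it through toolkit.get_action rather than calling the
# function directly. The site-user context below is an assumption about how
# such a script authenticates.
from ckan.plugins import toolkit

def run_pending_harvest_jobs(source_id=None):
    site_user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': site_user['name'], 'ignore_auth': True}
    # Returns the jobs that were sent to the gather queue ([] when idle,
    # so cron does not see an error).
    return toolkit.get_action('harvest_jobs_run')(
        context, {'source_id': source_id})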
def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    redis = queue.get_connection()
    try:
        redis.set('ckanext-harvest:some-random-key', 'foobar')

        # Create some fake jobs
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        num_keys = redis.dbsize()

        # Create some fake objects
        gather_consumer = queue.get_gather_consumer()
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        fetch_consumer = queue.get_fetch_consumer()
        next(fetch_consumer.consume(queue.get_fetch_queue_name()))

        assert redis.dbsize() > num_keys

        queue.purge_queues()

        assert redis.get('ckanext-harvest:some-random-key') == 'foobar'
        assert redis.dbsize() == num_keys
        assert redis.llen(queue.get_gather_routing_key()) == 0
        assert redis.llen(queue.get_fetch_routing_key()) == 0
    finally:
        redis.delete('ckanext-harvest:some-random-key')
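
# Illustrative reconstruction, not the extension's actual implementation: for
# the purging test above to pass, the Redis branch of queue.purge_queues()
# only needs to drop the two queue lists plus the consumers' per-message
# bookkeeping keys, leaving unrelated keys untouched. The ':*' key prefix
# used below is an assumption.
def purge_harvest_queues(redis_conn, gather_key, fetch_key):
    # Deleting the routing-key lists empties both queues (LLEN of a missing
    # key is 0) without touching keys like 'ckanext-harvest:some-random-key'.
    redis_conn.delete(gather_key, fetch_key)
    # Remove any keys the consumers persisted per message.
    for key in (redis_conn.keys(gather_key + ':*')
                + redis_conn.keys(fetch_key + ':*')):
        redis_conn.delete(key)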
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(context, {'id': job_id})

    check_access('harvest_send_job_to_gather_queue', context, job)

    # gather queue
    publisher = get_gather_publisher()

    # Check the source is active
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()
    publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
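
# Usage sketch (hypothetical helper): re-queueing a single job by id through
# the action interface. toolkit.ValidationError propagates to the caller if
# the job's source is inactive.
from ckan.plugins import toolkit

def requeue_harvest_job(job_id, user_name):
    return toolkit.get_action('harvest_send_job_to_gather_queue')(
        {'user': user_name}, {'id': job_id})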
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def test_redis_corrupt(self, mock_log_error):
    '''
    Test that a corrupt Redis message doesn't stop the harvest process
    and that other jobs are still processed.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        pytest.skip()
    redis = queue.get_connection()
    try:
        redis.set('ckanext-harvest:some-random-key-2', 'foobar')

        # Make sure queues/exchanges are created first and are empty
        gather_consumer = queue.get_gather_consumer()
        fetch_consumer = queue.get_fetch_consumer()
        gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
        fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

        # Create a fake job, plus fetch messages with and without a
        # harvest_object_id
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        fetch_publisher.send({'harvest_object_id': None})
        h_obj_id = str(uuid.uuid4())
        fetch_publisher.send({'harvest_object_id': h_obj_id})

        # Create some fake objects
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        _, _, body = next(
            fetch_consumer.consume(queue.get_fetch_queue_name()))

        json_obj = json.loads(body)
        assert json_obj['harvest_object_id'] == h_obj_id

        assert mock_log_error.call_count == 1
        args, _ = mock_log_error.call_args_list[0]
        if six.PY2:
            assert "cannot concatenate 'str' and 'NoneType' objects" in args[1]
        else:
            assert "must be str, not NoneType" in str(args[1])
    finally:
        redis.delete('ckanext-harvest:some-random-key-2')
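
# The behaviour the corrupt-message test pins down: a malformed payload must
# be logged and skipped, never allowed to kill the consumer loop. A minimal
# sketch of that pattern (the real fetch callback differs in detail; the
# 'handle' callable here is hypothetical):
import json

def consume_forever(consumer, queue_name, handle, log):
    # consume() yields (method, header, body) tuples, as the test unpacks.
    for method, header, body in consumer.consume(queue_name):
        try:
            handle(json.loads(body))
        except Exception as e:
            # One log.error per bad message is exactly what
            # mock_log_error.call_count == 1 asserts above.
            log.error('Harvest message discarded: %s', e)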
def test_redis_queue_purging(self):
    '''
    Test that Redis queue purging doesn't purge the wrong keys.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        raise SkipTest()
    redis = queue.get_connection()
    try:
        redis.set('ckanext-harvest:some-random-key', 'foobar')

        # Create some fake jobs
        gather_publisher = queue.get_gather_publisher()
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
        fetch_publisher = queue.get_fetch_publisher()
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
        num_keys = redis.dbsize()

        # Create some fake objects
        gather_consumer = queue.get_gather_consumer()
        next(gather_consumer.consume(queue.get_gather_queue_name()))
        fetch_consumer = queue.get_fetch_consumer()
        next(fetch_consumer.consume(queue.get_fetch_queue_name()))

        ok_(redis.dbsize() > num_keys)

        queue.purge_queues()

        assert_equal(redis.get('ckanext-harvest:some-random-key'),
                     'foobar')
        assert_equal(redis.dbsize(), num_keys)
        assert_equal(redis.llen(queue.get_gather_routing_key()), 0)
        assert_equal(redis.llen(queue.get_fetch_routing_key()), 0)
    finally:
        redis.delete('ckanext-harvest:some-random-key')
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())
                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
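
# NoNewHarvestJobError is raised above instead of a bare Exception so callers
# can tell "nothing to do" apart from a real failure. Its definition is not
# part of this snippet; a minimal version (an assumption, matching only the
# name used above) would be:
class NoNewHarvestJobError(Exception):
    pass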
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {"source_id": source_id, "status": u"Running"}
    )
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(
                        and_(
                            (HarvestObject.state != u"COMPLETE"),
                            (HarvestObject.state != u"ERROR"),
                        )
                    )
                    .order_by(HarvestObject.import_finished.desc())
                )
                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(
                        context, {"id": job_obj.source.id}
                    )
                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue", job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())
                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop('datajson_collection', None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})
                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
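
# For reference: the two-phase datajson run above is driven entirely by the
# harvest source's JSON config. A sketch of flagging a source for a
# parents-then-children run (key and values taken from the code above; the
# helper itself is hypothetical):
import json

def mark_source_for_two_phase_run(source):
    config = json.loads(source.config or '{}')
    # 'parents_run' makes the code above schedule a follow-up child job when
    # the parent job finishes; it then flips the flag to 'children_run' and
    # finally pops the key.
    config['datajson_collection'] = 'parents_run'
    source.config = json.dumps(config)
    source.save()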