def _check_for_existing_jobs(context, source_id):
    '''
    Given a source id, checks if there are jobs for this source
    with status 'New' or 'Running'

    :rtype: boolean
    '''
    data_dict = {'source_id': source_id, 'status': u'New'}
    exist_new = harvest_job_list(context, data_dict)

    data_dict = {'source_id': source_id, 'status': u'Running'}
    exist_running = harvest_job_list(context, data_dict)

    exist = len(exist_new + exist_running) > 0

    return exist

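# A minimal usage sketch: _check_for_existing_jobs would typically guard
# job creation so a source never gets two unrun jobs. The helper name
# create_job_if_idle is hypothetical and not part of the original module.
def create_job_if_idle(context, source_id):
    # Skip creation when a 'New' or 'Running' job already exists.
    if _check_for_existing_jobs(context, source_id):
        return None
    return harvest_job_create(context, {'source_id': source_id})
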
def harvest_job_create_all(context, data_dict):
    log.info('Harvest job create all: %r', data_dict)
    check_access('harvest_job_create_all', context, data_dict)

    data_dict.update({'only_active': True})

    # Get all active sources
    sources = harvest_source_list(context, data_dict)
    jobs = []
    # Create a new job for each source, if there isn't one already
    for source in sources:
        data_dict = {
            'source_id': source['id'],
            'status': u'New'
        }
        exists = harvest_job_list(context, data_dict)
        if len(exists):
            continue

        job = harvest_job_create(context, {'source_id': source['id']})
        jobs.append(job)

    log.info('Created jobs for %i harvest sources', len(jobs))
    return jobs

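# These are CKAN action functions, so callers normally go through the
# action layer rather than importing them directly. A minimal sketch,
# assuming a running CKAN environment and a context user that passes
# the 'harvest_job_create_all' auth check:
#
#   import ckan.plugins.toolkit as toolkit
#
#   jobs = toolkit.get_action('harvest_job_create_all')(
#       {'user': 'admin'}, {})
#   log.info('Created %d jobs', len(jobs))
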
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)

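# Sketch only: handling the two failure modes harvest_job_create raises.
# 'some-source-id' is a placeholder; NotFound and HarvestError are the
# same names this module already imports.
def try_create_job(context):
    try:
        return harvest_job_create(context, {'source_id': 'some-source-id'})
    except NotFound:
        log.error('No such harvest source')
    except HarvestError as e:
        # Source inactive, or an unrun job already exists
        log.error('Cannot create job: %s', e)
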
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,
                            {'source_id': source_id, 'status': u'New'})
    log.info('Number of jobs: %i', len(jobs))
    sent_jobs = []
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        # Do not raise an exception as that will cause cron (which runs
        # this) to produce an error email.
        return sent_jobs  # i.e. []

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()

    # Record the running in harvest_status
    log.info('%i jobs sent to the gather queue to be harvested',
             len(sent_jobs))

    return sent_jobs

def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,
                            {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs

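# Unlike the variant above that returns an empty list, this one raises a
# bare Exception when there is nothing to do, so a cron wrapper has to
# swallow it to avoid error mail. A hypothetical wrapper, not part of
# the original module:
def run_pending_harvest_jobs(context):
    try:
        return harvest_jobs_run(context, {})
    except Exception as e:
        # 'There are no new harvesting jobs' is a normal outcome for cron
        log.info('harvest_jobs_run: %s', e)
        return []
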
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    log.info('Marking job as finished: %s', job_obj)

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})
            else:
                log.debug('Ongoing job:%s source:%s',
                          job['id'], job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility

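# As the docstring notes, newly created jobs are queued at creation time;
# harvest_send_job_to_gather_queue covers the explicit case. A short
# sketch, assuming the action layer as before ('job-id' is a placeholder):
#
#   toolkit.get_action('harvest_send_job_to_gather_queue')(
#       {'user': 'admin'}, {'id': 'job-id'})
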
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                            and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
            else:
                log.debug('Ongoing job:%s source:%s',
                          job['id'], job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility

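# Configuration sketch: the mail behaviour in this variant is driven by a
# single CKAN ini option (values inferred from the checks above):
#
#   ckan.harvest.status_mail = errors   # mail only when a job had errors
#   ckan.harvest.status_mail = all      # mail a summary of every job
#
# Any other value (or unset) disables the notification emails.
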
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs

def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set: check if the duration of the job is
    longer than ckan.harvest.timeout, then mark that job as finished as
    there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    if toolkit.asbool(
                            config.get('ckan.harvest.status_mail.errored')) \
                            and status['last_job']['stats']['errored']:
                        send_error_mail(context, job_obj.source.id, status)
            else:
                log.debug('Ongoing job:%s source:%s',
                          job['id'], job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility

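# Timeout sketch: with the ini option below set, a 'Running' job older
# than the given number of minutes is force-finished on the next run and
# a gather error is recorded.
#
#   ckan.harvest.timeout = 1440   # minutes; e.g. one day
#
# The equivalent check in plain Python, for illustration only:
import datetime


def timed_out(created, timeout_minutes, now=None):
    # True when a job started at 'created' has exceeded the timeout.
    now = now or datetime.datetime.now()
    return now - created > datetime.timedelta(minutes=int(timeout_minutes))
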
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context,
                            {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"),
                                 (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(
                        context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,
                            {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs

def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set: check if the duration of the job is
    longer than ckan.harvest.timeout, then mark that job as finished as
    there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'],
                              job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug(' (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility

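# Configuration sketch (CKAN ini), inferred from the notification flags
# read above:
#
#   ckan.harvest.status_mail.all = True      # mail a summary for every job
#   ckan.harvest.status_mail.errored = True  # mail only when a job errored
#
# Both default to false-y; when a job had errors, the error email takes
# precedence over the summary.
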
def distributed_harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('distributed_harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)
    routing_key = data_dict.get('gather_routing_key', None)
    exchange_name = data_dict.get('exchange_name', None)
    fetch_routing_key = data_dict.get('fetch_routing_key', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_distributed_gather_publisher(exchange_name, routing_key)
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({
                'harvest_job_id': job['id'],
                'exchange_name': exchange_name,
                'fetch_routing_key': fetch_routing_key
            })
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs

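# Sketch only: the extra queue parameters this distributed variant reads
# from data_dict. All values below are placeholders; the exchange and
# routing-key naming scheme is an assumption, not prescribed by the code.
import ckan.model as model

context = {'session': model.Session, 'user': 'admin'}
data_dict = {
    'source_id': 'some-source-id',
    'exchange_name': 'ckan.harvest',
    'gather_routing_key': 'harvest.gather.node-1',
    'fetch_routing_key': 'harvest.fetch.node-1',
}
sent_jobs = distributed_harvest_jobs_run(context, data_dict)
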
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs

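# Illustrative state table for the 'datajson_collection' source_config
# key handled in the variant above, as inferred from its branches:
#
#   'parents_run'   -> queue a follow-up job, set key to 'children_run'
#   'children_run'  -> remove the key (two-phase cycle complete)
#   missing/other   -> no action
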