def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Put an existing harvest job on the gather queue and mark it "Running".

    The job is fetched through the ``harvest_job_show`` action, access is
    checked against the resulting job dict, and the job is only queued when
    its harvest source is active.

    :param id: the id of the harvest job
    :type id: string

    :returns: the dictized harvest job after its status was updated
    :raises toolkit.ValidationError: if the job's harvest source is inactive
    '''
    log.info('Send job to gather queue: %r', data_dict)

    requested_id = logic.get_or_bust(data_dict, 'id')
    show_job = toolkit.get_action('harvest_job_show')
    job_dict = show_job(context, {'id': requested_id})

    check_access('harvest_send_job_to_gather_queue', context, job_dict)

    # The gather queue publisher is obtained before the source is validated.
    gather_publisher = get_gather_publisher()

    # Only jobs of active sources may be queued.
    source_dict = harvest_source_show(context, {'id': job_dict['source_id']})
    if not source_dict['active']:
        raise toolkit.ValidationError('Source is not active')

    # Update both the stored job and the job dict with the new status.
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.status = u'Running'
    job_dict['status'] = u'Running'
    harvest_job.save()

    gather_publisher.send({'harvest_job_id': job_dict['id']})
    log.info('Sent job %s to the gather queue', job_dict['id'])

    return harvest_job_dictize(harvest_job, context)
def harvest_job_create(context, data_dict):
    '''
    Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the dictized, newly saved harvest job
    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive, or a job with status
        "New" already exists for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. A separate
    # filter dict is used so the caller's data_dict name is not rebound.
    unrun_filter = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, unrun_filter)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Send a harvest job to the gather queue.

    Looks the job up with the ``harvest_job_show`` action, checks access
    using the job dict, refuses jobs whose source is not active, then sets
    the job status to "Running" and publishes its id on the gather queue.

    :param id: the id of the harvest job
    :type id: string

    :returns: the dictized harvest job
    :raises toolkit.ValidationError: if the harvest source is not active
    '''
    log.info('Send job to gather queue: %r', data_dict)

    wanted_id = logic.get_or_bust(data_dict, 'id')
    job_info = toolkit.get_action('harvest_job_show')(
        context, dict(id=wanted_id))
    check_access('harvest_send_job_to_gather_queue', context, job_info)

    # gather queue
    queue_publisher = get_gather_publisher()

    # The source must be active for the job to be queued.
    owning_source = harvest_source_show(
        context, dict(id=job_info['source_id']))
    if not owning_source['active']:
        raise toolkit.ValidationError('Source is not active')

    running = u'Running'
    stored_job = HarvestJob.get(job_info['id'])
    stored_job.status = running
    job_info['status'] = running
    stored_job.save()

    message = {'harvest_job_id': job_info['id']}
    queue_publisher.send(message)
    log.info('Sent job %s to the gather queue', job_info['id'])

    return harvest_job_dictize(stored_job, context)
def harvest_job_create(context, data_dict):
    '''
    Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the dictized, newly saved harvest job
    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports an
        existing job for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        # NOTE(review): a generic Exception is part of the current contract;
        # callers may rely on it, so the type is left unchanged.
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_job_list(context, data_dict):
    '''Return the harvest jobs matching the given filters, newest first.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    '''
    check_access('harvest_job_list', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', False)
    status = data_dict.get('status', False)

    query = session.query(HarvestJob)

    # Falsy filter values (missing, empty) mean "do not filter".
    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    query = query.order_by(HarvestJob.created.desc())

    jobs = query.all()

    # Set on the shared context before dictizing; presumably read by
    # harvest_job_dictize to skip the error summary -- confirm.
    context['return_error_summary'] = False
    return [harvest_job_dictize(job, context) for job in jobs]
def harvest_job_list(context, data_dict):
    '''Return a page of harvest jobs matching the given filters.

    There is a hard limit of 100 results per call; use ``offset`` to page
    through larger result sets. Jobs are returned newest first.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source
    :param offset: number of jobs to skip, for paging (default: 0)

    :returns: list of at most 100 dictized harvest jobs
    '''
    check_access('harvest_job_list', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', False)
    status = data_dict.get('status', False)
    offset = data_dict.get('offset', 0)

    query = session.query(HarvestJob)

    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    # OFFSET/LIMIT without ORDER BY has no guaranteed row order, so
    # consecutive pages could repeat or skip jobs. Order newest first to
    # make paging stable.
    query = query.order_by(HarvestJob.created.desc())

    # Have a max for safety
    query = query.offset(offset).limit(100)

    jobs = query.all()

    return [harvest_job_dictize(job, context) for job in jobs]
def harvest_job_list(context, data_dict):
    '''Return the harvest jobs matching the given filters, newest first.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    '''
    check_access('harvest_job_list', context, data_dict)

    db_session = context['session']

    wanted_source = data_dict.get('source_id', False)
    wanted_status = data_dict.get('status', False)

    job_query = db_session.query(HarvestJob)

    # Each (column, value) pair becomes an equality filter when the value
    # is truthy; falsy values mean "do not filter on this column".
    criteria = (
        (HarvestJob.source_id, wanted_source),
        (HarvestJob.status, wanted_status),
    )
    for column, wanted in criteria:
        if wanted:
            job_query = job_query.filter(column == wanted)

    newest_first = job_query.order_by(HarvestJob.created.desc())
    harvest_jobs = newest_first.all()

    # Set on the shared context before dictizing; presumably read by
    # harvest_job_dictize to skip the error summary -- confirm.
    context['return_error_summary'] = False

    dictized = []
    for harvest_job in harvest_jobs:
        dictized.append(harvest_job_dictize(harvest_job, context))
    return dictized
def harvest_job_create(context, data_dict):
    '''
    Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the dictized, newly saved harvest job
    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports an
        existing job for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        # NOTE(review): a generic Exception is part of the current contract;
        # callers may rely on it, so the type is left unchanged.
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_job_list(context, data_dict):
    """Return the harvest jobs matching the given filters, newest first.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    """
    check_access("harvest_job_list", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", False)
    status = data_dict.get("status", False)

    query = session.query(HarvestJob)

    # Falsy filter values (missing, empty) mean "do not filter".
    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    query = query.order_by(HarvestJob.created.desc())

    jobs = query.all()

    # Set on the shared context before dictizing; presumably read by
    # harvest_job_dictize to skip the error summary -- confirm.
    context["return_error_summary"] = False
    return [harvest_job_dictize(job, context) for job in jobs]
def harvest_job_create(context, data_dict):
    '''
    Create a new harvest job for a harvest source.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string

    :returns: the dictized, newly saved harvest job
    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive, or a job with status
        "New" already exists for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. A separate
    # filter dict is used so the caller's data_dict name is not rebound.
    unrun_filter = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, unrun_filter)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_source_show_status(context, data_dict):
    '''
    Returns a status report for a harvest source

    Given a particular source, returns a dictionary with the number of jobs
    of the source (``job_count``), its most recently created job
    (``last_job``, dictized, or None) and the number of active, public
    datasets currently linked to the source (``total_datasets``).

    :param id: the id or name of the harvest source
    :type id: string

    :rtype: dictionary
    :raises ObjectNotFound: if the harvest source does not exist
    '''

    p.toolkit.check_access('harvest_source_show_status', context, data_dict)

    model = context.get('model')

    source = harvest_model.HarvestSource.get(data_dict['id'])
    if not source:
        raise p.toolkit.ObjectNotFound(
            'Harvest source {0} does not exist'.format(data_dict['id']))

    out = {
        'job_count': 0,
        'last_job': None,
        'total_datasets': 0,
    }

    # Count in the database instead of loading every job row just to take
    # the length of the resulting list.
    job_count = harvest_model.HarvestJob.filter(source=source).count()
    if job_count == 0:
        return out

    out['job_count'] = job_count

    # Get the most recent job
    last_job = harvest_model.HarvestJob.filter(source=source) \
        .order_by(harvest_model.HarvestJob.created.desc()).first()

    if not last_job:
        return out

    out['last_job'] = harvest_job_dictize(last_job, context)

    # Overall statistics: active, non-private packages whose current
    # harvest object belongs to this source.
    packages = model.Session.query(model.Package) \
        .join(harvest_model.HarvestObject) \
        .filter(harvest_model.HarvestObject.harvest_source_id == source.id) \
        .filter(
            harvest_model.HarvestObject.current == True  # noqa: E712
        ).filter(model.Package.state == u'active') \
        .filter(
            model.Package.private == False  # noqa: E712
        )
    out['total_datasets'] = packages.count()

    return out
def harvest_source_show_status(context, data_dict):
    '''
    Returns a status report for a harvest source

    Given a particular source, returns a dictionary with the number of jobs
    of the source (``job_count``), its most recently created job
    (``last_job``, dictized, or None) and the number of active, public
    datasets currently linked to the source (``total_datasets``).

    :param id: the id or name of the harvest source
    :type id: string

    :rtype: dictionary
    :raises ObjectNotFound: if the harvest source does not exist
    '''

    p.toolkit.check_access('harvest_source_show_status', context, data_dict)

    model = context.get('model')

    source = harvest_model.HarvestSource.get(data_dict['id'])
    if not source:
        raise p.toolkit.ObjectNotFound(
            'Harvest source {0} does not exist'.format(data_dict['id']))

    out = {
        'job_count': 0,
        'last_job': None,
        'total_datasets': 0,
    }

    # Count in the database instead of loading every job row just to take
    # the length of the resulting list.
    job_count = harvest_model.HarvestJob.filter(source=source).count()
    if job_count == 0:
        return out

    out['job_count'] = job_count

    # Get the most recent job
    last_job = harvest_model.HarvestJob.filter(source=source) \
        .order_by(harvest_model.HarvestJob.created.desc()).first()

    if not last_job:
        return out

    out['last_job'] = harvest_job_dictize(last_job, context)

    # Overall statistics: active, non-private packages whose current
    # harvest object belongs to this source. The ``== True`` / ``== False``
    # comparisons build SQL expressions, hence the lint suppressions.
    packages = model.Session.query(model.Package) \
        .join(harvest_model.HarvestObject) \
        .filter(harvest_model.HarvestObject.harvest_source_id == source.id) \
        .filter(harvest_model.HarvestObject.current == True) \
        .filter(model.Package.state == u'active') \
        .filter(model.Package.private == False)  # noqa: E712
    out['total_datasets'] = packages.count()

    return out
def harvest_job_abort(context, data_dict):
    '''
    Abort the most recent harvest job of a harvest source.

    The newest job of the source is looked up and, unless it is already
    "Finished", its status is set to "Finished". Every harvest object of
    that job whose state is neither "COMPLETE" nor "ERROR" is then set to
    "ERROR", so that none are left in limbo.

    Does not actually stop any queued harvest fetches/objects from running.

    :param source_id: the name or id of the harvest source with a job to
        abort
    :type source_id: string

    :returns: the dictized harvest job, re-read after the changes
    :raises NotFound: if the source has no jobs
    '''
    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source = harvest_source_show(context, {'id': data_dict.get('source_id')})

    # Query the newest job directly; harvest_job_list is avoided because it
    # can use a lot of memory.
    newest_job = (
        model.Session.query(HarvestJob)
        .filter_by(source_id=source['id'])
        .order_by(HarvestJob.created.desc())
        .first()
    )
    if not newest_job:
        raise NotFound('Error: source has no jobs')

    job = get_action('harvest_job_show')(context, {'id': newest_job.id})
    previous_status = job['status']
    job_id = job['id']

    # Step 1: mark the job itself as Finished (if it is not already).
    if previous_status == 'Finished':
        log.info('Harvest job unchanged. Source %s status is: "%s"',
                 job_id, previous_status)
    else:
        # i.e. New or Running
        finished = 'Finished'
        HarvestJob.get(job_id).status = finished
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"',
                 previous_status, finished)

    # Step 2: set unfinished harvest objects of the job to ERROR. The job
    # is fetched again here, after the commit above.
    settled_states = ('COMPLETE', 'ERROR')
    for harvest_object in HarvestJob.get(job_id).objects:
        state_before = harvest_object.state
        if state_before in settled_states:
            log.info('Harvest object not changed from "%s": %s',
                     state_before, harvest_object.id)
            continue
        harvest_object.state = 'ERROR'
        log.info('Harvest object changed state from "%s" to "%s": %s',
                 state_before, harvest_object.state, harvest_object.id)
    model.repo.commit_and_remove()

    # The job is read once more after the final commit before dictizing.
    return harvest_job_dictize(HarvestJob.get(job_id), context)
def harvest_source_show_status(context, data_dict):
    """
    Returns a status report for a harvest source

    Given a particular source, returns a dictionary with the number of jobs
    of the source (``job_count``), its most recently created job
    (``last_job``, dictized, or None) and the number of active, public
    datasets currently linked to the source (``total_datasets``).

    :param id: the id or name of the harvest source
    :type id: string

    :rtype: dictionary
    :raises ObjectNotFound: if the harvest source does not exist
    """

    p.toolkit.check_access("harvest_source_show_status", context, data_dict)

    model = context.get("model")

    source = harvest_model.HarvestSource.get(data_dict["id"])
    if not source:
        raise p.toolkit.ObjectNotFound(
            "Harvest source {0} does not exist".format(data_dict["id"]))

    out = {"job_count": 0, "last_job": None, "total_datasets": 0}

    # Count in the database instead of loading every job row just to take
    # the length of the resulting list.
    job_count = harvest_model.HarvestJob.filter(source=source).count()
    if job_count == 0:
        return out

    out["job_count"] = job_count

    # Get the most recent job
    last_job = (
        harvest_model.HarvestJob.filter(source=source)
        .order_by(harvest_model.HarvestJob.created.desc())
        .first()
    )

    if not last_job:
        return out

    out["last_job"] = harvest_job_dictize(last_job, context)

    # Overall statistics: active, non-private packages whose current
    # harvest object belongs to this source. The ``== True`` / ``== False``
    # comparisons build SQL expressions, hence the lint suppressions.
    packages = (
        model.Session.query(model.Package)
        .join(harvest_model.HarvestObject)
        .filter(harvest_model.HarvestObject.harvest_source_id == source.id)
        .filter(harvest_model.HarvestObject.current == True)  # noqa: E712
        .filter(model.Package.state == u"active")
        .filter(model.Package.private == False)  # noqa: E712
    )
    out["total_datasets"] = packages.count()

    return out
def harvest_job_show(context, data_dict):
    '''
    Return the dictized harvest job identified by ``data_dict``.

    :param id: value used to look up the harvest job
    :param attr: optional; forwarded to ``HarvestJob.get`` as ``attr``
        (default: None)

    :returns: the dictized harvest job
    :raises NotFound: if no matching harvest job is found
    '''
    check_access('harvest_job_show', context, data_dict)

    job = HarvestJob.get(
        data_dict.get('id'),
        attr=data_dict.get('attr', None),
    )
    if job:
        return harvest_job_dictize(job, context)

    raise NotFound
def harvest_job_show(context, data_dict):
    '''
    Return the dictized harvest job identified by ``data_dict``.

    :param id: value used to look up the harvest job
    :param attr: optional; forwarded to ``HarvestJob.get`` as ``attr``
        (default: None)

    :returns: the dictized harvest job
    :raises NotFound: if no matching harvest job is found
    '''
    p.toolkit.check_access('harvest_job_show', context, data_dict)

    job_ref = data_dict.get('id')
    lookup_attr = data_dict.get('attr', None)

    found = HarvestJob.get(job_ref, attr=lookup_attr)
    if found:
        return harvest_job_dictize(found, context)

    raise NotFound
def harvest_job_show(context, data_dict):
    """
    Return the dictized harvest job identified by ``data_dict``.

    :param id: value used to look up the harvest job
    :param attr: optional; forwarded to ``HarvestJob.get`` as ``attr``
        (default: None)

    :returns: the dictized harvest job
    :raises NotFound: if no matching harvest job is found
    """
    check_access("harvest_job_show", context, data_dict)

    job_key = data_dict.get("id")
    key_attr = data_dict.get("attr", None)
    harvest_job = HarvestJob.get(job_key, attr=key_attr)

    # Guard clause: nothing found means the caller gets NotFound.
    if not harvest_job:
        raise NotFound

    dictized_job = harvest_job_dictize(harvest_job, context)
    return dictized_job
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized, newly saved harvest job
    :raises toolkit.ObjectNotFound: if the harvest source does not exist
    :raises HarvestSourceInactiveError: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports an
        existing job for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    # Optionally queue the job right away through the dedicated action.
    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized, newly saved harvest job
    :raises toolkit.ObjectNotFound: if the harvest source does not exist
    :raises HarvestSourceInactiveError: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports an
        existing job for the source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        # The CKAN plugins toolkit exposes this exception as ObjectNotFound;
        # it has no ``NotFound`` attribute.
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    # Optionally queue the job right away through the dedicated action.
    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
def harvest_job_list(context, data_dict):
    '''Return the harvest jobs matching the given filters.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    '''
    p.toolkit.check_access('harvest_job_list', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', False)
    status = data_dict.get('status', False)

    query = session.query(HarvestJob)

    # Falsy filter values (missing, empty) mean "do not filter".
    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    jobs = query.all()

    return [harvest_job_dictize(job, context) for job in jobs]
def harvest_job_list(context, data_dict):
    '''Return the harvest jobs matching the given filters.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    '''
    check_access('harvest_job_list', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', False)
    status = data_dict.get('status', False)

    query = session.query(HarvestJob)

    # Falsy filter values (missing, empty) mean "do not filter".
    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    jobs = query.all()

    return [harvest_job_dictize(job, context) for job in jobs]
def harvest_job_list(context, data_dict):
    '''Return the harvest jobs matching the given filters, newest first.

    :param status: filter by e.g. "New" or "Finished" jobs
    :param source_id: filter by a harvest source

    :returns: list of dictized harvest jobs
    '''
    check_access('harvest_job_list', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', False)
    status = data_dict.get('status', False)

    query = session.query(HarvestJob)

    # Falsy filter values (missing, empty) mean "do not filter".
    if source_id:
        query = query.filter(HarvestJob.source_id == source_id)

    if status:
        query = query.filter(HarvestJob.status == status)

    query = query.order_by(HarvestJob.created.desc())

    jobs = query.all()

    # Set on the shared context before dictizing; presumably read by
    # harvest_job_dictize to skip the error summary -- confirm.
    context['return_error_summary'] = False
    return [harvest_job_dictize(job, context) for job in jobs]
def send_error_mail_ncar(context, job_obj):
    '''
    Email a failure notification for a harvest job to the users of the
    organization linked to the job's harvest source.

    The message contains the job URL, organization, harvest source title,
    harvest date and the number of records in error. Per-record error
    messages are included when at most 20 records failed; otherwise only
    the records' original URLs are listed. Nothing is sent when the job
    report contains no object errors.

    :param context: action context; ``context['model']`` is used to run
        the SQL lookups
    :param job_obj: the harvest job object to report on; its ``id``,
        ``created`` and ``source.id`` attributes are read

    :returns: None
    '''
    # NOTE(review): statement nesting was reconstructed from a flattened
    # source; verify the loop/branch boundaries against the original file.
    model = context['model']
    source_id = job_obj.source.id

    # Package name of the harvest source, used to build the job URL. Fall
    # back to the source id so an empty result cannot cause a NameError.
    harvest_name = str(source_id)
    sql = 'select name from package where id = :source_id;'
    for row in model.Session.execute(sql, {'source_id': source_id}):
        harvest_name = str(row['name'])

    ckan_site_url = config.get('ckan.site_url')
    job_url = (ckan_site_url + '/harvest/' + harvest_name + '/job/'
               + job_obj.id)

    parts = [
        'This is a failure-notification of the latest harvest job on '
        + ckan_site_url + '.\n\n',
        'Harvest Job URL: ' + job_url + '\n\n',
    ]

    sql = '''select g.title as org, s.title as job_title
             from member m
             join public.group g on m.group_id = g.id
             join harvest_source s on s.id = m.table_id
             where table_id = :source_id;'''
    org_name = None
    for row in model.Session.execute(sql, {'source_id': source_id}):
        org_name = str(row['org'])
        parts.append('Organization: ' + org_name + '\n\n')
        parts.append('Harvest Source: ' + str(row['job_title']) + '\n\n')

    parts.append('Date of Harvest: ' + str(job_obj.created) + ' GMT\n\n')

    job_dict = get_action('harvest_job_report')(context, {'id': job_obj.id})
    error_dicts = job_dict['object_errors']
    # A real list, so the keys behave the same on Python 2 and 3 (dict
    # views cannot be sliced or indexed on Python 3).
    errored_object_keys = list(error_dicts)
    num_records_in_error = len(errored_object_keys)

    parts.append('Records in Error: ' + str(num_records_in_error) + '\n\n')
    # NOTE(review): the contact address in the next string looks garbled
    # ("[email protected]"); restore the real address.
    parts.append(
        'For help, please contact the NCAR Data Stewardship Coordinator '
        '(mailto:[email protected]).\n\n\n')

    if num_records_in_error <= 20:
        # Few enough failures: include every error message per record.
        for key in errored_object_keys:
            error_dict = error_dicts[key]
            parts.append(error_dict['original_url'] + ' :\n\n')
            for error in error_dict['errors']:
                parts.append(error['message'])
                if error['line']:
                    parts.append(' (line ' + str(error['line']) + ')\n\n')
                else:
                    parts.append('\n')
            parts.append('\n\n')
    else:
        # Too many failures: list only the affected records.
        for key in errored_object_keys:
            parts.append(error_dicts[key]['original_url'] + '\n\n')
        parts.append(
            '\n\nError Messages are suppressed if there are more than 20 '
            'records with errors.\n')

    msg = ''.join(parts)
    log.debug('msg == %s', msg)

    # Only notify when at least one record failed.
    if num_records_in_error == 0:
        return

    if org_name is None:
        # Without an organization there is nobody to notify.
        log.error('No organization found for harvest source %s; '
                  'harvest error notification not sent', source_id)
        return

    site_title = config.get('ckan.site_title')
    msg += (
        '\n--\nYou are receiving this email because you are currently '
        'set-up as a member of the Organization "' + org_name + '" for '
        + site_title + '. Please do not reply to this email as it was sent '
        'from a non-monitored address.')

    # get org info
    # NOTE(review): the lower-cased organization *title* is used as the
    # organization id here; confirm titles always match names.
    log.debug('orgName == %s', org_name)
    org_dict = toolkit.get_action('organization_show')(
        context, {'id': org_name.lower(), 'include_users': True})

    # get usernames in org
    usernames = [user['name'] for user in org_dict['users']]
    log.debug('usernames == %s', ','.join(usernames))

    # get emails for users, skipping users without an email address
    user_show = toolkit.get_action('user_show')
    email_recipients = []
    for username in usernames:
        address = user_show(context, {'id': username}).get('email')
        if address:
            email_recipients.append(address)
    log.debug('email_recipients == %s', ','.join(email_recipients))

    subject = site_title + ' - Harvesting Job - Error Notification'
    for recipient in email_recipients:
        try:
            mailer.mail_recipient(
                recipient_name=recipient,
                recipient_email=recipient,
                subject=subject,
                body=msg,
            )
        except Exception as e:
            # Best effort: a failure for one recipient is logged and must
            # not prevent the remaining recipients from being notified.
            log.exception(e)
            log.error(
                'Sending Harvest-Notification-Mail failed. Message: ' + msg)