def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict
    of the harvest_source dataset. All validation and authorization
    checks should be used by now, so this function is not to be used
    directly to update harvest sources.

    Note that a stripped ``url`` value is written back into ``data_dict``.
    When the source ends up not active, every job returned by
    ``HarvestJob.filter(source=source, status=u'New')`` is flagged as
    ``Aborted``.

    :param context: Not read by this function
    :param data_dict: A standard package data_dict
    :returns: The updated HarvestSource object
    :rtype: HarvestSource object
    :raises logic.NotFound: if ``HarvestSource.get`` finds no source for
        the ``id`` in ``data_dict``
    '''
    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = [
        'url', 'title', 'description', 'user_id', 'publisher_id', 'frequency'
    ]
    for f in fields:
        # Missing keys and explicit None values leave the attribute untouched
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            setattr(source, f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    # NOTE(review): this is the update path, so the note presumably means the
    # calling package action -- confirm against the caller.
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        # Iterating an empty result is a no-op, so no truthiness guard is
        # needed around the loop.
        for job in jobs:
            job.status = u'Aborted'
            job.add()

    return source
def _update_harvest_source_object(context, data_dict):
    '''
    Apply the values of a harvest_source dataset dict to the matching
    HarvestSource object.

    Validation and authorization are expected to have happened already, so
    this helper is not meant to be called directly to update harvest sources.

    :param data_dict: A standard package data_dict
    :returns: The HarvestSource object the values were applied to
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    copied_fields = (
        'url', 'title', 'description', 'user_id', 'publisher_id', 'frequency',
    )
    for name in copied_fields:
        if name not in data_dict or data_dict[name] is None:
            continue
        if name == 'url':
            # The stripped URL is also written back into data_dict
            data_dict[name] = data_dict[name].strip()
        setattr(source, name, data_dict[name])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = (data_dict['state'] == 'active')

    # Don't commit yet, let package_create do it
    source.add()

    if source.active:
        return source

    # Abort any pending jobs
    pending_jobs = HarvestJob.filter(source=source, status=u'New')
    log.info('Harvest source %s not active, so aborting %i outstanding jobs',
             source_id, pending_jobs.count())
    if pending_jobs:
        for pending_job in pending_jobs:
            pending_job.status = u'Aborted'
            pending_job.add()

    return source
def harvest_source_update(context, data_dict):
    '''
    Update a harvest source from ``data_dict`` and return its dictized form.

    The access check runs first, then the source is looked up by
    ``data_dict['id']`` and ``data_dict`` is validated against
    ``context['schema']`` (or the default harvest source schema).

    :param context: must provide ``model`` and ``session``; may provide
        ``schema``
    :param data_dict: new values for the harvest source, including its ``id``
    :returns: result of ``harvest_source_dictize(source, context)``
    :raises NotFound: if no harvest source matches the id
    :raises ValidationError: if validation reports errors (the session is
        rolled back first)
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    for name in ('url', 'title', 'type', 'description', 'user_id',
                 'publisher_id'):
        if name not in data or data[name] is None:
            continue
        if name == 'url':
            data[name] = data[name].strip()
        setattr(source, name, data[name])

    # Presence is tested on the raw input, the value comes from the
    # validated data.
    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, pending_jobs.count())
        if pending_jobs:
            for pending_job in pending_jobs:
                pending_job.status = u'Aborted'
                pending_job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure
    # if this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Summarise the harvest jobs of ``source``.

    :param source: harvest source, passed to ``HarvestJob.filter``
    :param context: not read by this function; kept so existing callers keep
        working
    :returns: dict with ``job_count``, ``next_harvest`` and
        ``last_harvest_request``, plus ``msg`` when the source has no jobs
    :rtype: dict
    '''
    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out

    out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    if next_job:
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()

    if last_job:
        out['last_harvest_request'] = str(last_job.gather_finished)
    else:
        out['last_harvest_request'] = 'Not yet harvested'

    return out
def harvest_source_update(context, data_dict):
    '''
    Validate ``data_dict`` and apply it to an existing harvest source.

    :param context: must provide ``model`` and ``session``; an optional
        ``schema`` overrides the default harvest source schema
    :param data_dict: new values for the harvest source, including its ``id``
    :returns: result of ``harvest_source_dictize(source, context)``
    :raises NotFound: if no harvest source matches the id
    :raises ValidationError: if validation reports errors (the session is
        rolled back first)
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    updatable = ['url', 'title', 'type', 'description', 'user_id',
                 'publisher_id']
    for key in updatable:
        has_value = key in data and data[key] is not None
        if not has_value:
            continue
        if key == 'url':
            data[key] = data[key].strip()
        setattr(source, key, data[key])

    # Presence is checked on the raw input, the value is the validated one
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        new_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, new_jobs.count())
        if new_jobs:
            for new_job in new_jobs:
                new_job.status = u'Aborted'
                new_job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure
    # if this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def harvest_source_update(context, data_dict):
    '''
    Validate ``data_dict`` and apply it to an existing harvest source.

    :param context: must provide ``model`` and ``session``; an optional
        ``schema`` overrides the default harvest source schema
    :param data_dict: new values for the harvest source, including its ``id``
    :returns: result of ``harvest_source_dictize(source, context)``
    :raises NotFound: if no harvest source matches the id
    :raises ValidationError: if validation reports errors (the session is
        rolled back first)
    '''
    check_access('harvest_source_update', context, data_dict)

    # ``model`` is not used further down; the lookup is kept as in the
    # original so a context without it fails at the same point.
    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    for key in ('url', 'title', 'type', 'description', 'user_id',
                'publisher_id'):
        if key not in data or data[key] is None:
            continue
        setattr(source, key, data[key])

    # Presence is checked on the raw input, the value is the validated one
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        new_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, new_jobs.count())
        if new_jobs:
            for new_job in new_jobs:
                new_job.status = u'Aborted'
                new_job.save()

    return harvest_source_dictize(source, context)
def _delete_harvest_source_object(context, data_dict):
    '''
    Deletes an actual HarvestSource object with the id provided on the
    data dict of the harvest_source dataset. Similarly to the datasets,
    the source object is not actually deleted, just flagged as inactive.
    All validation and authorization checks should be used by now, so
    this function is not to be used directly to delete harvest sources.

    Jobs returned by ``HarvestJob.filter(source=source, status=u'New')`` are
    flagged as ``Aborted``.

    :param context: Not read by this function
    :param data_dict: A standard package data_dict
    :returns: The deleted HarvestSource object
    :rtype: HarvestSource object
    :raises p.toolkit.ObjectNotFound: if ``HarvestSource.get`` finds no
        source for the ``id`` in ``data_dict``
    '''
    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    source = HarvestSource.get(source_id)
    if not source:
        # ``warning`` replaces the deprecated ``warn`` alias
        log.warning('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source, status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source',
                 jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
def harvest_source_delete(context, data_dict):
    '''
    Flag a harvest source as inactive and abort its pending jobs.

    The record is not removed: ``source.active`` is set to ``False`` and
    saved. Jobs returned by ``HarvestJob.filter(source=source,
    status=u'New')`` are flagged as ``Aborted``.

    :param context: passed to ``check_access``
    :param data_dict: dict holding the ``id`` of the harvest source
    :returns: ``True`` once the source has been flagged
    :rtype: bool
    :raises NotFound: if ``HarvestSource.get`` finds no source for the id
    '''
    log.info('Deleting harvest source: %r', data_dict)
    check_access('harvest_source_delete', context, data_dict)

    source_id = data_dict.get('id')
    source = HarvestSource.get(source_id)
    if not source:
        # ``warning`` replaces the deprecated ``warn`` alias
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source, status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source',
                 jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.info('Harvest source %s deleted', source_id)
    return True
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Build a status summary dict for ``source``.

    :param source: harvest source whose jobs are inspected
    :param context: read for ``model`` and ``detailed`` (default ``True``);
        also passed on to ``harvest_job_dictize``
    :returns: dict with job count, scheduling state, last harvest
        statistics/errors, overall statistics and package names; only
        ``msg`` is added when there are no jobs
    :rtype: dict
    '''
    model = context.get('model')
    detailed = context.get('detailed', True)

    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'last_harvest_statistics': {
            'added': 0, 'updated': 0, 'errors': 0, 'deleted': 0},
        'last_harvest_errors': {'gather': [], 'object': []},
        'overall_statistics': {'added': 0, 'errors': 0},
        'packages': [],
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    out['next_harvest'] = 'Scheduled' if next_job else 'Not yet scheduled'

    # Get the last finished job
    finished_jobs = HarvestJob.filter(source=source, status=u'Finished')
    last_job = finished_jobs.order_by(HarvestJob.created.desc()).first()
    if not last_job:
        out['last_harvest_request'] = 'Not yet harvested'
        return out

    # TODO: Should we encode the dates as strings?
    out['last_harvest_request'] = str(last_job.gather_finished)

    if detailed:
        job_stats = harvest_job_dictize(last_job, context)['stats']
        last_stats = out['last_harvest_statistics']
        last_stats['added'] = job_stats.get('new', 0)
        last_stats['updated'] = job_stats.get('updated', 0)
        last_stats['deleted'] = job_stats.get('deleted', 0)
        last_stats['errors'] = (job_stats.get('errored', 0) +
                                len(last_job.gather_errors))

        # The gathering errors are in last_job.gather_errors, so also fetch
        # the object errors of that job.
        last_object_errors = (
            model.Session.query(HarvestObjectError)
            .join(HarvestObject)
            .filter(HarvestObject.job == last_job)
        )
        last_errors = out['last_harvest_errors']
        last_errors['gather'].extend(
            gather_error.message for gather_error in last_job.gather_errors)
        for object_error in last_object_errors:
            last_errors['object'].append({
                'object_id': object_error.object.id,
                'object_guid': object_error.object.guid,
                'message': object_error.message,
            })

    # Overall statistics
    active_packages = (
        model.Session.query(distinct(HarvestObject.package_id), Package.name)
        .join(Package)
        .join(HarvestSource)
        .filter(HarvestObject.source == source)
        .filter(HarvestObject.current == True)  # noqa: E712
        .filter(Package.state == u'active')
    )
    out['overall_statistics']['added'] = active_packages.count()
    if detailed:
        out['packages'].extend(row.name for row in active_packages)

    gather_error_count = (
        model.Session.query(HarvestGatherError)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    object_error_count = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    out['overall_statistics']['errors'] = (
        gather_error_count + object_error_count)

    return out
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Summarise the harvest jobs of ``source`` together with overall
    package and error counts.

    :param source: harvest source whose jobs are inspected
    :param context: read for ``model``, whose ``Session`` runs the overall
        statistics queries
    :returns: dict with ``job_count``, ``next_harvest``,
        ``last_harvest_request`` and ``overall_statistics``, plus ``msg``
        when the source has no jobs
    :rtype: dict
    '''
    model = context.get('model')

    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'overall_statistics': {'added': 0, 'errors': 0},
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out

    out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    if next_job:
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()

    if last_job:
        # TODO: Should we encode the dates as strings?
        out['last_harvest_request'] = str(last_job.gather_finished)

        # Overall statistics
        # ``== True`` is kept on purpose: it builds the SQL expression, so
        # the lint warning for comparing to True (E712) is silenced.
        packages = model.Session.query(distinct(HarvestObject.package_id),
                                       Package.name) \
            .join(Package).join(HarvestSource) \
            .filter(HarvestObject.source == source) \
            .filter(
                HarvestObject.current == True  # noqa: E712
            ).filter(Package.state == u'active')

        out['overall_statistics']['added'] = packages.count()

        gather_errors = model.Session.query(HarvestGatherError) \
            .join(HarvestJob).join(HarvestSource) \
            .filter(HarvestJob.source == source).count()

        object_errors = model.Session.query(HarvestObjectError) \
            .join(HarvestObject).join(HarvestJob).join(HarvestSource) \
            .filter(HarvestJob.source == source).count()
        out['overall_statistics']['errors'] = gather_errors + object_errors
    else:
        out['last_harvest_request'] = 'Not yet harvested'

    return out
def _get_source_status(source, context):
    '''
    Build a status summary dict for ``source``.

    :param source: harvest source whose jobs are inspected
    :param context: read for ``model`` and ``detailed`` (default ``True``)
    :returns: dict with job count, scheduling state, last harvest
        statistics/errors, overall statistics and package names; only
        ``msg`` is added when there are no jobs
    :rtype: dict
    '''
    model = context.get('model')
    detailed = context.get('detailed', True)

    job_count = HarvestJob.filter(source=source).count()

    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'last_harvest_statistics': {'added': 0, 'updated': 0, 'errors': 0},
        'last_harvest_errors': {'gather': [], 'object': []},
        'overall_statistics': {'added': 0, 'errors': 0},
        'packages': [],
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    out['next_harvest'] = 'Scheduled' if next_job else 'Not yet scheduled'

    # Get the last finished job
    finished_jobs = HarvestJob.filter(source=source, status=u'Finished')
    last_job = finished_jobs.order_by(HarvestJob.created.desc()).first()
    if not last_job:
        out['last_harvest_request'] = 'Not yet harvested'
        return out

    # TODO: Should we encode the dates as strings?
    out['last_harvest_request'] = str(last_job.gather_finished)

    last_stats = out['last_harvest_statistics']

    if detailed:
        # Harvest objects of the last job that are linked to a package
        linked_objects = [
            harvest_object for harvest_object in last_job.objects
            if harvest_object.package is not None
        ]
        # With no linked objects both counters simply stay at 0
        for linked_object in linked_objects:
            # Count every harvest object that points at the same package
            link_count = (
                model.Session.query(HarvestObject)
                .filter(HarvestObject.package == linked_object.package)
                .count()
            )
            # Exactly one link means the package was added by this job;
            # anything else is counted as an update.
            counter = 'added' if link_count == 1 else 'updated'
            last_stats[counter] += 1

    # Last harvest errors
    # The gathering errors are in last_job.gather_errors, so also fetch the
    # object errors of that job.
    last_object_errors = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .filter(HarvestObject.job == last_job)
    )
    last_stats['errors'] = (len(last_job.gather_errors) +
                            last_object_errors.count())

    if detailed:
        last_errors = out['last_harvest_errors']
        last_errors['gather'].extend(
            gather_error.message for gather_error in last_job.gather_errors)
        for object_error in last_object_errors:
            last_errors['object'].append({
                'object_id': object_error.object.id,
                'object_guid': object_error.object.guid,
                'message': object_error.message,
            })

    # Overall statistics
    active_packages = (
        model.Session.query(distinct(HarvestObject.package_id), Package.name)
        .join(Package)
        .join(HarvestSource)
        .filter(HarvestObject.source == source)
        .filter(HarvestObject.current == True)  # noqa: E712
        .filter(Package.state == u'active')
    )
    out['overall_statistics']['added'] = active_packages.count()
    if detailed:
        out['packages'].extend(row.name for row in active_packages)

    gather_error_count = (
        model.Session.query(HarvestGatherError)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    object_error_count = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    out['overall_statistics']['errors'] = (
        gather_error_count + object_error_count)

    return out
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Collect job, error and package figures for ``source`` into one dict.

    :param source: harvest source whose jobs are inspected
    :param context: read for ``model`` and ``detailed`` (default ``True``);
        also passed on to ``harvest_job_dictize``
    :returns: dict with job count, scheduling state, last harvest
        statistics/errors, overall statistics and package names; only
        ``msg`` is added when there are no jobs
    :rtype: dict
    '''
    model = context.get('model')
    detailed = context.get('detailed', True)

    job_count = HarvestJob.filter(source=source).count()

    last_stats = {'added': 0, 'updated': 0, 'errors': 0, 'deleted': 0}
    last_errors = {'gather': [], 'object': []}
    overall = {'added': 0, 'errors': 0}
    package_names = []
    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'last_harvest_statistics': last_stats,
        'last_harvest_errors': last_errors,
        'overall_statistics': overall,
        'packages': package_names,
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    out['job_count'] = job_count

    # Get next scheduled job
    if HarvestJob.filter(source=source, status=u'New').first():
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = (
        HarvestJob.filter(source=source, status=u'Finished')
        .order_by(HarvestJob.created.desc())
        .first()
    )
    if not last_job:
        out['last_harvest_request'] = 'Not yet harvested'
        return out

    # TODO: Should we encode the dates as strings?
    out['last_harvest_request'] = str(last_job.gather_finished)

    if detailed:
        job_stats = harvest_job_dictize(last_job, context)['stats']
        last_stats['added'] = job_stats.get('new', 0)
        last_stats['updated'] = job_stats.get('updated', 0)
        last_stats['deleted'] = job_stats.get('deleted', 0)
        last_stats['errors'] = (job_stats.get('errored', 0) +
                                len(last_job.gather_errors))

        # The gathering errors are in last_job.gather_errors, so also fetch
        # the object errors of that job.
        last_object_errors = (
            model.Session.query(HarvestObjectError)
            .join(HarvestObject)
            .filter(HarvestObject.job == last_job)
        )
        for gather_error in last_job.gather_errors:
            last_errors['gather'].append(gather_error.message)
        last_errors['object'].extend(
            {
                'object_id': object_error.object.id,
                'object_guid': object_error.object.guid,
                'message': object_error.message,
            }
            for object_error in last_object_errors
        )

    # Overall statistics
    active_packages = (
        model.Session.query(distinct(HarvestObject.package_id), Package.name)
        .join(Package)
        .join(HarvestSource)
        .filter(HarvestObject.source == source)
        .filter(HarvestObject.current == True)  # noqa: E712
        .filter(Package.state == u'active')
    )
    overall['added'] = active_packages.count()
    if detailed:
        for row in active_packages:
            package_names.append(row.name)

    gather_error_count = (
        model.Session.query(HarvestGatherError)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    object_error_count = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    overall['errors'] = gather_error_count + object_error_count

    return out
def _get_source_status(source, context):
    '''
    Collect job, error and package figures for ``source`` into one dict.

    :param source: harvest source whose jobs are inspected
    :param context: read for ``model`` and ``detailed`` (default ``True``)
    :returns: dict with job count, scheduling state, last harvest
        statistics/errors, overall statistics and package names; only
        ``msg`` is added when there are no jobs
    :rtype: dict
    '''
    model = context.get('model')
    detailed = context.get('detailed', True)

    job_count = HarvestJob.filter(source=source).count()

    last_stats = {'added': 0, 'updated': 0, 'errors': 0}
    last_errors = {'gather': [], 'object': []}
    overall = {'added': 0, 'errors': 0}
    package_names = []
    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
        'last_harvest_statistics': last_stats,
        'last_harvest_errors': last_errors,
        'overall_statistics': overall,
        'packages': package_names,
    }

    if not job_count:
        out['msg'] = 'No jobs yet'
        return out
    out['job_count'] = job_count

    # Get next scheduled job
    if HarvestJob.filter(source=source, status=u'New').first():
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = (
        HarvestJob.filter(source=source, status=u'Finished')
        .order_by(HarvestJob.created.desc())
        .first()
    )
    if not last_job:
        out['last_harvest_request'] = 'Not yet harvested'
        return out

    # TODO: Should we encode the dates as strings?
    out['last_harvest_request'] = str(last_job.gather_finished)

    if detailed:
        # Harvest objects of the last job that are linked to a package
        linked_objects = [
            candidate for candidate in last_job.objects
            if candidate.package is not None
        ]
        # With no linked objects both counters simply stay at 0
        for linked_object in linked_objects:
            # Count every harvest object that points at the same package
            link_count = (
                model.Session.query(HarvestObject)
                .filter(HarvestObject.package == linked_object.package)
                .count()
            )
            if link_count == 1:
                # Only this link exists, so the package has been added
                last_stats['added'] += 1
            else:
                # Package already existed, but it has been updated
                last_stats['updated'] += 1

    # Last harvest errors
    # The gathering errors are in last_job.gather_errors, so also fetch the
    # object errors of that job.
    last_object_errors = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .filter(HarvestObject.job == last_job)
    )
    last_stats['errors'] = (len(last_job.gather_errors) +
                            last_object_errors.count())

    if detailed:
        for gather_error in last_job.gather_errors:
            last_errors['gather'].append(gather_error.message)
        last_errors['object'].extend(
            {
                'object_id': object_error.object.id,
                'object_guid': object_error.object.guid,
                'message': object_error.message,
            }
            for object_error in last_object_errors
        )

    # Overall statistics
    active_packages = (
        model.Session.query(distinct(HarvestObject.package_id), Package.name)
        .join(Package)
        .join(HarvestSource)
        .filter(HarvestObject.source == source)
        .filter(HarvestObject.current == True)  # noqa: E712
        .filter(Package.state == u'active')
    )
    overall['added'] = active_packages.count()
    if detailed:
        for row in active_packages:
            package_names.append(row.name)

    gather_error_count = (
        model.Session.query(HarvestGatherError)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    object_error_count = (
        model.Session.query(HarvestObjectError)
        .join(HarvestObject)
        .join(HarvestJob)
        .join(HarvestSource)
        .filter(HarvestJob.source == source)
        .count()
    )
    overall['errors'] = gather_error_count + object_error_count

    return out