def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. This is useful to clean history of long running
    harvest sources to start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string

    :returns: a dict with the id of the cleared source
    :raises NotFound: if no harvest source exists with the given id
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the canonical id stored on the source object (the caller may have
    # passed a name or partial id that HarvestSource.get resolved).
    harvest_source_id = source.id

    # Clear all datasets from this source from the search index
    harvest_source_index_clear(context, data_dict)

    # The id is interpolated into raw SQL below. It comes straight from the
    # harvest_source table so it is trusted, but escape single quotes anyway
    # as a cheap belt-and-braces guard against SQL injection.
    sql_source_id = harvest_source_id.replace("'", "''")

    sql = '''begin;
    update package set state = 'to_delete' where id in (
        select package_id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_error where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object_extra where harvest_object_id in (
        select id from harvest_object
        where harvest_source_id = '{harvest_source_id}');
    delete from harvest_object where harvest_source_id = '{harvest_source_id}';
    delete from harvest_gather_error where harvest_job_id in (
        select id from harvest_job where source_id = '{harvest_source_id}');
    delete from harvest_job where source_id = '{harvest_source_id}';
    delete from package_role where package_id in (
        select id from package where state = 'to_delete');
    delete from user_object_role where id not in (
        select user_object_role_id from package_role) and context = 'Package';
    delete from resource_revision where resource_group_id in (
        select id from resource_group where package_id in (
            select id from package where state = 'to_delete'));
    delete from resource_group_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from package_tag_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from member_revision where table_id in (
        select id from package where state = 'to_delete');
    delete from package_extra_revision where package_id in (
        select id from package where state = 'to_delete');
    delete from package_revision where id in (
        select id from package where state = 'to_delete');
    delete from package_tag where package_id in (
        select id from package where state = 'to_delete');
    delete from resource where resource_group_id in (
        select id from resource_group where package_id in (
            select id from package where state = 'to_delete'));
    delete from package_extra where package_id in (
        select id from package where state = 'to_delete');
    delete from member where table_id in (
        select id from package where state = 'to_delete');
    delete from resource_group where package_id in (
        select id from package where state = 'to_delete');
    delete from package where id in (
        select id from package where state = 'to_delete');
    commit;'''.format(harvest_source_id=sql_source_id)

    model = context['model']
    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(
        context, {'id': harvest_source_id})
    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source.

    :param id: the id of the harvest source dataset
    :type id: string
    :returns: True on success
    '''
    harvest_source_id = logic.get_or_bust(data_dict, 'id')

    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values before indexing. Fall back to an empty
    # config when the source has none or the stored value is not valid JSON.
    # The previous code raised NameError when 'config' was missing (config
    # was only bound inside the `if`) and used Python 2-only iteritems().
    try:
        config = json.loads(package_dict.get('config') or '{}')
    except ValueError:
        config = {}
    new_dict = {}
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
def harvest_sources_reindex(context, data_dict):
    '''Reindexes all harvest source datasets with the latest status.'''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    # All active harvest-source datasets
    sources = (model.Session.query(model.Package)
               .filter(model.Package.type == DATASET_TYPE_NAME)
               .filter(model.Package.state == u'active')
               .all())

    package_index = PackageSearchIndex()

    for source_pkg in sources:
        # Reset the context flags on every iteration, matching the
        # original per-loop ordering.
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'validate': False, 'ignore_auth': True})
        source_dict = logic.get_action('package_show')(
            context, {'id': source_pkg.id})
        log.debug('Updating search index for harvest source {0}'.format(
            source_pkg.id))
        package_index.index_package(source_dict, defer_commit=True)

    package_index.commit()
    log.info('Updated search index for {0} harvest sources'.format(
        len(sources)))
def harvest_sources_reindex(context, data_dict):
    """Reindexes all harvest source datasets with the latest status."""
    log.info("Reindexing all harvest sources")
    check_access("harvest_sources_reindex", context, data_dict)

    model = context["model"]

    query = model.Session.query(model.Package) \
        .filter(model.Package.type == DATASET_TYPE_NAME) \
        .filter(model.Package.state == u"active")
    harvest_packages = query.all()

    indexer = PackageSearchIndex()
    for pkg in harvest_packages:
        # Per-iteration context reset, as in the original
        if "extras_as_string" in context:
            del context["extras_as_string"]
        context.update({"validate": False, "ignore_auth": True})
        pkg_dict = logic.get_action("package_show")(context, {"id": pkg.id})
        log.debug("Updating search index for harvest source {0}".format(pkg.id))
        indexer.index_package(pkg_dict, defer_commit=True)
    indexer.commit()

    log.info("Updated search index for {0} harvest sources".format(
        len(harvest_packages)))
def harvest_sources_reindex(context, data_dict):
    '''Reindexes all harvest source datasets with the latest status'''
    log.info('Reindexing all harvest sources')
    check_access('harvest_sources_reindex', context, data_dict)

    model = context['model']

    sources = (
        model.Session.query(model.Package)
        .filter(model.Package.type == DATASET_TYPE_NAME)
        .filter(model.Package.state == u'active')
        .all()
    )

    indexer = PackageSearchIndex()
    for source_package in sources:
        # Reset context flags before each action call, matching the
        # original per-iteration ordering.
        if 'extras_as_string' in context:
            del context['extras_as_string']
        context.update({'ignore_auth': True})
        source_dict = logic.get_action('harvest_source_show')(
            context, {'id': source_package.id})
        log.debug('Updating search index for harvest source {0}'.format(
            source_package.id))
        indexer.index_package(source_dict, defer_commit=True)
    indexer.commit()

    log.info('Updated search index for {0} harvest sources'.format(
        len(sources)))
def harvest_source_reindex(context, data_dict):
    '''Reindex a single harvest source.

    :param id: the id of the harvest source dataset
    :type id: string
    :returns: True on success
    '''
    harvest_source_id = logic.get_or_bust(data_dict, 'id')

    defer_commit = context.get('defer_commit', False)

    if 'extras_as_string' in context:
        del context['extras_as_string']
    context.update({'ignore_auth': True})
    package_dict = logic.get_action('harvest_source_show')(
        context, {'id': harvest_source_id})
    log.debug('Updating search index for harvest source: %s',
              package_dict.get('name') or harvest_source_id)

    # Remove configuration values before indexing. The stored 'config' may
    # be absent (get() returns ''), explicitly None (json.loads then raises
    # TypeError, which the previous code did not catch) or invalid JSON
    # (ValueError). Treat all of those as "no config".
    new_dict = {}
    try:
        config = json.loads(package_dict.get('config', ''))
    except (ValueError, TypeError):
        config = {}
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)

    return True
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. Useful to clean the history of a long running
    harvest source and start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    """
    check_access("harvest_source_clear", context, data_dict)

    source_id = data_dict.get("id", None)
    source = HarvestSource.get(source_id)
    if not source:
        log.error("Harvest source %s does not exist", source_id)
        raise NotFound("Harvest source %s does not exist" % source_id)

    # Prefer the canonical id stored on the source object
    harvest_source_id = source.id

    # Drop this source's datasets from the search index first
    harvest_source_index_clear(context, data_dict)

    sql = """begin;
update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');
delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}');
delete from harvest_object where harvest_source_id = '{harvest_source_id}';
delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}');
delete from harvest_job where source_id = '{harvest_source_id}';
delete from package_role where package_id in (select id from package where state = 'to_delete');
delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package';
delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
delete from resource_group_revision where package_id in (select id from package where state = 'to_delete');
delete from package_tag_revision where package_id in (select id from package where state = 'to_delete');
delete from member_revision where table_id in (select id from package where state = 'to_delete');
delete from package_extra_revision where package_id in (select id from package where state = 'to_delete');
delete from package_revision where id in (select id from package where state = 'to_delete');
delete from package_tag where package_id in (select id from package where state = 'to_delete');
delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete'));
delete from package_extra where package_id in (select id from package where state = 'to_delete');
delete from member where table_id in (select id from package where state = 'to_delete');
delete from resource_group where package_id in (select id from package where state = 'to_delete');
delete from package where id in (select id from package where state = 'to_delete');
commit;""".format(harvest_source_id=harvest_source_id)

    model = context["model"]
    model.Session.execute(sql)

    # Reindex the source dataset so its status object is up to date
    context.update({"validate": False, "ignore_auth": True})
    source_dict = logic.get_action("package_show")(
        context, {"id": harvest_source_id})
    if source_dict:
        PackageSearchIndex().index_package(source_dict)

    return {"id": harvest_source_id}
def run_job_synchronously(self): import datetime from ckan import model from ckan.plugins import PluginImplementations from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject from ckanext.harvest.queue import fetch_and_import_stages from ckan.lib.search.index import PackageSearchIndex package_index = PackageSearchIndex() source_id = unicode(self.args[1]) source = HarvestSource.get(source_id) for harvester in PluginImplementations(IHarvester): if harvester.info()['name'] == source.type: break else: print "No harvester found to handle the job." return job = HarvestJob() job.source = source job.status = "Running" job.gather_started = datetime.datetime.utcnow() job.save() try: harvest_object_ids = harvester.gather_stage(job) job.gather_finished = datetime.datetime.utcnow() job.save() for obj_id in harvest_object_ids: obj = HarvestObject.get(obj_id) obj.retry_times += 1 obj.save() fetch_and_import_stages(harvester, obj) job.finished = datetime.datetime.utcnow() job.status = "Done" job.save() # And reindex the harvest source so it gets its counts right. # Must call update on a data_dict as returned by package_show, not the class object. package_index.index_package( get_action('package_show')({ 'validate': False, 'ignore_auth': True }, { 'id': source.id })) finally: job.finished = datetime.datetime.utcnow() if job.status != "Done": job.status = "Error" job.save()
def _update_search_index(package_id, log):
    '''
    Tells CKAN to update its search index for a given package.
    '''
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex

    index = PackageSearchIndex()
    show_context = {
        'model': model,
        'ignore_auth': True,
        'session': model.Session,
        'use_cache': False,
        'validate': False,
    }
    pkg_dict = toolkit.get_action('package_show')(
        show_context, {'id': package_id})
    index.index_package(pkg_dict, defer_commit=False)
    log.info('Search indexed %s', pkg_dict['name'])
def _update_search_index(package_id, log):
    """
    Tells CKAN to update its search index for a given package.
    """
    from ckan import model
    from ckan.lib.search.index import PackageSearchIndex

    search_index = PackageSearchIndex()

    ctx = {"model": model,
           "session": model.Session,
           "ignore_auth": True,
           "use_cache": False,
           "validate": False}

    package = toolkit.get_action("package_show")(ctx, {"id": package_id})
    search_index.index_package(package, defer_commit=False)
    log.info("Search indexed %s", package["name"])
def run_job_synchronously(self): import datetime from ckan import model from ckan.plugins import PluginImplementations from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject from ckanext.harvest.queue import fetch_and_import_stages from ckan.lib.search.index import PackageSearchIndex package_index = PackageSearchIndex() source_id = unicode(self.args[1]) source = HarvestSource.get(source_id) for harvester in PluginImplementations(IHarvester): if harvester.info()['name'] == source.type: break else: print "No harvester found to handle the job." return job = HarvestJob() job.source = source job.status = "Running" job.gather_started = datetime.datetime.utcnow() job.save() try: harvest_object_ids = harvester.gather_stage(job) job.gather_finished = datetime.datetime.utcnow() job.save() for obj_id in harvest_object_ids: obj = HarvestObject.get(obj_id) obj.retry_times += 1 obj.save() fetch_and_import_stages(harvester, obj) job.finished = datetime.datetime.utcnow() job.status = "Done" job.save() # And reindex the harvest source so it gets its counts right. # Must call update on a data_dict as returned by package_show, not the class object. package_index.index_package(get_action('package_show')({'validate': False, 'ignore_auth': True}, {'id': source.id})) finally: job.finished = datetime.datetime.utcnow() if job.status != "Done": job.status = "Error" job.save()
def harvest_source_reindex(context, data_dict):
    """Reindex a single harvest source.

    :param id: the id of the harvest source dataset
    :type id: string
    :returns: True on success
    """
    harvest_source_id = logic.get_or_bust(data_dict, "id")
    defer_commit = context.get("defer_commit", False)

    if "extras_as_string" in context:
        del context["extras_as_string"]
    context.update({"ignore_auth": True})
    package_dict = logic.get_action("harvest_source_show")(
        context, {"id": harvest_source_id})
    log.debug("Updating search index for harvest source {0}".format(
        harvest_source_id))

    # Strip out keys that belong to the harvester configuration before
    # indexing. 'config' may be missing/empty or hold invalid JSON; default
    # to {} in those cases. The previous code hit a NameError when 'config'
    # was absent (config only bound inside the `if`) and relied on the
    # Python 2-only dict.iteritems().
    try:
        config = json.loads(package_dict.get("config") or "{}")
    except ValueError:
        config = {}
    new_dict = {}
    for key, value in package_dict.items():
        if key not in config:
            new_dict[key] = value

    package_index = PackageSearchIndex()
    package_index.index_package(new_dict, defer_commit=defer_commit)
    return True
def import_stage(self, harvest_object):
    '''Create, update or delete a dataset from a harvested ISO document.

    Returns True on success (including deletions and unchanged documents),
    False on any failure; failures are recorded via _save_object_error.
    Reformatted for readability; logic unchanged.
    '''
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
    }

    log = logging.getLogger(__name__ + '.import')
    log.debug('Import stage for harvest object: %s', harvest_object.id)

    if not harvest_object:
        log.error('No harvest object received')
        return False

    self._set_source_config(harvest_object.source.config)

    # force_import bypasses the status recorded on the object and treats
    # the document as changed
    if self.force_import:
        status = 'change'
    else:
        status = self._get_object_extra(harvest_object, 'status')

    # Get the last harvested object (if any)
    previous_object = model.Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True).first()  # noqa

    if status == 'delete':
        # Delete package
        context.update({
            'ignore_auth': True,
        })
        if harvest_object.package_id:
            p.toolkit.get_action('package_delete')(
                context, {'id': harvest_object.package_id})
            log.info('Deleted package {0} with guid {1}'.format(
                harvest_object.package_id, harvest_object.guid))
        return True

    # Check if it is a non ISO document
    original_document = self._get_object_extra(harvest_object,
                                               'original_document')
    original_format = self._get_object_extra(harvest_object,
                                             'original_format')
    if original_document and original_format:
        # DEPRECATED use the ISpatialHarvester interface method
        self.__base_transform_to_iso_called = False
        content = self.transform_to_iso(original_document,
                                        original_format,
                                        harvest_object)
        if not self.__base_transform_to_iso_called:
            log.warn(
                'Deprecation warning: calling transform_to_iso directly is deprecated. ' +
                'Please use the ISpatialHarvester interface method instead.')

        # Plugins get the last word on the transformation result
        for harvester in p.PluginImplementations(ISpatialHarvester):
            content = harvester.transform_to_iso(original_document,
                                                 original_format,
                                                 harvest_object)

        if content:
            harvest_object.content = content
        else:
            self._save_object_error('Transformation to ISO failed',
                                    harvest_object, 'Import')
            return False
    else:
        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object {0}'.format(harvest_object.id),
                harvest_object, 'Import')
            return False

        # Validate ISO document
        is_valid, profile, errors = self._validate_document(
            harvest_object.content, harvest_object)
        if not is_valid:
            # If validation errors were found, import will stop unless
            # configuration per source or per instance says otherwise
            continue_import = p.toolkit.asbool(
                config.get('ckanext.spatial.harvest.continue_on_validation_errors',
                           False)) or \
                self.source_config.get('continue_on_validation_errors')
            if not continue_import:
                return False

    # Parse ISO document
    try:
        iso_parser = ISODocument(harvest_object.content)
        iso_values = iso_parser.read_values()
    except Exception as e:
        self._save_object_error(
            'Error parsing ISO document for object {0}: {1}'.format(
                harvest_object.id, six.text_type(e)),
            harvest_object, 'Import')
        return False

    # Flag previous object as not current anymore
    if previous_object and not self.force_import:
        previous_object.current = False
        previous_object.add()

    # Update GUID with the one on the document
    iso_guid = iso_values['guid']
    if iso_guid and harvest_object.guid != iso_guid:
        # First make sure there already aren't current objects
        # with the same guid
        existing_object = model.Session.query(HarvestObject.id) \
            .filter(HarvestObject.guid == iso_guid) \
            .filter(HarvestObject.current == True).first()  # noqa
        if existing_object:
            self._save_object_error(
                'Object {0} already has this guid {1}'.format(
                    existing_object.id, iso_guid),
                harvest_object, 'Import')
            return False

        harvest_object.guid = iso_guid
        harvest_object.add()

    # Generate GUID if not present (i.e. it's a manual import)
    if not harvest_object.guid:
        m = hashlib.md5()
        m.update(harvest_object.content.encode('utf8', 'ignore'))
        harvest_object.guid = m.hexdigest()
        harvest_object.add()

    # Get document modified date
    try:
        metadata_modified_date = dateutil.parser.parse(
            iso_values['metadata-date'], ignoretz=True)
    except ValueError:
        self._save_object_error(
            'Could not extract reference date for object {0} ({1})'.format(
                harvest_object.id, iso_values['metadata-date']),
            harvest_object, 'Import')
        return False

    harvest_object.metadata_modified_date = metadata_modified_date
    harvest_object.add()

    # Build the package dict
    package_dict = self.get_package_dict(iso_values, harvest_object)
    # Let plugins post-process (or veto, by returning a falsy dict) the
    # package dict
    for harvester in p.PluginImplementations(ISpatialHarvester):
        package_dict = harvester.get_package_dict(context, {
            'package_dict': package_dict,
            'iso_values': iso_values,
            'xml_tree': iso_parser.xml_tree,
            'harvest_object': harvest_object,
        })
    if not package_dict:
        log.error(
            'No package dict returned, aborting import for object {0}'.format(
                harvest_object.id))
        return False

    # Create / update the package
    context.update({
        'extras_as_string': True,
        'api_version': '2',
        'return_id_only': True})

    if self._site_user and context['user'] == self._site_user['name']:
        context['ignore_auth'] = True

    # The default package schema does not like Upper case tags
    tag_schema = logic.schema.default_tags_schema()
    tag_schema['name'] = [not_empty, six.text_type]

    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()

    if status == 'new':
        package_schema = logic.schema.default_create_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        # We need to explicitly provide a package ID, otherwise ckanext-spatial
        # won't be be able to link the extent to the package.
        package_dict['id'] = six.text_type(uuid.uuid4())
        package_schema['id'] = [six.text_type]

        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()

        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        model.Session.execute(
            'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()

        try:
            package_id = p.toolkit.get_action('package_create')(
                context, package_dict)
            log.info('Created new package %s with guid %s',
                     package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error(
                'Validation Error: %s' % six.text_type(e.error_summary),
                harvest_object, 'Import')
            return False

    elif status == 'change':
        # Check if the modified date is more recent
        if not self.force_import and previous_object \
                and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
            # Assign the previous job id to the new object to
            # avoid losing history
            harvest_object.harvest_job_id = previous_object.job.id
            harvest_object.add()

            # Delete the previous object to avoid cluttering the object table
            previous_object.delete()

            # Reindex the corresponding package to update the reference to the
            # harvest object
            if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False'
                    or self.source_config.get('reindex_unchanged') != 'False')
                    and harvest_object.package_id):
                context.update({'validate': False, 'ignore_auth': True})
                try:
                    package_dict = logic.get_action('package_show')(
                        context, {'id': harvest_object.package_id})
                except p.toolkit.ObjectNotFound:
                    pass
                else:
                    for extra in package_dict.get('extras', []):
                        if extra['key'] == 'harvest_object_id':
                            extra['value'] = harvest_object.id
                    if package_dict:
                        package_index = PackageSearchIndex()
                        package_index.index_package(package_dict)

            log.info('Document with GUID %s unchanged, skipping...'
                     % (harvest_object.guid))
        else:
            package_schema = logic.schema.default_update_package_schema()
            package_schema['tags'] = tag_schema
            context['schema'] = package_schema

            package_dict['id'] = harvest_object.package_id
            try:
                package_id = p.toolkit.get_action('package_update')(
                    context, package_dict)
                log.info('Updated package %s with guid %s',
                         package_id, harvest_object.guid)
            except p.toolkit.ValidationError as e:
                self._save_object_error(
                    'Validation Error: %s' % six.text_type(e.error_summary),
                    harvest_object, 'Import')
                return False

    model.Session.commit()

    return True
def import_stage(self, harvest_object):
    '''Create, update or delete a dataset from a harvested document.

    Returns True on success, the string "unchanged" when the remote
    document's md5 matches the previous harvest, and False on failure
    (failures are recorded via _save_object_error).
    Reformatted for readability; logic unchanged.
    '''
    log = logging.getLogger(__name__ + '.import')
    log.debug('%s: Import stage for harvest object: %s',
              self.harvester_name(), harvest_object.id)

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if not harvest_object.content:
        log.error('Harvest object contentless')
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_source_config(harvest_object.source.config)

    status = self._get_object_extra(harvest_object, 'status')

    # Get the last harvested object (if any)
    previous_object = Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True) \
        .first()

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    if status == 'delete':
        # Delete package
        p.toolkit.get_action('package_delete')(
            context, {'id': harvest_object.package_id})
        log.info('Deleted package {0} with guid {1}'.format(
            harvest_object.package_id, harvest_object.guid))
        return True

    # Flag previous object as not current anymore
    if previous_object:
        previous_object.current = False
        previous_object.add()

    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()

    # Generate GUID if not present (i.e. it's a manual import)
    if not harvest_object.guid:
        self._save_object_error(
            'Missing GUID for object {0}'.format(harvest_object.id),
            harvest_object, 'Import')
        return False

    # pre-check to skip resource logic in case no changes occurred remotely
    if status == 'change':
        # Check if the document has changed
        m = hashlib.md5()
        m.update(previous_object.content.encode())
        old_md5 = m.hexdigest()

        m = hashlib.md5()
        m.update(harvest_object.content.encode())
        new_md5 = m.hexdigest()

        if old_md5 == new_md5:
            # Assign the previous job id to the new object to
            # avoid losing history
            harvest_object.harvest_job_id = previous_object.job.id
            harvest_object.add()
            harvest_object.metadata_modified_date = \
                previous_object.metadata_modified_date
            harvest_object.add()

            # Delete the previous object to avoid cluttering the object table
            previous_object.delete()

            # Reindex the corresponding package to update the reference to
            # the harvest object
            context.update({'validate': False, 'ignore_auth': True})
            try:
                package_dict = logic.get_action('package_show')(
                    context, {'id': harvest_object.package_id})
            except p.toolkit.ObjectNotFound:
                pass
            else:
                for extra in package_dict.get('extras', []):
                    if extra['key'] == 'harvest_object_id':
                        extra['value'] = harvest_object.id
                if package_dict:
                    package_index = PackageSearchIndex()
                    package_index.index_package(package_dict)

            log.info('%s document with GUID %s unchanged, skipping...',
                     self.harvester_name(), harvest_object.guid)
            model.Session.commit()

            return "unchanged"

    # Build the package dict
    package_dict, metadata = self.create_package_dict(
        harvest_object.guid, harvest_object.content)

    if not package_dict:
        log.error(
            'No package dict returned, aborting import for object {0}'.format(
                harvest_object.id))
        return False

    package_dict['name'] = self._gen_new_name(package_dict['title'])

    # We need to get the owner organization (if any) from the harvest
    # source dataset
    source_dataset = model.Package.get(harvest_object.source.id)
    if source_dataset.owner_org:
        package_dict['owner_org'] = source_dataset.owner_org

    self.attach_resources(metadata, package_dict, harvest_object)

    # Create / update the package
    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name(),
        'extras_as_string': True,
        'api_version': '2',
        'return_id_only': True
    }
    if context['user'] == self._site_user['name']:
        context['ignore_auth'] = True

    # The default package schema does not like Upper case tags
    tag_schema = logic.schema.default_tags_schema()
    tag_schema['name'] = [not_empty]

    if status == 'new':
        package_schema = logic.schema.default_create_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        # We need to explicitly provide a package ID, otherwise ckanext-spatial
        # won't be be able to link the extent to the package.
        package_dict['id'] = uuid.uuid4().hex
        package_schema['id'] = []

        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()

        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        Session.execute(
            'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()

        try:
            package_id = p.toolkit.get_action('package_create')(
                context, package_dict)
            log.info('%s: Created new package %s with guid %s',
                     self.harvester_name(), package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error(
                'Validation Error: %s' % str(e.error_summary),
                harvest_object, 'Import')
            return False
    elif status == 'change':
        # we know the internal document did change, bc of a md5 hash
        # comparison done above
        package_schema = logic.schema.default_update_package_schema()
        package_schema['tags'] = tag_schema
        context['schema'] = package_schema

        package_dict['id'] = harvest_object.package_id
        try:
            package_id = p.toolkit.get_action('package_update')(
                context, package_dict)
            log.info('%s updated package %s with guid %s',
                     self.harvester_name(), package_id, harvest_object.guid)
        except p.toolkit.ValidationError as e:
            self._save_object_error(
                'Validation Error: %s' % str(e.error_summary),
                harvest_object, 'Import')
            return False

    model.Session.commit()

    return True
def harvest_jobs_run(context, data_dict):
    '''Flag finished harvest jobs as such, then send pending jobs to the
    gather queue.

    :param source_id: optional harvest source to restrict the run to
    :returns: the list of jobs sent to the gather queue
    :raises Exception: when there are no new harvest jobs to run
    Reformatted for readability; logic unchanged.
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                # Objects still in flight (neither COMPLETE nor ERROR)
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Use the latest import_finished as the job end time
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_jobs_run(context, data_dict):
    """Flag finished harvest jobs as such, then send pending jobs to the
    gather queue.

    :param source_id: optional harvest source to restrict the run to
    :returns: the list of jobs sent to the gather queue
    :raises Exception: when there are no new harvest jobs to run
    Reformatted for readability; logic unchanged.
    """
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                # Objects still in flight (neither COMPLETE nor ERROR)
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"),
                                 (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    # Use the latest import_finished as the job end time
                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(
                        context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_jobs_run(context, data_dict):
    '''Flag finished harvest jobs as such, then send pending jobs to the
    gather queue.

    :param source_id: optional harvest source to restrict the run to
    :returns: the list of jobs sent to the gather queue
    :raises Exception: when there are no new harvest jobs to run
    Reformatted for readability; logic unchanged.
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                # Objects still in flight (neither COMPLETE nor ERROR)
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Use the latest import_finished as the job end time
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs