def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
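# Usage sketch (illustrative; the context and user names here are assumptions,
# not from this module): queue a new job for a registered source through the
# CKAN action API. Jobs are created with status u'New' and picked up later by
# harvest_jobs_run.
#
#   from ckan.plugins.toolkit import get_action
#   job_dict = get_action('harvest_job_create')(
#       {'user': 'harvest-admin'}, {'source_id': 'my-source-id'})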
def harvest_source_index_clear(context, data_dict):
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        log.exception(e)
        raise SearchIndexError(e)
    finally:
        conn.close()

    return {'id': harvest_source_id}
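# For reference, the Solr delete query built above renders as (placeholder
# values are illustrative):
#
#   +harvest_source_id:"<source-id>" +site_id:"<ckan.site_id>"
#
# i.e. only documents indexed from this source on this CKAN instance are
# removed from the search index.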
def harvest_source_id_exists(value, context):

    result = HarvestSource.get(value, None)

    if not result:
        raise Invalid('Harvest Source with id %r does not exist.' % str(value))
    return value
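# Illustrative use as a navl validator in a form schema (the schema below is
# hypothetical; this module only defines the validator itself):
#
#   schema = {
#       'source_id': [not_empty, unicode, harvest_source_id_exists],
#   }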
def after_show(self, context, data_dict):

    if 'type' in data_dict and data_dict['type'] == DATASET_TYPE_NAME:
        # This is a harvest source dataset, add extra info from the
        # HarvestSource object
        source = HarvestSource.get(data_dict['id'])
        if not source:
            log.error('Harvest source not found for dataset {0}'.format(
                data_dict['id']))
            return data_dict

        data_dict['status'] = p.toolkit.get_action(
            'harvest_source_show_status')(context, {'id': source.id})

    elif 'type' not in data_dict or data_dict['type'] != DATASET_TYPE_NAME:
        # This is a normal dataset, check if it was harvested and, if so,
        # add info about the HarvestObject and HarvestSource
        harvest_object = model.Session.query(HarvestObject) \
            .filter(HarvestObject.package_id == data_dict['id']) \
            .filter(HarvestObject.current == True) \
            .first()

        # If the harvest extras are there, remove them. This can happen e.g.
        # when calling package_update or resource_update, which call
        # package_show
        if data_dict.get('extras'):
            data_dict['extras'][:] = [
                e for e in data_dict.get('extras', [])
                if e['key'] not in ('harvest_object_id',
                                    'harvest_source_id',
                                    'harvest_source_title',)]

        # We only want to add these extras at index time so they are part
        # of the cached data_dict used to display, search results etc. We
        # don't want them added when editing the dataset, otherwise we get
        # duplicated key errors.
        # The only way to detect indexing right now is checking that
        # validate is set to False.
        if harvest_object and not context.get('validate', True):
            for key, value in [
                    ('harvest_object_id', harvest_object.id),
                    ('harvest_source_id', harvest_object.source.id),
                    ('harvest_source_title', harvest_object.source.title),
                    ]:
                _add_extra(data_dict, key, value)

    return data_dict
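# _add_extra is used above but defined elsewhere in this extension. A minimal
# sketch of what it presumably does, assuming extras are {'key': ...,
# 'value': ...} dicts on the package data_dict (illustrative, not the
# shipped helper):
def _add_extra_sketch(data_dict, key, value):
    if 'extras' not in data_dict:
        data_dict['extras'] = []
    # Package extras are stored as strings in CKAN
    data_dict['extras'].append({'key': key, 'value': unicode(value)})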
def _delete_harvest_source_object(context, data_dict):
    '''
    Deletes an actual HarvestSource object with the id provided on the
    data dict of the harvest_source dataset. As with datasets, the source
    object is not actually deleted, just flagged as inactive.
    All validation and authorization checks should have been done by this
    point, so this function is not meant to be called directly to delete
    harvest sources.

    :param data_dict: A standard package data_dict

    :returns: The deleted HarvestSource object
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    # Remove the corresponding job document from MongoDB
    db = client.odm
    collection = db.jobs
    collection.remove({"base_url": data_dict['url']})

    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist'
                                       % source_id)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    jobs = HarvestJob.filter(source=source, status=u'New')
    if jobs:
        log.info('Aborting %i jobs due to deleted harvest source',
                 jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
def gather_callback(channel, method, header, body):
    text_file = open(
        "/var/local/ckan/default/pyenv/src/ckanext-harvestodm/"
        "ckanext/harvestodm/Gather_log.txt", "a")
    db = client.odm
    db1 = db.jobs
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
        text_file.write('\n')
        text_file.write(str(datetime.datetime.now()))
        text_file.write(' Received harvest job id: %s' % id)
        text_file.write('\n')
    except KeyError:
        log.error('No harvest job id received')
        text_file.write('No harvest job id received')
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)
    # Check this before dereferencing the job; the original code only checked
    # after accessing job.source_id, which would have raised first
    if not job:
        log.error('Harvest job does not exist: %s' % id)
        text_file.write('Harvest job does not exist: %s' % id)
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    from ckanext.harvestodm.model import HarvestSource
    harvest_source_info = HarvestSource.get(job.source_id)
    cat_url = harvest_source_info.url
    text_file.write('catalogue url: ' + str(cat_url))
    text_file.write('\n')

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                # Call the gather stage for the job
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject) \
                    .filter_by(harvest_job_id=job.id)
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                # Record the failure on the MongoDB job document, keeping the
                # previous 'gathered' value in 'last_gathered'
                document = db1.find_one({"base_url": cat_url})
                if 'gathered' not in document.keys():
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": 0})
                else:
                    temp_gathered = document['gathered']
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": temp_gathered})
                db1.save(document)
                text_file.write('Gather stage failed')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                text_file.write('No harvest objects to fetch')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug(
                'Received from plugin gather_stage: {0} objects '
                '(first: {1} last: {2})'.format(
                    len(harvest_object_ids),
                    harvest_object_ids[:1], harvest_object_ids[-1:]))
            try:
                for id in harvest_object_ids:
                    # Send the id to the fetch queue
                    publisher.send({'harvest_object_id': id})
            except Exception:
                # The publisher connection may have dropped while gathering;
                # reconnect once and resend all the ids
                publisher = get_fetch_publisher()
                for id in harvest_object_ids:
                    publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(
                len(harvest_object_ids)))

            # Record the number of gathered objects on the MongoDB job
            # document, keeping the previous value in 'last_gathered'
            document = db1.find_one({"base_url": cat_url})
            if 'gathered' not in document.keys():
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": 0})
            else:
                temp_gathered = document['gathered']
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": temp_gathered})
            db1.save(document)
            text_file.write('Sent {0} objects to the fetch queue'.format(
                len(harvest_object_ids)))
            text_file.write('\n')

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
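# For reference, the message payloads on the two queues (reconstructed from
# the send/consume calls in this module; the ids are illustrative):
#
#   gather queue: {'harvest_job_id': '3f6c2c...'}    -> handled by gather_callback
#   fetch queue:  {'harvest_object_id': 'a91b7d...'} -> sent via publisher.send()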
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict of the
    harvest_source dataset. All validation and authorization checks should
    have been done by this point, so this function is not meant to be called
    directly to update harvest sources.

    :param data_dict: A standard package data_dict

    :returns: The updated HarvestSource object
    :rtype: HarvestSource object
    '''
    language_mappings = {
        'English': 'en', 'Bulgarian': 'bg', 'Croatian': 'hr', 'Czech': 'cs',
        'Danish': 'da', 'Icelandic': 'is', 'German': 'de', 'Greek': 'el',
        'Spanish': 'es', 'Estonian': 'et', 'Finnish': 'fi', 'French': 'fr',
        'Hungarian': 'hu', 'Italian': 'it', 'Lithuanian': 'lt',
        'Latvian': 'lv', 'Maltese': 'mt', 'Dutch': 'nl', 'Polish': 'pl',
        'Portuguese': 'pt', 'Romanian': 'ro', 'Slovak': 'sk',
        'Swedish': 'sv', 'Ukrainian': 'uk', 'Norwegian': 'no'}

    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = ['url', 'title', 'description', 'user_id',
              'publisher_id', 'frequency']
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            source.__setattr__(f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    client = pymongo.MongoClient(str(mongoclient), int(mongoport))
    db = client.odm
    db_jobs = db.jobs

    # Derive the catalogue base URL (scheme plus host) for 'html' sources;
    # other source types keep the full URL
    if source.type == 'html':
        if 'http' in source.url and 'https' not in source.url:
            base_url1 = source.url[7:]
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'http://' + str(base_url1)
        if 'https' in source.url:
            base_url1 = source.url[8:]
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'https://' + str(base_url1)
    else:
        base_url = source.url

    # Refresh the corresponding job document in MongoDB with the updated
    # source details, preserving fields set by earlier runs
    job1 = db_jobs.find_one({"cat_url": base_url})
    if job1 is not None:
        job = {
            "cat_url": str(base_url),
            "base_url": str(source.url),
            "type": str(source.type),
            "id": str(source.id),
            "description": str(job1['description']),
            "frequency": str(source.frequency),
            "title": str(source.title),
            'country': str(data_dict['__extras']['catalogue_country']),
            'language': language_mappings[str(data_dict['__extras']['language'])],
            'catalogue_date_created':
                str(data_dict['__extras']['catalogue_date_created']),
            'catalogue_date_updated':
                str(data_dict['__extras']['catalogue_date_updated']),
            'user': str(job1['user']),
        }
        if 'harmonisation' in job1.keys():
            job.update({'harmonisation': job1['harmonisation']})
        if 'official' in job1.keys():
            job.update({'official': job1['official']})
        if 'date_harvested' in job1.keys():
            job.update({'date_harvested': job1['date_harvested']})
        else:
            job.update({'date_harvested': datetime.datetime.now()})
        db_jobs.remove({'id': job1['id']})
        db_jobs.save(job)

    return source
def _create_harvest_source_object(context, data_dict):
    '''
    Creates an actual HarvestSource object with the data dict of the
    harvest_source dataset. All validation and authorization checks should
    have been done by this point, so this function is not meant to be called
    directly to create harvest sources. The created harvest source will have
    the same id as the dataset.

    :param data_dict: A standard package data_dict

    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    log.info('Creating harvest source: %r', data_dict)

    source = HarvestSource()
    language_mappings = {
        'English': 'en', 'Bulgarian': 'bg', 'Croatian': 'hr', 'Czech': 'cs',
        'Danish': 'da', 'German': 'de', 'Greek': 'el', 'Spanish': 'es',
        'Estonian': 'et', 'Finnish': 'fi', 'French': 'fr', 'Hungarian': 'hu',
        'Italian': 'it', 'Lithuanian': 'lt', 'Latvian': 'lv',
        'Icelandic': 'is', 'Maltese': 'mt', 'Dutch': 'nl', 'Polish': 'pl',
        'Portuguese': 'pt', 'Romanian': 'ro', 'Slovak': 'sk',
        'Swedish': 'sv', 'Ukrainian': 'uk', 'Norwegian': 'no'}

    source.id = data_dict['id']
    source.url = data_dict['url'].strip()
    source.catalogue_country = data_dict['catalogue_country']
    if data_dict['language'] in language_mappings.keys():
        source.language = language_mappings[str(data_dict['language'])]
    else:
        source.language = str(data_dict['language'])
    source.catalogue_date_created = data_dict['catalogue_date_created']
    source.catalogue_date_updated = data_dict['catalogue_date_updated']

    # Avoids clashes with the dataset type
    source.type = data_dict['source_type']
    source.description = data_dict['notes']

    # Derive the catalogue base URL (scheme plus host) for 'html' sources;
    # other source types keep the full URL
    if source.type == 'html':
        if 'http' in source.url and 'https' not in source.url:
            base_url1 = source.url[7:]
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'http://' + str(base_url1)
        if 'https' in source.url:
            base_url1 = source.url[8:]
            if '/' in base_url1:
                base_url1 = base_url1[:base_url1.find('/')]
            base_url = 'https://' + str(base_url1)
    else:
        base_url = source.url

    opt = ['active', 'title', 'description', 'user_id',
           'publisher_id', 'config', 'frequency']
    for o in opt:
        if o in data_dict and data_dict[o] is not None:
            source.__setattr__(o, data_dict[o])

    source.active = not data_dict.get('state', None) == 'deleted'

    # Don't commit yet, let package_create do it
    source.add()
    log.info('Harvest source created: %s', source.id)

    # Save the job to MongoDB
    client = pymongo.MongoClient(str(mongoclient), int(mongoport))
    job = {
        "cat_url": str(base_url),
        "base_url": str(source.url),
        "type": str(source.type),
        "id": str(source.id),
        "description": str(source.description),
        "frequency": str(source.frequency),
        "title": str(source.title),
        'country': str(source.catalogue_country),
        'language': str(source.language),
        'catalogue_date_created': str(source.catalogue_date_created),
        'catalogue_date_updated': str(source.catalogue_date_updated),
        'date_harvested': datetime.datetime.now(),
        'user': str(c.user),
    }
    # Copy over the optional harvesting configuration keys, if provided
    for key in ('metadata_mappings', 'datasets_list_url', 'dataset_url',
                'datasets_list_identifier', 'dataset_id'):
        if key in data_dict.keys():
            job.update({key: data_dict[key]})
    if 'apikey' in data_dict['__extras'].keys():
        job.update({"apikey": data_dict['__extras']["apikey"]})
    db = client.odm
    collection = db.jobs
    collection.save(job)

    return source
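# The scheme-plus-host extraction above (duplicated in
# _update_harvest_source_object) slices strings by hand. A minimal equivalent
# sketch using the standard library, shown for clarity; it is an alternative,
# not what this module ships:
from urlparse import urlparse  # Python 2, matching this codebase; urllib.parse on Python 3


def _base_url_sketch(url):
    # 'http://data.example.org/catalogue' -> 'http://data.example.org'
    parts = urlparse(url)
    return '%s://%s' % (parts.scheme, parts.netloc)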
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    # All objects are done: create a harmonisation job for
                    # harvest jobs that finished the fetch stage
                    harvest_source_info = HarvestSource.get(job['source_id'])
                    cat_url = harvest_source_info.url
                    title = harvest_source_info.title
                    db = client.odm
                    collection = db.jobs
                    document = collection.find_one({"title": title})

                    # If the job exists then create a harmonisation job
                    if document is not None:
                        harmonisation_job = {}
                        harmonisation_job['id'] = document['id']
                        harmonisation_job['cat_url'] = document['cat_url']
                        try:
                            harmonised = document['harmonised']
                        except KeyError:
                            harmonised = "not yet"
                        harmonisation_job['harmonised'] = harmonised
                        harmonisation_job['status'] = "pending"
                        harmonisation_job['dates'] = "dates_selected"
                        harmonisation_job['countries'] = "countries_selected"
                        harmonisation_job['catalogue_selection'] = title
                        harmonisation_job['languages'] = "languages_selected"
                        harmonisation_job['resources'] = "resources_selected"
                        harmonisation_job['licenses'] = "licenses_selected"
                        harmonisation_job['categories'] = "categories_selected"
                        harmonisation_job['save'] = "go-harmonisation-complete"

                        # Save the harmonisation job to the db
                        collection_harmonise_jobs = db.harmonise_jobs
                        collection_harmonise_jobs.save(harmonisation_job)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Note: the original filtered on the Python expression
                    # `import_finished is not None`, which is always True;
                    # isnot(None) produces the intended SQL comparison
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished.isnot(None)) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    try:
                        get_action('harvest_source_reindex')(
                            context, {'id': job_obj.source.id})
                    except Exception:
                        pass

    # Resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
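# harvest_jobs_run is normally triggered periodically rather than per request.
# A minimal invocation sketch (the context keys and site_user variable are
# assumptions; in stock ckanext-harvest this is what the `paster harvester
# run` command does):
#
#   from ckan.plugins.toolkit import get_action
#   sent = get_action('harvest_jobs_run')(
#       {'model': model, 'session': model.Session, 'user': site_user}, {})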
def harvest_objects_import(context, data_dict):
    '''
    Reimports the current harvest objects.

    It performs the import stage with the last fetched objects, optionally
    belonging to a certain source. Please note that no objects will be
    fetched from the remote server. It will only affect the last fetched
    objects already present in the database.
    '''
    log.info('Harvest objects import: %r', data_dict)
    check_access('harvest_objects_import', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('source_id', None)
    harvest_object_id = data_dict.get('harvest_object_id', None)
    package_id_or_name = data_dict.get('package_id', None)

    segments = context.get('segments', None)

    join_datasets = context.get('join_datasets', True)

    if source_id:
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            raise NotFound('Harvest source %s does not exist' % source_id)

        if not source.active:
            log.warn('Harvest source %s is not active.', source_id)
            raise Exception('This harvest source is not active')

        last_objects_ids = session.query(HarvestObject.id) \
            .join(HarvestSource) \
            .filter(HarvestObject.source == source) \
            .filter(HarvestObject.current)
    elif harvest_object_id:
        last_objects_ids = session.query(HarvestObject.id) \
            .filter(HarvestObject.id == harvest_object_id)
    elif package_id_or_name:
        last_objects_ids = session.query(HarvestObject.id) \
            .join(Package) \
            .filter(HarvestObject.current) \
            .filter(Package.state == u'active') \
            .filter(or_(Package.id == package_id_or_name,
                        Package.name == package_id_or_name))
        join_datasets = False
    else:
        last_objects_ids = session.query(HarvestObject.id) \
            .filter(HarvestObject.current)

    if join_datasets:
        last_objects_ids = last_objects_ids.join(Package) \
            .filter(Package.state == u'active')

    last_objects_ids = last_objects_ids.all()

    last_objects_count = 0
    for obj_id in last_objects_ids:
        # Segments allow sharding the reimport across parallel processes:
        # only handle objects whose md5(id) starts with one of the given
        # hex characters
        if segments and \
                str(hashlib.md5(obj_id[0]).hexdigest())[0] not in segments:
            continue

        obj = session.query(HarvestObject).get(obj_id)

        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == obj.source.type:
                if hasattr(harvester, 'force_import'):
                    harvester.force_import = True
                harvester.import_stage(obj)
                break

        last_objects_count += 1

    log.info('Harvest objects imported: %s', last_objects_count)
    return last_objects_count
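# Usage sketch (illustrative context/user names): reimport everything fetched
# for one source, or shard the work across processes with the 'segments'
# context key
#
#   count = get_action('harvest_objects_import')(
#       {'model': model, 'session': model.Session, 'user': admin_user},
#       {'source_id': 'my-source-id'})
#
#   # Process only objects whose md5(id) starts with 0-7:
#   context = {'model': model, 'session': model.Session,
#              'user': admin_user, 'segments': '01234567'}
#   get_action('harvest_objects_import')(context, {})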
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. This is useful to clean the history of
    long-running harvest sources and start again fresh.

    :param id: the id of the harvest source to clear
    :type id: string
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)
    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    model = context['model']

    # Collect the ids of 'related' items attached to datasets from this source
    sql = """select id from related where id in (
                select related_id from related_dataset where dataset_id in (
                    select package_id from harvest_object
                    where harvest_source_id = '{harvest_source_id}'));""" \
        .format(harvest_source_id=harvest_source_id)
    result = model.Session.execute(sql)
    ids = []
    for row in result:
        ids.append(row[0])
    related_ids = "('" + "','".join(ids) + "')"

    # Flag the harvested packages for deletion, then remove all harvest
    # bookkeeping rows and the package rows (including revisions) in one
    # transaction
    sql = '''begin;
        update package set state = 'to_delete' where id in (
            select package_id from harvest_object
            where harvest_source_id = '{harvest_source_id}');
        delete from harvest_object_error where harvest_object_id in (
            select id from harvest_object
            where harvest_source_id = '{harvest_source_id}');
        delete from harvest_object_extra where harvest_object_id in (
            select id from harvest_object
            where harvest_source_id = '{harvest_source_id}');
        delete from harvest_object
            where harvest_source_id = '{harvest_source_id}';
        delete from harvest_gather_error where harvest_job_id in (
            select id from harvest_job
            where source_id = '{harvest_source_id}');
        delete from harvest_job where source_id = '{harvest_source_id}';
        delete from package_role where package_id in (
            select id from package where state = 'to_delete');
        delete from user_object_role
            where id not in (select user_object_role_id from package_role)
            and context = 'Package';
        delete from resource_revision where resource_group_id in (
            select id from resource_group where package_id in (
                select id from package where state = 'to_delete'));
        delete from resource_group_revision where package_id in (
            select id from package where state = 'to_delete');
        delete from package_tag_revision where package_id in (
            select id from package where state = 'to_delete');
        delete from member_revision where table_id in (
            select id from package where state = 'to_delete');
        delete from package_extra_revision where package_id in (
            select id from package where state = 'to_delete');
        delete from package_revision where id in (
            select id from package where state = 'to_delete');
        delete from package_tag where package_id in (
            select id from package where state = 'to_delete');
        delete from resource where resource_group_id in (
            select id from resource_group where package_id in (
                select id from package where state = 'to_delete'));
        delete from package_extra where package_id in (
            select id from package where state = 'to_delete');
        delete from member where table_id in (
            select id from package where state = 'to_delete');
        delete from resource_group where package_id in (
            select id from package where state = 'to_delete');
        delete from related_dataset where dataset_id in (
            select id from package where state = 'to_delete');
        delete from related where id in {related_ids};
        delete from package where id in (
            select id from package where state = 'to_delete');
        commit;''' \
        .format(harvest_source_id=harvest_source_id, related_ids=related_ids)
    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    get_action('harvest_source_reindex')(context, {'id': harvest_source_id})

    return {'id': harvest_source_id}
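# Usage sketch (illustrative context/user names): wipe a source's harvested
# datasets and job history while keeping the source itself registered
#
#   get_action('harvest_source_clear')(
#       {'model': model, 'session': model.Session, 'user': sysadmin_user},
#       {'id': 'my-source-id'})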