def harvest_job_exists(value, context):
    """Check if a harvest job exists and return the model if it does"""
    result = HarvestJob.get(value, None)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.' % str(value))
    return result
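# Illustrative only: a validator like this is normally wired into a CKAN-style
# validation schema rather than called directly. A minimal sketch, assuming a
# hypothetical schema and field name (not taken from this module):
#
#     from ckan.lib.navl.validators import not_empty
#
#     hypothetical_job_schema = {
#         'job_id': [not_empty, unicode, harvest_job_exists],
#     }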
def harvest_job_show(context, data_dict):
    check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
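# Illustrative only: as a CKAN action function, harvest_job_show would normally
# be reached through the action API rather than imported directly. A minimal
# sketch, assuming an already populated context and an existing job id
# (`some_job_id` is a placeholder):
#
#     from ckan.plugins import toolkit
#
#     job_dict = toolkit.get_action('harvest_job_show')(
#         {'model': model, 'session': model.Session, 'user': user},
#         {'id': some_job_id})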
def gather_callback(channel, method, header, body):
    text_file = open("/var/local/ckan/default/pyenv/src/ckanext-harvestodm/ckanext/harvestodm/Gather_log.txt", "a")
    db = client.odm
    db1 = db.jobs
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
        text_file.write('\n')
        text_file.write(str(datetime.datetime.now()))
        text_file.write(' Received harvest job id: %s' % id)
        text_file.write('\n')
    except KeyError:
        log.error('No harvest job id received')
        text_file.write('No harvest job id received')
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)
    # Bail out before touching the job's attributes if it does not exist
    if not job:
        log.error('Harvest job does not exist: %s' % id)
        text_file.write('Harvest job does not exist: %s' % id)
        text_file.write('\n')
        channel.basic_ack(method.delivery_tag)
        return False

    job_source_id = job.source_id
    from ckanext.harvestodm.model import HarvestSource
    harvest_source_info = HarvestSource.get(job_source_id)
    cat_url = harvest_source_info.url
    text_file.write('catalogue url: ' + str(cat_url))
    text_file.write('\n')
    print("====>" + str(cat_url))

    # Send the harvest job to the plugins that implement the Harvester
    # interface, only if the source type matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                # Call the gather stage for the job
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id)
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                # Record the failure in the MongoDB jobs collection
                document = db1.find_one({"base_url": cat_url})
                if 'gathered' not in document.keys():
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": 0})
                    db1.save(document)
                else:
                    temp_gathered = document['gathered']
                    document.update({"gathered": "Gather stage failed"})
                    document.update({"last_gathered": temp_gathered})
                    db1.save(document)
                text_file.write('Gather stage failed')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                text_file.write('No harvest objects to fetch')
                text_file.write('\n')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

            try:
                for id in harvest_object_ids:
                    # Send the id to the fetch queue
                    publisher.send({'harvest_object_id': id})
            except Exception:
                # The publisher connection may have dropped; get a fresh one and retry
                publisher = get_fetch_publisher()
                for id in harvest_object_ids:
                    publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

            # Record the number of gathered objects in the MongoDB jobs collection
            document = db1.find_one({"base_url": cat_url})
            if 'gathered' not in document.keys():
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": 0})
                db1.save(document)
            else:
                temp_gathered = document['gathered']
                document.update({"gathered": len(harvest_object_ids)})
                document.update({"last_gathered": temp_gathered})
                db1.save(document)
            text_file.write('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))
            text_file.write('\n')

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
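# Illustrative only: gather_callback has the (channel, method, header, body)
# signature expected by a RabbitMQ consumer. A minimal wiring sketch, assuming
# the pre-1.0 pika API; the helper name and queue name below are assumptions,
# not taken from this module:
#
#     channel = get_gather_channel()   # hypothetical helper returning a pika channel
#     channel.basic_consume(gather_callback, queue='ckan.harvest.gather')
#     channel.start_consuming()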
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    # Create a harmonisation job for harvest jobs that have
                    # finished the fetch stage
                    job_source_id = job['source_id']
                    # Get the harvest source info
                    harvest_source_info = HarvestSource.get(job_source_id)
                    cat_url = harvest_source_info.url
                    title = harvest_source_info.title
                    db = client.odm
                    collection = db.jobs
                    document = collection.find_one({"title": title})
                    # If a job document exists, create a harmonisation job
                    if document is not None:
                        harmonisation_job = {}
                        harmonisation_job['id'] = document['id']
                        harmonisation_job['cat_url'] = document['cat_url']
                        try:
                            harmonised = document['harmonised']
                        except KeyError:
                            harmonised = "not yet"
                        harmonisation_job['harmonised'] = harmonised
                        harmonisation_job['status'] = "pending"
                        harmonisation_job['dates'] = "dates_selected"
                        harmonisation_job['countries'] = "countries_selected"
                        harmonisation_job['catalogue_selection'] = title
                        harmonisation_job['languages'] = "languages_selected"
                        harmonisation_job['resources'] = "resources_selected"
                        harmonisation_job['licenses'] = "licenses_selected"
                        harmonisation_job['categories'] = "categories_selected"
                        harmonisation_job['save'] = "go-harmonisation-complete"
                        # Save the harmonisation job to the db
                        collection_harmonise_jobs = db.harmonise_jobs
                        collection_harmonise_jobs.save(harmonisation_job)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    # Use != None so the comparison is rendered as SQL IS NOT NULL
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        try:
                            job_obj.finished = last_object.import_finished
                        except Exception:
                            pass
                    try:
                        job_obj.save()
                    except Exception:
                        pass

                    # Reindex the harvest source dataset so it has the latest status
                    try:
                        get_action('harvest_source_reindex')(
                            context, {'id': job_obj.source.id})
                    except Exception:
                        pass

    # Resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_job_report(context, data_dict):
    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
        .join(harvest_model.HarvestJob) \
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id) \
        .order_by(harvest_model.HarvestGatherError.created.desc())
    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError,
                            harvest_model.HarvestObject.guid) \
        .join(harvest_model.HarvestObject) \
        .filter(harvest_model.HarvestObject.harvest_job_id == job.id) \
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        if error.harvest_object_id not in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][
                        error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return report