def gather_stage(self, harvest_job):
    """Create three fixed harvest objects for 'basic_test' sources.

    When the URL of the job's source starts with ``'basic_test'``, three
    ``HarvestObject`` instances are created for the job (guids ``test1``,
    ``test2`` and ``test_to_delete``); the first one also receives a single
    ``key``/``value`` extra.

    :param harvest_job: the job being gathered; only ``source.url`` is read
        and the job itself is attached to every created object.
    :returns: the ids of the three created objects, in creation order, or
        an empty list when the source URL does not match.
    """
    if not harvest_job.source.url.startswith('basic_test'):
        return []

    first = HarvestObject(guid='test1', job=harvest_job)
    first.extras.append(HarvestObjectExtra(key='key', value='value'))
    second = HarvestObject(guid='test2', job=harvest_job)
    to_delete = HarvestObject(guid='test_to_delete', job=harvest_job)

    # Only the last object is save()d; per the original author's note the
    # save is expected to commit the previously add()ed objects as well.
    first.add()
    second.add()
    to_delete.save()

    return [first.id, second.id, to_delete.id]
def gather_stage(self, harvest_job):
    """Gather step of a test harvester: emit three canned harvest objects.

    Nothing is created unless the job's source URL begins with
    ``'basic_test'``. In that case objects with guids ``test1``, ``test2``
    and ``test_to_delete`` are built for the job, and ``test1`` gets one
    extra (``key`` -> ``value``).

    :param harvest_job: job whose ``source.url`` selects the behaviour.
    :returns: list with the ids of the three objects (same order as the
        guids above), or ``[]`` for any other source URL.
    """
    source_url = harvest_job.source.url
    if not source_url.startswith('basic_test'):
        return []

    obj_one = HarvestObject(guid='test1', job=harvest_job)
    obj_one.extras.append(HarvestObjectExtra(key='key', value='value'))
    obj_two = HarvestObject(guid='test2', job=harvest_job)
    obj_three = HarvestObject(guid='test_to_delete', job=harvest_job)

    # The first two objects are only add()ed; the original comment states
    # that the final save() commits them together with the third one.
    for pending in (obj_one, obj_two):
        pending.add()
    obj_three.save()

    return [created.id for created in (obj_one, obj_two, obj_three)]
def gather_stage(self, harvest_job):
    """Gather the single document a source URL points at.

    The content behind ``harvest_job.source.url`` is fetched and one
    ``HarvestObject`` is created for it. If a current harvest object
    already exists for the source, its guid and package id are reused and
    the new object is flagged with status ``'change'``; otherwise the guid
    is the MD5 hex digest of the URL and the status is ``'new'``.

    Content recognised by ``guess_standard`` as ``'iso'`` is stored on the
    object itself; anything else is stored in the ``original_document``
    extra, with the detected format in ``original_format``.

    :param harvest_job: the job being gathered; its ``source`` supplies the
        URL, the config and the source id.
    :returns: a one-element list holding the new object's id, or ``None``
        when the content could not be fetched (a gather error is recorded
        through ``_save_gather_error`` in that case).
    """
    logger = logging.getLogger(__name__ + '.individual.gather')
    logger.debug('DocHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Source URL and per-source configuration
    url = harvest_job.source.url
    self._set_source_config(harvest_job.source.config)

    # Fetch the remote document; any failure is reported as a gather error
    try:
        content = self._get_content_as_unicode(url)
    except Exception as err:
        message = 'Unable to get content for URL: %s: %r' % (url, err)
        self._save_gather_error(message, harvest_job)
        return None

    # First current harvest object of this source, if there is one.
    # `== True` is intentional: it builds a SQL expression, not a Python
    # boolean comparison.
    previous = (
        model.Session.query(HarvestObject.guid, HarvestObject.package_id)
        .filter(HarvestObject.current == True)  # noqa: E712
        .filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        .first()
    )

    if previous:
        status = 'change'
        identity = {
            'guid': previous.guid,
            'package_id': previous.package_id,
        }
    else:
        status = 'new'
        identity = {
            'guid': hashlib.md5(url.encode('utf8', 'ignore')).hexdigest(),
        }

    harvest_object = HarvestObject(
        job=harvest_job,
        extras=[
            HOExtra(key='doc_location', value=url),
            HOExtra(key='status', value=status),
        ],
        **identity
    )
    harvest_object.add()

    # ISO documents live in the object's content; other formats are kept
    # in extras together with the format name that was detected.
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
    else:
        fallback_extras = (
            ('original_document', content),
            ('original_format', document_format),
        )
        for extra_key, extra_value in fallback_extras:
            HOExtra(object=harvest_object,
                    key=extra_key,
                    value=extra_value).save()

    harvest_object.save()

    return [harvest_object.id]