def gather_stage(self, harvest_job):
    url = harvest_job.source.url

    # Test whether we should use OAI-PMH or DDI: if identify() chokes on the
    # response XML, fall back to the DDI harvester.
    metadata_registry = MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(url, metadata_registry)
    try:
        client.identify()
    except XMLSyntaxError:
        self.harvester = DDIHarvester()
    except urllib2.URLError:
        self._save_gather_error('Could not identify source!', harvest_job)
        return None

    if not self.harvester:
        self.harvester = OAIPMHHarvester()

    # Delegate gathering to the chosen harvester and stash a pickled copy of
    # it in each harvest object so later stages can reuse the same harvester.
    objs = self.harvester.gather_stage(harvest_job)
    ret = []
    for obj_id in objs:
        obj = HarvestObject.get(obj_id)
        content = json.loads(obj.content)
        content['harv'] = jsonpickle.encode(self.harvester)
        obj.content = json.dumps(content)
        obj.save()
        ret.append(obj.id)
    return ret
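# Hedged sketch of how a later stage could restore the harvester that the
# wrapper above serialised into each HarvestObject. The 'harv' key comes from
# the code above; the helper name and everything else is illustrative only.
import json
import jsonpickle

def _restore_harvester(harvest_object):
    content = json.loads(harvest_object.content)
    return jsonpickle.decode(content['harv'])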
def gather_stage(self, harvest_job): """ The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids """ log.debug("in gather stage: %s" % harvest_job.source.url) try: harvest_obj_ids = [] registry = self._create_metadata_registry() self._set_config(harvest_job.source.config) client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials) client.identify() # check if identify works for header in self._identifier_generator(client): harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job) harvest_obj.save() harvest_obj_ids.append(harvest_obj.id) except: log.exception("Gather stage failed %s" % harvest_job.source.url) self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job) return None return harvest_obj_ids
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.info("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        registry = self._create_metadata_registry()
        self._set_config(harvest_job.source.config)
        client = oaipmh.client.Client(harvest_job.source.url, registry,
                                      self.credentials,
                                      force_http_get=self.force_http_get)

        # Start looking from here
        client.identify()  # check if identify works
        for header in self._identifier_generator(client):
            harvest_obj = HarvestObject(guid=header.identifier(),
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
            log.info("Harvest obj %s created" % harvest_obj.id)
            # return harvest_obj_ids  # uncomment to gather only one record
    except urllib.error.HTTPError as e:
        log.exception('Gather stage failed on %s (%s): %s, %s' % (
            harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    except Exception as e:
        log.exception('Gather stage failed on %s: %s' % (
            harvest_job.source.url, str(e)))
        self._save_gather_error(
            'Could not gather anything from %s: %s / %s' % (
                harvest_job.source.url, str(e), traceback.format_exc()),
            harvest_job)
        return None

    log.info("Gather stage successfully finished with %s harvest objects" %
             len(harvest_obj_ids))
    return harvest_obj_ids
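# A hedged sketch of the _identifier_generator() helper the gather stages
# above iterate over. The method name appears in the snippets, but the
# self.set_spec / self.metadata_prefix_value attributes and the fallback to a
# plain listIdentifiers() call are assumptions.
def _identifier_generator(self, client):
    """Yield OAI-PMH record headers, optionally limited to a single set."""
    if getattr(self, 'set_spec', None):
        for header in client.listIdentifiers(
                metadataPrefix=self.metadata_prefix_value,
                set=self.set_spec):
            yield header
    else:
        for header in client.listIdentifiers(
                metadataPrefix=self.metadata_prefix_value):
            yield header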
def _get_client_identifier(self, url, harvest_job=None):
    registry = MetadataRegistry()
    registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
    client = oaipmh.client.Client(url, registry)
    try:
        identifier = client.identify()
    except (urllib2.URLError, urllib2.HTTPError):
        if harvest_job:
            self._save_gather_error(
                'Could not gather from %s!' % harvest_job.source.url,
                harvest_job)
        return client, None
    except socket.error:
        if harvest_job:
            errno, errstr = sys.exc_info()[:2]
            self._save_gather_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr),
                harvest_job)
        return client, None
    except ValueError:
        # We have no source URL when importing via UI.
        return client, None
    except Exception:
        # Guard against miscellaneous stuff. Probably plain bugs.
        log.debug(traceback.format_exc())
        return client, None
    return client, identifier
def _get_client_identifier(self, url, harvest_job=None):
    registry = MetadataRegistry()
    if 'metadata_formats' in self.config:
        for mdp in self.config['metadata_formats']:
            registry.registerReader(mdp, kata_oai_dc_reader)
        if self.metadata_prefix_value not in self.config['metadata_formats']:
            registry.registerReader(self.metadata_prefix_value,
                                    kata_oai_dc_reader)
    else:
        registry.registerReader(self.metadata_prefix_value,
                                kata_oai_dc_reader)

    client = oaipmh.client.Client(url, registry)
    try:
        identifier = client.identify()
        # Quick fix: updateGranularity() has to be called so that the client
        # uses the correct datetime granularity.
        client.updateGranularity()
    except (urllib2.URLError, urllib2.HTTPError) as err:
        log.debug("Error occurred: {0}".format(err))
        if harvest_job:
            self._save_gather_error(
                'Could not gather from %s!' % harvest_job.source.url,
                harvest_job)
        return client, None
    except socket.error:
        if harvest_job:
            errno, errstr = sys.exc_info()[:2]
            self._save_gather_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr),
                harvest_job)
        return client, None
    except ValueError:
        # We have no source URL when importing via UI.
        return client, None
    except Exception:
        # Guard against miscellaneous stuff. Probably plain bugs.
        log.debug(traceback.format_exc())
        return client, None
    return client, identifier
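# Standalone sketch of the pyoai calls that _get_client_identifier() wraps;
# the repository URL below is a placeholder and 'oai_dc' is assumed as the
# metadata prefix.
import oaipmh.client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = oaipmh.client.Client('https://example.org/oai', registry)
identify = client.identify()   # raises on network or XML errors
client.updateGranularity()     # adopt the repository's datetime granularity
print(identify.repositoryName())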
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error('Could not gather anything from %s!' %
                                harvest_job.source.url, harvest_job)
        return None

    # Use the repository name as the group for the harvested datasets.
    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)

    # Collect the sets to harvest, optionally filtered by a configured query.
    query = self.config['query'] if 'query' in self.config else ''
    try:
        for set_tuple in client.listSets():
            set_id, set_name, _ = set_tuple
            if 'query' in self.config:
                if query in set_name:
                    sets.append((set_id, set_name))
            else:
                sets.append((set_id, set_name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)

    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({'set': set_id,
                                          'set_name': set_name,
                                          'domain': domain})
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    model.repo.commit()
    return harvest_objs
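# Small standalone sketch of the client.listSets() call the gather stage
# above iterates over; pyoai yields (setSpec, setName, setDescription)
# tuples. The repository URL below is a placeholder.
import oaipmh.client
from oaipmh.error import NoSetHierarchyError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = oaipmh.client.Client('https://example.org/oai', registry)
try:
    for set_spec, set_name, _ in client.listSets():
        print('%s: %s' % (set_spec, set_name))
except NoSetHierarchyError:
    print('Repository does not expose a set hierarchy')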
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.debug("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        registry = self._create_metadata_registry()
        self._set_config(harvest_job.source.config)
        client = oaipmh.client.Client(harvest_job.source.url, registry,
                                      self.credentials,
                                      force_http_get=self.force_http_get)

        client.identify()  # check if identify works
        for header in self._identifier_generator(client):
            # GAS 2016-12-28: skip deleted records when configured to do so
            if header.isDeleted() and self.ignore_deleted:
                continue
            harvest_obj = HarvestObject(guid=header.identifier(),
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
    except urllib2.HTTPError as e:
        log.exception('Gather stage failed on %s (%s): %s, %s' % (
            harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    return harvest_obj_ids
def _oa_connect(): _log("Initializing connection") client = oaipmh.client.Client(ARXIV_URL) out = client.identify() # got to update granularity or we barf with: # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z client.updateGranularity() # register a reader on our client to handle oai_dc metadata # if we do not attempt to read records will fail with: # .../oaipmh/metadata.py", line 37, in readMetadata # KeyError: 'oai_dc' client.getMetadataRegistry().registerReader(METADATA_PREFIX, oaipmh.metadata.oai_dc_reader) return client
def _oa_connect(): _log("Initializing connection") client = oaipmh.client.Client(ARXIV_URL) out = client.identify() # got to update granularity or we barf with: # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z client.updateGranularity() # register a reader on our client to handle oai_dc metadata # if we do not attempt to read records will fail with: # .../oaipmh/metadata.py", line 37, in readMetadata # KeyError: 'oai_dc' client.getMetadataRegistry().registerReader( METADATA_PREFIX, oaipmh.metadata.oai_dc_reader ) return client
def main():
    global client
    print '****** Starting Script ******'

    client = oaipmh.client.Client(url)
    out = client.identify()
    print '****** Connected to repository: %s ******' % out.repositoryName()

    # Got to update granularity or we barf with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Check if our data type is supported
    # check_formats(client, metadataPrefix)

    # Register a reader on our client to handle oai_dc metadata.
    # If we do not, attempts to read records will fail with:
    #   .../oaipmh/metadata.py", line 37, in readMetadata
    #   KeyError: 'oai_dc'
    client.getMetadataRegistry().registerReader(
        metadataPrefix, oaipmh.metadata.oai_dc_reader
    )

    start = time.time()
    for (c_date, n_date) in loop_months(from_date, until_date, delta_months):
        # Get the records for the current month window
        try:
            records = list(get_records(c_date, n_date))
        except Exception:
            print "failed receiving records!"
            continue
        # print_records(records, max_recs=2)
        filename = export_dir + 'arixv_meta_%s_%s.pkl' % \
            (c_date.strftime('%Y-%m-%d'), n_date.strftime('%Y-%m-%d'))
        write_records(records, filename)

    print 'Total Time spent: %d seconds' % (time.time() - start)
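# A plausible sketch of the loop_months() helper iterated over in main();
# the real implementation is not shown here, so stepping by calendar months
# with dateutil is an assumption.
from dateutil.relativedelta import relativedelta

def loop_months(from_date, until_date, delta_months):
    """Yield consecutive (start, end) date pairs, delta_months apart."""
    current = from_date
    while current < until_date:
        nxt = min(current + relativedelta(months=delta_months), until_date)
        yield current, nxt
        current = nxt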