Beispiel #1
0
 def gather_stage(self, harvest_job):
     """Gather harvest objects through an OAI-PMH or a DDI harvester.

     The source URL is probed with an OAI-PMH ``Identify`` request. If the
     response raises ``XMLSyntaxError`` a ``DDIHarvester`` is selected; if
     no harvester has been selected after the probe an ``OAIPMHHarvester``
     is used. The actual gathering is delegated to that harvester, and
     every object it produced is tagged with a jsonpickle-encoded copy of
     the harvester under the ``'harv'`` key of its JSON content.

     :param harvest_job: HarvestJob object
     :returns: A list of HarvestObject ids, or None when the source could
               not be reached or the delegate harvester returned None
     """
     url = harvest_job.source.url
     # Test whether we should use OAI-PMH or DDI.
     metadata_registry = MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     client = oaipmh.client.Client(url, metadata_registry)
     try:
         client.identify()
     except XMLSyntaxError:
         self.harvester = DDIHarvester()
     except urllib2.URLError:
         self._save_gather_error('Could not identify source!', harvest_job)
         return None
     if not self.harvester:
         self.harvester = OAIPMHHarvester()
     objs = self.harvester.gather_stage(harvest_job)
     if objs is None:
         # A delegate gather_stage may return None instead of a list;
         # pass that on instead of failing with a TypeError in the loop.
         return None
     ret = []
     for obj_id in objs:
         obj = HarvestObject.get(obj_id)
         payload = json.loads(obj.content)
         # Store the harvester alongside the object's own content.
         payload['harv'] = jsonpickle.encode(self.harvester)
         obj.content = json.dumps(payload)
         obj.save()
         ret.append(obj.id)
     return ret
    def gather_stage(self, harvest_job):
        """
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch at a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        Any ``Exception`` raised while gathering is logged, stored as a gather
        error and turned into a ``None`` return value. ``KeyboardInterrupt``
        and ``SystemExit`` are deliberately not intercepted.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids, or None on failure
        """
        log.debug("in gather stage: %s" % harvest_job.source.url)
        harvest_obj_ids = []
        try:
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials)

            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except Exception:
            # Not a bare ``except:`` so that interpreter-exit signals
            # (KeyboardInterrupt, SystemExit) still propagate.
            log.exception("Gather stage failed %s" % harvest_job.source.url)
            self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job)
            return None
        return harvest_obj_ids
    def gather_stage(self, harvest_job):
        '''
        Create one HarvestObject per identifier offered by the source.

        A client is built for the job's source URL, an ``Identify`` request
        is issued as a connectivity check, and a HarvestObject (guid set to
        the header identifier, linked to ``harvest_job``) is saved for every
        header yielded by ``self._identifier_generator``.

        Failures are logged and stored through ``self._save_gather_error``;
        in that case nothing is returned to the caller but ``None``.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids, or None on failure
        '''
        log.info("in gather stage: %s" % harvest_job.source.url)

        created_ids = []
        try:
            metadata_registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(
                harvest_job.source.url,
                metadata_registry,
                self.credentials,
                force_http_get=self.force_http_get)
            # Issue Identify first so an unusable endpoint fails before
            # any HarvestObject is created.
            client.identify()
            for record_header in self._identifier_generator(client):
                new_obj = HarvestObject(guid=record_header.identifier(),
                                        job=harvest_job)
                new_obj.save()
                created_ids.append(new_obj.id)
                log.info("Harvest obj %s created" % new_obj.id)
        except urllib.error.HTTPError as http_err:
            # Include the response body, reason and headers in the log entry.
            details = (harvest_job.source.url, http_err.fp.read(),
                       http_err.reason, http_err.hdrs)
            log.exception('Gather stage failed on %s (%s): %s, %s' % details)
            gather_msg = 'Could not gather anything from %s' % harvest_job.source.url
            self._save_gather_error(gather_msg, harvest_job)
            return None
        except Exception as err:
            log_msg = 'Gather stage failed on %s: %s' % (
                harvest_job.source.url, str(err))
            log.exception(log_msg)
            # The stored gather error additionally carries the traceback.
            gather_msg = 'Could not gather anything from %s: %s / %s' % (
                harvest_job.source.url, str(err), traceback.format_exc())
            self._save_gather_error(gather_msg, harvest_job)
            return None

        summary = "Gather stage successfully finished with %s harvest objects" % len(created_ids)
        log.info(summary)
        return created_ids
 def _get_client_identifier(self, url, harvest_job=None):
     """Build an OAI-PMH client for ``url`` and try to identify the source.

     :param url: base URL handed to ``oaipmh.client.Client``
     :param harvest_job: optional HarvestJob; when given, URL and socket
                         failures are stored on it as gather errors
     :returns: ``(client, identifier)`` where ``identifier`` is the result
               of ``client.identify()``, or None if identification failed
     """
     registry = MetadataRegistry()
     registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
     client = oaipmh.client.Client(url, registry)
     try:
         identifier = client.identify()
     except (urllib2.URLError, urllib2.HTTPError):
         if harvest_job:
             self._save_gather_error(
                 'Could not gather from %s!' % harvest_job.source.url,
                 harvest_job)
         return client, None
     except socket.error:
         if harvest_job:
             # sys.exc_info()[:2] is (exception type, exception instance).
             exc_type, exc_value = sys.exc_info()[:2]
             self._save_gather_error(
                 'Socket error OAI-PMH %s, details:\n%s' % (exc_type, exc_value),
                 harvest_job)
         return client, None
     except ValueError:
         # We have no source URL when importing via UI.
         return client, None
     except Exception:
         # Guard against miscellaneous stuff. Probably plain bugs.
         # format_exc() takes an optional ``limit``, not the exception.
         log.debug(traceback.format_exc())
         return client, None
     return client, identifier
Beispiel #5
0
    def _get_client_identifier(self, url, harvest_job=None):
        """Build an OAI-PMH client for ``url`` and try to identify the source.

        ``kata_oai_dc_reader`` is registered for every prefix listed under
        ``self.config['metadata_formats']`` (when that key exists) and, in
        all cases, for ``self.metadata_prefix_value``.

        :param url: base URL handed to ``oaipmh.client.Client``
        :param harvest_job: optional HarvestJob; when given, URL and socket
                            failures are stored on it as gather errors
        :returns: ``(client, identifier)`` where ``identifier`` is the result
                  of ``client.identify()``, or None if identification failed
        """
        registry = MetadataRegistry()

        if 'metadata_formats' in self.config:
            for mdp in self.config['metadata_formats']:
                registry.registerReader(mdp, kata_oai_dc_reader)
            if self.metadata_prefix_value not in self.config['metadata_formats']:
                registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
        else:
            registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)

        client = oaipmh.client.Client(url, registry)
        try:
            identifier = client.identify()
            # Quick fix: updateGranularity has to be called to set the
            # correct datetime granularity on the client.
            client.updateGranularity()
        except (urllib2.URLError, urllib2.HTTPError) as err:
            log.debug("Error occurred: {0}".format(err))
            if harvest_job:
                self._save_gather_error('Could not gather from %s!' % harvest_job.source.url, harvest_job)
            return client, None
        except socket.error:
            if harvest_job:
                # sys.exc_info()[:2] is (exception type, exception instance).
                exc_type, exc_value = sys.exc_info()[:2]
                self._save_gather_error('Socket error OAI-PMH %s, details:\n%s' % (exc_type, exc_value), harvest_job)
            return client, None
        except ValueError:
            # We have no source URL when importing via UI.
            return client, None
        except Exception:
            # Guard against miscellaneous stuff. Probably plain bugs.
            # format_exc() takes an optional ``limit``, not the exception.
            log.debug(traceback.format_exc())
            return client, None
        return client, identifier
Beispiel #6
0
    def gather_stage(self, harvest_job):
        '''
        Create one HarvestObject per OAI-PMH set exposed by the source.

        The repository is identified first; its repository name is used as
        the ``domain`` stored with every object. The sets reported by
        ``listSets`` are collected, restricted to those whose name contains
        ``self.config['query']`` when that key is configured. If the
        repository raises ``NoSetHierarchyError`` a ``('1', 'Default')``
        entry is added and a gather error is recorded.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids, or None if the source could
                  not be identified
        '''
        self._set_config(harvest_job.source.config)
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identity = client.identify()
        except urllib2.URLError:
            message = 'Could not gather anything from %s!' % harvest_job.source.url
            self._save_gather_error(message, harvest_job)
            return None

        domain = identity.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)

        if 'query' in self.config:
            query = self.config['query']
        else:
            query = ''

        selected_sets = []
        try:
            for set_spec, set_title, _ in client.listSets():
                # Without a configured query every set is accepted.
                if 'query' not in self.config or query in set_title:
                    selected_sets.append((set_spec, set_title))
        except NoSetHierarchyError:
            selected_sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        created_ids = []
        for set_id, set_name in selected_sets:
            payload = {'set': set_id, 'set_name': set_name, 'domain': domain}
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps(payload)
            harvest_obj.save()
            created_ids.append(harvest_obj.id)
        model.repo.commit()
        return created_ids
Beispiel #7
0
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch at a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        Headers flagged as deleted are skipped when ``self.ignore_deleted``
        is truthy. Only ``urllib2.HTTPError`` is handled here; any other
        exception propagates to the caller.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids, or None after an HTTP error
        '''
        log.debug("in gather stage: %s" % harvest_job.source.url)
        harvest_obj_ids = []
        try:
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url,
                                          registry,
                                          self.credentials,
                                          force_http_get=self.force_http_get)

            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                # GAS 2016-12-28
                if header.isDeleted() and self.ignore_deleted:
                    continue
                harvest_obj = HarvestObject(guid=header.identifier(),
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except urllib2.HTTPError as e:
            # ``except X as e`` is accepted by Python 2.6+ and Python 3.
            log.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        # Hand the created ids back; without this the success path
        # implicitly returned None, same as the failure path.
        return harvest_obj_ids
def _oa_connect():
    """Return an OAI-PMH client for ``ARXIV_URL`` prepared for reading records.

    The client is identified, its granularity is updated, and the
    ``oai_dc`` reader is registered under ``METADATA_PREFIX``.
    """
    _log("Initializing connection")
    client = oaipmh.client.Client(ARXIV_URL)
    # The Identify response itself is not used.
    client.identify()

    # Granularity must be updated, otherwise requests fail with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Without a registered reader, reading records fails with:
    #   .../oaipmh/metadata.py", line 37, in readMetadata
    #   KeyError: 'oai_dc'
    registry = client.getMetadataRegistry()
    registry.registerReader(METADATA_PREFIX, oaipmh.metadata.oai_dc_reader)

    return client
def _oa_connect():
    """Open the OAI-PMH connection to ``ARXIV_URL`` and return the client.

    Steps performed: Identify request, granularity update, registration of
    the ``oai_dc`` reader for ``METADATA_PREFIX``.
    """
    _log("Initializing connection")
    oai_client = oaipmh.client.Client(ARXIV_URL)
    oai_client.identify()  # response is not needed afterwards

    # Skipping the granularity update makes later requests fail with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    oai_client.updateGranularity()

    # A reader for the metadata prefix has to be registered, or reading
    # records fails with:
    #   .../oaipmh/metadata.py", line 37, in readMetadata
    #   KeyError: 'oai_dc'
    metadata_registry = oai_client.getMetadataRegistry()
    dc_reader = oaipmh.metadata.oai_dc_reader
    metadata_registry.registerReader(METADATA_PREFIX, dc_reader)

    return oai_client
Beispiel #10
0
def main():
    """Fetch records month by month and write each batch to a pickle file.

    Connects to the repository at ``url``, stores the client in the
    module-level ``client`` global, then walks the date ranges produced by
    ``loop_months(from_date, until_date, delta_months)``. Each range is
    fetched with ``get_records`` and written with ``write_records`` to a
    file under ``export_dir``. A range whose fetch raises an ``Exception``
    is reported and skipped.
    """
    global client

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('****** Starting Script ******')

    client = oaipmh.client.Client(url)
    out = client.identify()

    print('****** Connected to repository: %s ******' % out.repositoryName())

    # Granularity must be updated, otherwise requests fail with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Register a reader on the client to handle oai_dc metadata;
    # without it, reading records fails with:
    #   .../oaipmh/metadata.py", line 37, in readMetadata
    #   KeyError: 'oai_dc'
    client.getMetadataRegistry().registerReader(
        metadataPrefix,
        oaipmh.metadata.oai_dc_reader
        )

    start = time.time()
    for (c_date, n_date) in loop_months(from_date, until_date, delta_months):
        # get records
        try:
            records = list(get_records(c_date, n_date))
        except Exception:
            # Not a bare ``except:`` so Ctrl-C (KeyboardInterrupt) and
            # SystemExit still stop the loop instead of being skipped.
            print("failed recieving records!")
            continue

        filename = export_dir + 'arixv_meta_%s_%s.pkl' % \
            (c_date.strftime('%Y-%m-%d'), n_date.strftime('%Y-%m-%d'))

        write_records(records, filename)

    print('Total Time spent: %d seconds' % (time.time() - start))