Ejemplo n.º 1
0
    def test_fetch(self):
        '''
        Fetch the test record from the fixture dataset in ``oai_dc`` format
        and check that the client returned something truthy.
        '''
        metadata_registry = importformats.create_metadata_registry()
        oai_client = oaipmh.client.Client(
            _get_fixture(FIXTURE_DATASET),
            metadata_registry,
        )
        fetched = oai_client.getRecord(
            identifier=self.TEST_ID,
            metadataPrefix='oai_dc',
        )

        assert fetched
Ejemplo n.º 2
0
    def test_fetch(self):
        '''
        Request ``self.TEST_ID`` from the fixture dataset using the
        ``oai_dc`` metadata prefix and assert that a record came back.
        '''
        md_registry = importformats.create_metadata_registry()
        fixture_client = oaipmh.client.Client(
            _get_fixture(FIXTURE_DATASET), md_registry)

        request = {
            'identifier': self.TEST_ID,
            'metadataPrefix': 'oai_dc',
        }
        result = fixture_client.getRecord(**request)

        assert result
Ejemplo n.º 3
0
    def fetch_stage(self, harvest_object):
        '''
        Fetch one OAI-PMH record and store it as JSON on the harvest object.

        A GetRecord request is issued for ``harvest_object.guid`` with
        ``self.md_format`` as metadata prefix. If the returned header is
        flagged as deleted the call is delegated to ``self.on_deleted``.
        Otherwise the metadata map is serialised with ``json.dumps``,
        assigned to ``harvest_object.content`` and the object is saved.

        Failures while fetching or serialising are logged with their
        traceback and recorded through ``self._save_object_error``.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found;
                  for deleted records, whatever ``self.on_deleted`` returns
        '''
        log.debug("fetch: %s", harvest_object.guid)

        # Get metadata content from provider
        try:
            # Create a OAI-PMH Client
            config = self._get_configuration(harvest_object)

            registry = self.metadata_registry(config, harvest_object)
            client = oaipmh.client.Client(harvest_object.job.source.url,
                                          registry)

            # Fetch the record itself
            header, metadata, _about = client.getRecord(
                identifier=harvest_object.guid, metadataPrefix=self.md_format)
        except Exception as e:
            # Keep the traceback in the harvester log rather than on stderr.
            log.exception('Unable to get metadata from provider')
            self._save_object_error(
                'Unable to get metadata from provider: {u}: {e}'.format(
                    u=harvest_object.source.url, e=e), harvest_object)
            return False

        if header and header.isDeleted():
            return self.on_deleted(harvest_object, header)

        # Get contents
        try:
            content = json.dumps(metadata.getMap())
        except Exception as e:
            log.exception('Unable to get content for package')
            self._save_object_error(
                'Unable to get content for package: {u}: {e}'.format(
                    u=harvest_object.source.url, e=e), harvest_object)
            return False

        # Save the fetched contents in the HarvestObject
        harvest_object.content = content
        harvest_object.save()

        return True
Ejemplo n.º 4
0
    def _fetch_import_record(self, harvest_object, master_data, client, group):
        '''
        Fetch a single record and hand it to ``oai_dc2ckan`` for import.

        The record named by ``master_data['record']`` is requested with
        ``self.metadata_prefix_value``. On success ``master_data['record']``
        is replaced by an ``(identifier, metadata map)`` tuple and the
        gathered data is passed to ``oai_dc2ckan``.

        :param harvest_object: object the fetch errors are attached to
        :param master_data: dict whose ``'record'`` entry holds the record
            identifier; updated in place on success
        :param client: object providing ``getRecord``
        :param group: passed through unchanged to ``oai_dc2ckan``
        :returns: False if the record could not be fetched, has no metadata
            or has no date; otherwise the result of ``oai_dc2ckan``
        '''
        record_id = master_data['record']

        # The fetch part.
        try:
            header, metadata, _ = client.getRecord(
                metadataPrefix=self.metadata_prefix_value,
                identifier=record_id)
        except XMLSyntaxError:
            log.error('oai_dc XML syntax error: %s', record_id)
            self._save_object_error(
                'Syntax error.',
                harvest_object, stage='Fetch')
            return False
        except socket.error:
            # exc_info() yields the exception class and instance.
            exc_type, exc_value = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s' % (exc_type, exc_value),
                harvest_object, stage='Fetch')
            return False
        except urllib2.URLError:
            self._save_object_error(
                'Failed to fetch record.',
                harvest_object, stage='Fetch')
            return False
        except httplib.BadStatusLine:
            self._save_object_error(
                'Bad HTTP response status line.',
                harvest_object, stage='Fetch')
            return False

        if not metadata:
            # Assume that there is no metadata and not an error.
            # Should this be a cause for retry?
            log.warning('No metadata: %s', record_id)
            return False

        metadata_map = metadata.getMap()
        if not metadata_map.get('date'):
            self._save_object_error(
                'Missing date: %s' % record_id,
                harvest_object, stage='Fetch')
            return False

        identifier = header.identifier()
        master_data['record'] = (identifier, metadata_map)

        # Do not save to database (because we can't json nor pickle _Element).
        # The import stage.
        # Gather all relevant information into a dictionary.
        # A missing 'source' entry is treated like an empty one.
        sources = metadata_map.get('source')
        data = {
            'identifier': identifier,
            'metadata': self._metadata(metadata_map),
            'package_name': self._package_name_from_identifier(identifier),
            'package_url': sources[0] if sources else '',
        }

        return oai_dc2ckan(data, oai_dc_reader._namespaces, group, harvest_object)
Ejemplo n.º 5
0
    def fetch_stage(self, harvest_object):
        '''
        The fetch stage receives a HarvestObject and is responsible for:
        - getting the record from the remote OAI-PMH server (GetRecord for
          ``harvest_object.guid`` with prefix ``self.md_format``).
        - delegating to ``self.on_deleted`` when the header is flagged as
          deleted.
        - saving the JSON-serialised metadata map in the HarvestObject.
        - logging and storing a HarvestObjectError (through
          ``self._save_object_error``) when fetching or serialising fails.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found;
                  for deleted records, whatever ``self.on_deleted`` returns
        '''
        log.debug("fetch: %s", harvest_object.guid)

        # Get metadata content from provider
        try:
            # Create a OAI-PMH Client
            config = self._get_configuration(harvest_object)

            registry = self.metadata_registry(config, harvest_object)
            client = oaipmh.client.Client(harvest_object.job.source.url, registry)

            # Fetch the record itself
            header, metadata, _about = client.getRecord(
                identifier=harvest_object.guid,
                metadataPrefix=self.md_format)
        except Exception as e:
            # Route the traceback through the module logger instead of stderr.
            log.exception('Unable to get metadata from provider')
            self._save_object_error('Unable to get metadata from provider: {u}: {e}'.format(
                u=harvest_object.source.url, e=e), harvest_object)
            return False

        if header and header.isDeleted():
            return self.on_deleted(harvest_object, header)

        # Get contents
        try:
            content = json.dumps(metadata.getMap())
        except Exception as e:
            log.exception('Unable to get content for package')
            self._save_object_error('Unable to get content for package: {u}: {e}'.format(
                u=harvest_object.source.url, e=e), harvest_object)
            return False

        # Save the fetched contents in the HarvestObject
        harvest_object.content = content
        harvest_object.save()

        return True
Ejemplo n.º 6
0
 def getrecord():
     # Request the record identified by self.TEST_ID in 'oai_dc' format and
     # discard the result. `client` and `self` are not defined here, so they
     # are presumably closed over from the enclosing scope.
     client.getRecord(identifier=self.TEST_ID, metadataPrefix='oai_dc')
Ejemplo n.º 7
0
    def fetch_stage(self, harvest_object):
        '''
        The fetch stage will receive a HarvestObject object and will be
        responsible for:
            - getting the contents of the remote object (a GetRecord request
              for ``harvest_object.guid`` with prefix ``self.md_format``).
            - saving the content in the provided HarvestObject: the metadata
              map as JSON, extended with ``set_spec`` and, when the header
              datestamp can be read, ``metadata_modified``.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        log.debug("in fetch stage: %s", harvest_object.guid)
        try:
            self._set_config(harvest_object.job.source.config)
            registry = self._create_metadata_registry()
            client = oaipmh.client.Client(harvest_object.job.source.url,
                                          registry,
                                          self.credentials,
                                          force_http_get=self.force_http_get)
            try:
                log.debug("Load %s with metadata prefix '%s'",
                          harvest_object.guid, self.md_format)

                self._before_record_fetch(harvest_object)
                record = client.getRecord(identifier=harvest_object.guid,
                                          metadataPrefix=self.md_format)
                self._after_record_fetch(record)
                log.debug('record found!')
            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                log.exception('getRecord failed for %s', harvest_object.guid)
                self._save_object_error(
                    'Get record failed for %s!' % harvest_object.guid,
                    harvest_object)
                return False

            header, metadata, _ = record
            log.debug('metadata %s', metadata)
            log.debug('header %s', header)

            # The datestamp is optional: any failure to read it just means
            # no 'metadata_modified' entry is written.
            try:
                metadata_modified = header.datestamp().isoformat()
            except Exception:
                metadata_modified = None

            try:
                content_dict = metadata.getMap()
                content_dict['set_spec'] = header.setSpec()
                if metadata_modified:
                    content_dict['metadata_modified'] = metadata_modified
                log.debug(content_dict)
                content = json.dumps(content_dict)
            except Exception:
                log.exception('Dumping the metadata failed!')
                self._save_object_error('Dumping the metadata failed!',
                                        harvest_object)
                return False

            harvest_object.content = content
            harvest_object.save()
        except Exception as e:
            log.exception(e)
            self._save_object_error(
                ('Exception in fetch stage for %s: %r / %s' %
                 (harvest_object.guid, e, traceback.format_exc())),
                harvest_object)
            return False

        return True
Ejemplo n.º 8
0
def test_fetch(url, record_id, fmt):
    '''
    Fetch one record from an OAI-PMH endpoint.

    :param url: passed to ``oaipmh.client.Client`` as its first argument
    :param record_id: identifier of the record to request
    :param fmt: metadata prefix to request the record in
    :returns: whatever ``client.getRecord`` returns
    '''
    metadata_registry = importformats.create_metadata_registry()
    oai_client = oaipmh.client.Client(url, metadata_registry)
    return oai_client.getRecord(identifier=record_id, metadataPrefix=fmt)
Ejemplo n.º 9
0
 def getrecord():
     # Issue a GetRecord call for self.TEST_ID with the 'oai_dc' prefix; the
     # return value is not used. Neither `client` nor `self` is defined in
     # this function, so both presumably come from the enclosing scope.
     client.getRecord(identifier=self.TEST_ID, metadataPrefix='oai_dc')
Ejemplo n.º 10
0
    def fetch_stage(self, harvest_object):
        '''
        The fetch stage will receive a HarvestObject object and will be
        responsible for:
            - getting the contents of the remote object (a GetRecord request
              for ``harvest_object.guid`` with prefix ``self.md_format``).
            - saving the content in the provided HarvestObject: the metadata
              map as JSON, extended with ``set_spec`` and, when the header
              datestamp can be read, ``metadata_modified``.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        log.debug("in fetch stage: %s", harvest_object.guid)
        try:
            self._set_config(harvest_object.job.source.config)
            registry = self._create_metadata_registry()
            client = oaipmh.client.Client(
                harvest_object.job.source.url,
                registry,
                self.credentials,
                force_http_get=self.force_http_get
            )
            try:
                log.debug(
                    "Load %s with metadata prefix '%s'",
                    harvest_object.guid, self.md_format
                )

                self._before_record_fetch(harvest_object)
                record = client.getRecord(
                    identifier=harvest_object.guid,
                    metadataPrefix=self.md_format
                )
                self._after_record_fetch(record)
                log.debug('record found!')
            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                log.exception('getRecord failed for %s', harvest_object.guid)
                self._save_object_error(
                    'Get record failed for %s!' % harvest_object.guid,
                    harvest_object
                )
                return False

            header, metadata, _ = record
            log.debug('metadata %s', metadata)
            log.debug('header %s', header)

            # The datestamp is optional: any failure to read it just means
            # no 'metadata_modified' entry is written.
            try:
                metadata_modified = header.datestamp().isoformat()
            except Exception:
                metadata_modified = None

            try:
                content_dict = metadata.getMap()
                content_dict['set_spec'] = header.setSpec()
                if metadata_modified:
                    content_dict['metadata_modified'] = metadata_modified
                log.debug(content_dict)
                content = json.dumps(content_dict)
            except Exception:
                log.exception('Dumping the metadata failed!')
                self._save_object_error(
                    'Dumping the metadata failed!',
                    harvest_object
                )
                return False

            harvest_object.content = content
            harvest_object.save()
        except Exception as e:
            log.exception(e)
            self._save_object_error(
                (
                    'Exception in fetch stage for %s: %r / %s'
                    % (harvest_object.guid, e, traceback.format_exc())
                ),
                harvest_object
            )
            return False

        # Report success explicitly; without this the method returns None,
        # which is falsy, on the success path.
        return True
Ejemplo n.º 11
0
def test_fetch(url, record_id, fmt):
    '''
    Request a single record and return the client's result unchanged.

    :param url: passed to ``oaipmh.client.Client`` as its first argument
    :param record_id: identifier of the record to request
    :param fmt: metadata prefix to request the record in
    :returns: whatever ``client.getRecord`` returns
    '''
    oai_client = oaipmh.client.Client(
        url, importformats.create_metadata_registry())
    request = {'identifier': record_id, 'metadataPrefix': fmt}
    return oai_client.getRecord(**request)
Ejemplo n.º 12
0
    def fetch_stage(self, harvest_object):
        '''
        The fetch stage will receive a HarvestObject object and will be
        responsible for:
            - getting the contents of the remote object (a GetRecord request
              for ``harvest_object.guid`` with prefix ``self.md_format``).
            - saving the content in the provided HarvestObject: the metadata
              map as JSON, extended with ``set_spec`` and, when the header
              datestamp can be read, ``metadata_modified``.
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        log.debug("HDR: Fetch url %s", harvest_object.job.source.url)

        try:
            self._set_config(harvest_object.job.source.config)
            # Registry creation is dependent on job.source.config
            # because of differentiation possibilities in
            # namespaces for equal md_prefix.

            # Let the logger do the formatting: concatenating with ``+``
            # raises TypeError for non-string values, which would fail the
            # whole fetch because of a debug message.
            log.debug('Application: %s', self.md_application)
            log.debug('Md_format: %s', self.md_format)
            log.debug('AddInfo: %s', self.additional_info)

            # EPOS - trick to collect extra info via GFZ - solely intended
            # for harvesting of GFZ
            log.debug('Extra citation info URL: %s',
                      self.collect_extra_info_from_gfz)

            registry = self._create_metadata_registry()
            client = oaipmh.client.Client(
                harvest_object.job.source.url,
                registry,
                self.credentials,
                force_http_get=self.force_http_get
            )
            try:
                self._before_record_fetch(harvest_object)

                record = client.getRecord(
                    identifier=harvest_object.guid,
                    metadataPrefix=self.md_format
                )
                self._after_record_fetch(record)

            except Exception:
                log.exception('getRecord failed')
                self._save_object_error('Get record failed!', harvest_object)
                return False

            header, metadata, _ = record

            log.debug(record)

            # The datestamp is optional: any failure to read it just means
            # no 'metadata_modified' entry is written.
            try:
                metadata_modified = header.datestamp().isoformat()
            except Exception:
                metadata_modified = None

            try:
                content_dict = metadata.getMap()

                # HDR? required still?
                content_dict['set_spec'] = header.setSpec()
                if metadata_modified:
                    content_dict['metadata_modified'] = metadata_modified

                # No ``encoding`` argument: utf-8 is already the default on
                # Python 2 and the keyword is rejected on Python 3.
                content = json.dumps(content_dict, ensure_ascii=False)
            except Exception:
                log.exception('Dumping the metadata failed!')
                self._save_object_error(
                    'Dumping the metadata failed!',
                    harvest_object
                )
                return False

            harvest_object.content = content
            harvest_object.save()
        except Exception:
            log.exception('Something went wrong 1!')
            self._save_object_error(
                'Exception in fetch stage',
                harvest_object
            )
            return False

        return True
Ejemplo n.º 13
0
    def _fetch_import_record(self, harvest_object, master_data, client, group):
        '''
        Fetch one record in every configured metadata format and import it.

        The formats are those listed under ``'metadata_formats'`` in
        ``self.config`` plus ``self.metadata_prefix_value`` (the primary
        format). For each format the parsed metadata map is collected and
        the raw GetRecord response is downloaded so it can be stored as a
        package resource. The gathered data is finally passed to
        ``oai_dc2ckan``.

        A failure for the primary format aborts the whole record; a failure
        for any other format only skips that format.

        :param harvest_object: object the errors and retries are attached to
        :param master_data: dict whose ``'record'`` entry holds the record
            identifier
        :param client: object providing ``getRecord``
        :param group: passed through unchanged to ``oai_dc2ckan``
        :returns: False if the primary format could not be processed,
            otherwise the result of ``oai_dc2ckan``
        '''
        primary_prefix = self.metadata_prefix_value
        record_id = master_data['record']

        # The fetch part.
        # Work on a copy so that appending the primary prefix does not
        # modify the list stored in self.config.
        metadata_prefixes = []
        if 'metadata_formats' in self.config:
            metadata_prefixes = list(self.config['metadata_formats'])
        if primary_prefix not in metadata_prefixes:
            metadata_prefixes.append(primary_prefix)

        data = {'metadata': {}, 'package_xml_save': {}, 'package_resource': {}}
        data['identifier'] = record_id
        data['package_name'] = self._package_name_from_identifier(data['identifier'])
        data['package_url'] = '%s?verb=GetRecord&identifier=%s&%s=%s' % (
            harvest_object.job.source.url,
            data['identifier'],
            self.metadata_prefix_key,
            primary_prefix
        )

        for mdp in metadata_prefixes:
            is_primary = (mdp == primary_prefix)

            try:
                _header, metadata, _ = client.getRecord(
                    metadataPrefix=mdp, identifier=record_id)
            except XMLSyntaxError:
                self._add_retry(harvest_object)
                log.error('XML syntax error: %s', record_id)
                self._save_object_error('Syntax error.', harvest_object, stage='Fetch')
                if is_primary:
                    return False
                continue
            except socket.error:
                self._add_retry(harvest_object)
                # exc_info() yields the exception class and instance.
                exc_type, exc_value = sys.exc_info()[:2]
                self._save_object_error(
                    'Socket error OAI-PMH %s, details:\n%s' % (exc_type, exc_value),
                    harvest_object, stage='Fetch')
                if is_primary:
                    return False
                continue
            except urllib2.URLError:
                self._add_retry(harvest_object)
                self._save_object_error('Failed to fetch record.', harvest_object, stage='Fetch')
                if is_primary:
                    return False
                continue
            except httplib.BadStatusLine:
                self._add_retry(harvest_object)
                self._save_object_error('Bad HTTP response status line.', harvest_object,
                                        stage='Fetch')
                if is_primary:
                    return False
                continue

            if not metadata:
                # Assume that there is no metadata and not an error.
                # Should this be a cause for retry?
                log.warning('No metadata: %s', record_id)
            if metadata is None:
                # Nothing to read a map from; treat it like the other
                # per-format failures instead of raising AttributeError.
                if is_primary:
                    return False
                continue

            # Do not save to database (because we can't json nor pickle _Element).
            # The import stage.
            # Gather all relevant information into a dictionary.
            data['metadata'][mdp] = metadata.getMap()

            try:
                nowstr = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')
                # fix for identifiers containing '/' char
                esc_identifier = data['identifier'].replace('/', '-')
                label = '%s/%s-%s.xml' % (nowstr, esc_identifier, mdp)
                resource_url = '%s?verb=GetRecord&identifier=%s&%s=%s' % (
                    harvest_object.job.source.url,
                    data['identifier'],
                    self.metadata_prefix_key,
                    mdp
                )

                response = urllib2.urlopen(resource_url)
                try:
                    xml = response.read()
                finally:
                    # Always release the connection, even if read() fails.
                    response.close()

                # quick fix for ckan in non-root url
                site_config = pylons.configuration.config
                fileurl = (site_config['ckan.site_url'] +
                           site_config['ckan.api_url'] +
                           h.url_for('storage_file', label=label))
                data['package_xml_save'][mdp] = {
                    'label': label,
                    'xml': xml
                }
                data['package_resource'][mdp] = {
                    'url': fileurl,
                    'description': 'Original ' + mdp + ' metadata record',
                    'format': 'xml',
                    'size': len(xml)
                }
            except (urllib2.HTTPError, urllib2.URLError):
                self._add_retry(harvest_object)
                self._save_object_error('Could not get original metadata record!',
                                        harvest_object, stage='Import')
                if is_primary:
                    return False
                continue
            except socket.error:
                self._add_retry(harvest_object)
                exc_type, exc_value = sys.exc_info()[:2]
                self._save_object_error(
                    'Socket error original metadata record %s, details:\n%s' % (
                        exc_type, exc_value),
                    harvest_object, stage='Import')
                if is_primary:
                    return False
                continue

        return oai_dc2ckan(data, kata_oai_dc_reader._namespaces, group, harvest_object)