コード例 #1
0
class GeocatCatalogueServiceWeb(object):
    """Thin wrapper around an OWSLib CatalogueServiceWeb client for geocat."""

    def __init__(self, url):
        # CatalogueServiceWeb issues a GetCapabilities request on creation.
        self.csw = CatalogueServiceWeb(url)
        self.schema = CHE_SCHEMA

    def get_geocat_id_from_csw(self,
                               cqlquery=CQL_QUERY_DEFAULT,
                               cqlterm=CQL_SEARCH_TERM_DEFAULT):
        """Return the identifiers of all records matching the CQL property/term.

        Pages through GetRecords responses 50 records at a time until the
        server signals the last page ('nextrecord' == 0, per CSW convention).

        Raises:
            CswNotFoundError: if the server returned nothing or zero matches.
        """
        harvest_query = PropertyIsEqualTo(cqlquery, cqlterm)
        nextrecord = 0
        record_ids = []
        while nextrecord is not None:
            self.csw.getrecords2(constraints=[harvest_query],
                                 maxrecords=50,
                                 startposition=nextrecord)
            if self.csw.response is None or self.csw.results['matches'] == 0:
                raise CswNotFoundError(
                    "No dataset found for harvest query {}".format(
                        harvest_query))
            if self.csw.results['returned'] > 0:
                # 'nextrecord' == 0 marks the final page.
                if self.csw.results['nextrecord'] > 0:
                    nextrecord = self.csw.results['nextrecord']
                else:
                    nextrecord = None
                record_ids.extend(self.csw.records.keys())
            else:
                # BUG FIX: previously, matches > 0 with returned == 0 left
                # `nextrecord` unchanged and looped forever. Stop paging.
                nextrecord = None
        return record_ids

    def get_record_by_id(self, geocat_id):
        """Return the raw XML response for one record, or None if empty."""
        self.csw.getrecordbyid(id=[geocat_id], outputschema=self.schema)
        return self.csw.response or None
コード例 #2
0
ファイル: harvest_node.py プロジェクト: okfn/ckanext-ngds
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""
    csw = None
    
    def __init__(self, url, **kwargs):
        # A URL must be given
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "", "")) # Strip URL to just domain + path
        self.frequency = kwargs.get('frequency', 'manual') # frequency should be one of manual|daily|weekly|monthly
        self.title = kwargs.get('title', 'No Title Was Given') # A title for bookkeeping
        self.node_admin_id = kwargs.get('node_admin_id', None) # Foreign Key to a responsible_party who maintains the remote node
        #self.csw = CatalogueServiceWeb(self.url) # owslib CSW class provides mechanisms for making CSW requests
    
    def setup_csw(self):
        self.csw = CatalogueServiceWeb(self.url)
        
    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw == None:
            self.setup_csw()                      
        self.get_records() # Do the first GetRecords request
        ids = self.csw.records.keys() # Start an array to house all of the ids
        print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"])
        
        while self.csw.results["nextrecord"] < self.csw.results["matches"] and self.csw.results["nextrecord"] != 0: # Once next_record > number_matched, we've gotten everything
            self.get_records(self.csw.results["nextrecord"], self.csw.results["returned"]) # Get another set, starting from next_record from previous response
            ids += self.csw.records.keys() # Add new ids to the array
            print "next: %s, total: %s" % (self.csw.results["nextrecord"], self.csw.results["matches"])
        
        self.parse_records(ids) # Gather the records themselves
                   
    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print "Gathered %s IDs" % str(len(ids))
        for record_id in ids:
            self.get_record_by_id(record_id)
            rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id], self)
    
    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [ record_id ],
            "outputschema": "http://www.isotc211.org/2005/gmd"    
        }
        self.csw.getrecordbyid(**params) # Puts response in self.csw.records        
    
    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"          
        }
        self.csw.getrecords(**params) # Puts results in self.csw.records        
コード例 #3
0
ファイル: maindialog.py プロジェクト: oscar1780/QGIS
    def show_metadata(self):
        """show record metadata"""

        if not self.treeRecords.selectedItems():
            return

        current = self.treeRecords.currentItem()
        if not current:
            return

        identifier = get_item_data(current, 'identifier')

        self.disable_ssl_verification = self.disableSSLVerification.isChecked()

        auth = None
        if self.disable_ssl_verification:
            try:
                auth = Authentication(verify=False)
            except NameError:
                pass

        # Fetch the full record for the selected identifier under a wait cursor.
        try:
            with OverrideCursor(Qt.WaitCursor):
                cat = CatalogueServiceWeb(
                    self.catalog_url,
                    timeout=self.timeout,  # spellok
                    username=self.catalog_username,
                    password=self.catalog_password,
                    auth=auth)
                cat.getrecordbyid(
                    [self.catalog.records[identifier].identifier])
        except ExceptionReport as err:
            QMessageBox.warning(
                self, self.tr('GetRecords error'),
                self.tr('Error getting response: {0}').format(err))
            return
        except KeyError:
            QMessageBox.warning(self, self.tr('Record parsing error'),
                                self.tr('Unable to locate record identifier'))
            return

        md_record = cat.records[identifier]
        md_record.xml_url = cat.request

        dialog = RecordDialog()
        html = render_template('en', self.context, md_record,
                               'record_metadata_dc.html')

        dialog.textMetadata.document().setDefaultStyleSheet(
            QgsApplication.reportStyleSheet())
        dialog.textMetadata.setHtml(html)
        dialog.exec_()
コード例 #4
0
    def test_GetRecordById(self):
        # Fetch two known identifiers in the GMD schema.
        csw = CatalogueServiceWeb(service)
        wanted = identifiers[:2]
        csw.getrecordbyid(wanted, outputschema=GMD)
        fetched = len(csw.records)
        assert fetched == len(wanted), fetched
        for ident, rec in csw.records.items():
            identifiers.append(ident)
            assert isinstance(rec, MD_Metadata), (ident, rec)

        # A bogus identifier must yield an empty result set.
        csw.getrecordbyid(["nonexistent"], outputschema=GMD)
        fetched = len(csw.records)
        assert fetched == 0, fetched
コード例 #5
0
    def test_GetRecordById(self):
        # Request the first two known identifiers and check the returned types.
        csw = CatalogueServiceWeb(service)
        requested = identifiers[:2]
        csw.getrecordbyid(requested, outputschema=GMD)
        count = len(csw.records)
        assert count == len(requested), count
        for ident in list(csw.records):
            identifiers.append(ident)
            record = csw.records[ident]
            assert isinstance(record, MD_Metadata), (ident, record)

        # An unknown identifier should return no records at all.
        csw.getrecordbyid(["nonexistent"], outputschema=GMD)
        count = len(csw.records)
        assert count == 0, count
コード例 #6
0
    def get_csw_record_by_id(self, csw_url, identifier):
        '''
        Function to return OWSLib CSW record record from specified CSW URL using UUID as the search criterion
        '''
        csw = CatalogueServiceWeb(csw_url)
        assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url

        csw.getrecordbyid(id=[identifier], esn='full', outputschema='own')

        # Ensure there is exactly one record found
        assert len(
            csw.records) > 0, 'No CSW records found for ID "%s"' % identifier
        assert len(
            csw.records) == 1, 'Multiple CSW records found for ID "%s"' % identifier

        # BUG FIX: dict.values() is a non-subscriptable view on Python 3, so
        # `csw.records.values()[0]` raised TypeError. next(iter(...)) works on
        # both Python 2 and 3.
        return next(iter(csw.records.values()))
コード例 #7
0
    def get_csw_record_by_id(self, csw_url, identifier):
        '''
        Function to return OWSLib CSW record record from specified CSW URL using UUID as the search criterion
        '''
        csw = CatalogueServiceWeb(csw_url)
        assert csw.identification.type == 'CSW', '%s is not a valid CSW service' % csw_url

        csw.getrecordbyid(id=[identifier], esn='full', outputschema='own')

        # Ensure there is exactly one record found
        assert len(
            csw.records) > 0, 'No CSW records found for ID "%s"' % identifier
        assert len(
            csw.records) == 1, 'Multiple CSW records found for ID "%s"' % identifier

        # BUG FIX: dict.values() is a non-subscriptable view on Python 3, so
        # `csw.records.values()[0]` raised TypeError. next(iter(...)) works on
        # both Python 2 and 3.
        return next(iter(csw.records.values()))
コード例 #8
0
ファイル: maindialog.py プロジェクト: smagic20/QGIS
    def show_metadata(self):
        """show record metadata"""

        if not self.treeRecords.selectedItems():
            return

        item = self.treeRecords.currentItem()
        if not item:
            return

        identifier = get_item_data(item, 'identifier')

        # Fetch the full record under a wait cursor.
        try:
            QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
            cat = CatalogueServiceWeb(self.catalog_url,
                                      timeout=self.timeout,
                                      username=self.catalog_username,
                                      password=self.catalog_password)
            cat.getrecordbyid([self.catalog.records[identifier].identifier])
        except ExceptionReport as err:
            QApplication.restoreOverrideCursor()
            QMessageBox.warning(
                self, self.tr('GetRecords error'),
                self.tr('Error getting response: {0}').format(err))
            return
        except KeyError:
            # BUG FIX: restore the cursor *before* showing the modal warning,
            # consistent with the ExceptionReport branch above (previously the
            # wait cursor stayed active while the message box was open).
            QApplication.restoreOverrideCursor()
            QMessageBox.warning(self, self.tr('Record parsing error'),
                                self.tr('Unable to locate record identifier'))
            return

        QApplication.restoreOverrideCursor()

        record = cat.records[identifier]
        record.xml_url = cat.request

        crd = RecordDialog()
        metadata = render_template('en', self.context, record,
                                   'record_metadata_dc.html')

        style = QgsApplication.reportStyleSheet()
        crd.textMetadata.document().setDefaultStyleSheet(style)
        crd.textMetadata.setHtml(metadata)
        crd.exec_()
コード例 #9
0
ファイル: maindialog.py プロジェクト: cayetanobv/QGIS
    def show_metadata(self):
        """show record metadata"""

        if not self.treeRecords.selectedItems():
            return

        item = self.treeRecords.currentItem()
        if not item:
            return

        identifier = get_item_data(item, 'identifier')

        # Fetch the full record under a wait cursor.
        try:
            QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
            cat = CatalogueServiceWeb(self.catalog_url, timeout=self.timeout,
                                      username=self.catalog_username,
                                      password=self.catalog_password)
            cat.getrecordbyid(
                [self.catalog.records[identifier].identifier])
        except ExceptionReport as err:
            QApplication.restoreOverrideCursor()
            QMessageBox.warning(self, self.tr('GetRecords error'),
                                self.tr('Error getting response: {0}').format(err))
            return
        except KeyError:
            # BUG FIX: restore the cursor *before* showing the modal warning,
            # consistent with the ExceptionReport branch above (previously the
            # wait cursor stayed active while the message box was open).
            QApplication.restoreOverrideCursor()
            QMessageBox.warning(self,
                                self.tr('Record parsing error'),
                                self.tr('Unable to locate record identifier'))
            return

        QApplication.restoreOverrideCursor()

        record = cat.records[identifier]
        record.xml_url = cat.request

        crd = RecordDialog()
        metadata = render_template('en', self.context,
                                   record, 'record_metadata_dc.html')

        style = QgsApplication.reportStyleSheet()
        crd.textMetadata.document().setDefaultStyleSheet(style)
        crd.textMetadata.setHtml(metadata)
        crd.exec_()
コード例 #10
0
ファイル: spider.py プロジェクト: arbakker/pdok-spider-python
def get_service_url(result):
    """Resolve the GetCapabilities URL for the service record in *result*."""
    md_id = result["mdId"]
    csw = CatalogueServiceWeb(CSW_URL)
    csw.getrecordbyid(id=[md_id])
    uris = csw.records[md_id].uris

    if not uris:
        # No service endpoint on the record: log and return an empty URL.
        error_message = (
            f"expected at least 1 service url in service record {md_id}, found 0"
        )
        logging.error(error_message)
        return {"mdId": md_id, "url": ""}

    base_url = uris[0]["url"].split("?")[0]
    protocol_name = result["protocol"].split(":")[1]
    if "https://geodata.nationaalgeoregister.nl/tiles/service/wmts" in base_url:
        base_url = "https://geodata.nationaalgeoregister.nl/tiles/service/wmts"
    service_url = f"{base_url}?request=GetCapabilities&service={protocol_name}"
    return {"mdId": md_id, "url": service_url}
コード例 #11
0
ファイル: harvester.py プロジェクト: SP7-Ritmare/geogate
def harvest_csw(src_url, dest_url):
    """Copy every record from the CSW at src_url into the CSW at dest_url.

    Pages through GetRecords responses (10 at a time) and re-inserts each
    record into the destination via a CSW transaction.
    (Python 2 code: uses the print statement below.)
    """
    stop = 0
    flag = 0
    maxrecords = 10

    src = CatalogueServiceWeb(src_url)
    dest = CatalogueServiceWeb(dest_url)

    while stop == 0:
        if flag == 0:  # first run, start from 0
            startposition = 0
        else:  # subsequent run, startposition is now paged
            startposition = src.results['nextrecord']

        src.getrecords2(esn='full', startposition=startposition, maxrecords=maxrecords)

        print(src.results)

        if src.results['nextrecord'] == 0 \
           or src.results['returned'] == 0 \
           or src.results['nextrecord'] > src.results['matches']:  # end the loop, exhausted all records
            stop = 1
            break

        # harvest each record to destination CSW
        for i in list(src.records):
            print "insert", i
            # Re-fetch the single record in the ISO/gmd output schema.
            src.getrecordbyid(id=[i], outputschema='http://www.isotc211.org/2005/gmd')
            # NOTE(review): reaches into OWSLib's private parsed-response tree
            # (src._exml) to extract the raw MD_Metadata element.
            md = src._exml.find('{http://www.isotc211.org/2005/gmd}MD_Metadata')
            # Round-trips the XML through a fixed temp file before inserting.
            f = open('/tmp/a.xml', 'w')
            f.write(etree.tostring(md))
            f.close()
            dest.transaction(ttype='insert', typename='gmd:MD_Metadata', record=open("/tmp/a.xml").read())

        flag = 1
コード例 #12
0
ファイル: spider.py プロジェクト: arbakker/pdok-spider-python
def get_record_by_id(mdId):
    """Return the CSW record whose metadata identifier equals *mdId*."""
    catalogue = CatalogueServiceWeb(CSW_URL)
    catalogue.getrecordbyid(id=[mdId])
    records = catalogue.records
    return records[mdId]
コード例 #13
0
ファイル: search_backend.py プロジェクト: rldhont/QGIS
class CSW202Search(SearchBase):
    """Search backend speaking CSW 2.0.2 via owslib's CatalogueServiceWeb."""

    def __init__(self, url, timeout, username, password, auth):
        super().__init__(url, timeout, username, password, auth)

        self.type = CATALOG_TYPES[0]
        self.format = 'xml'
        self.service_info_template = 'csw_service_metadata.html'
        self.record_info_template = 'record_metadata_dc.html'
        self.constraints = []

        # Connecting performs a GetCapabilities request.
        self.conn = CatalogueServiceWeb(
            self.url,  # spellok
            timeout=self.timeout,
            username=self.username,
            password=self.password,
            auth=self.auth)

        self.request = self.conn.request
        self.response = self.conn.response

    def query_records(self, bbox=None, keywords=None, limit=10, offset=1):
        """Run a GetRecords query with optional spatial and keyword filters.

        BUG FIX: ``bbox`` previously defaulted to a mutable list literal
        (``[]``); default to None and normalize instead (backward compatible).
        """
        if bbox is None:
            bbox = []

        self.constraints = []

        # only apply spatial filter if bbox is not global
        # even for a global bbox, if a spatial filter is applied, then
        # the CSW server will skip records without a bbox
        if bbox and bbox != ['-180', '-90', '180', '90']:
            minx, miny, maxx, maxy = bbox
            self.constraints.append(
                BBox([miny, minx, maxy, maxx],
                     crs='urn:ogc:def:crs:EPSG::4326'))

        # keywords
        if keywords:
            # TODO: handle multiple word searches
            self.constraints.append(PropertyIsLike('csw:AnyText', keywords))

        if len(self.constraints) > 1:  # exclusive search (a && b)
            self.constraints = [self.constraints]

        self.conn.getrecords2(constraints=self.constraints,
                              maxrecords=limit,
                              startposition=offset,
                              esn='full')

        self.matches = self.conn.results['matches']
        self.returned = self.conn.results['returned']

        self.request = self.conn.request
        self.response = self.conn.response

    def records(self):
        """Return the last query's records as a list of plain dicts."""
        recs = []

        for key in self.conn.records:
            md = self.conn.records[key]  # hoist the repeated dict lookup

            rec = {
                'identifier': md.identifier or None,
                'type': md.type or None,
                'title': md.title or None,
                'bbox': bbox_list_to_dict(md.bbox) if md.bbox else None,
                'links': md.uris + md.references,
            }

            recs.append(rec)

        return recs

    def get_record(self, identifier):
        """Fetch a single record by identifier and return it."""
        self.conn.getrecordbyid([identifier])

        return self.conn.records[identifier]
コード例 #14
0
ファイル: CSW_Consuming.py プロジェクト: PGCHM/Usage_OWSLib
# Search for bird data in Canada
bbox_query = BBox([-141, 42, -52, 84])

csw.getrecords2(constraints=[birds_query, bbox_query])

csw.results

# Search for keywords like ‘birds’ or ‘fowl’
birds_query_like = PropertyIsLike('dc:subject', '%birds%')
fowl_query_like = PropertyIsLike('dc:subject', '%fowl%')
csw.getrecords2(constraints=[birds_query_like, fowl_query_like])
csw.results

# Search for a specific record:
csw.getrecordbyid(id=['9250AA67-F3AC-6C12-0CB9-0662231AA181'])
c.records['9250AA67-F3AC-6C12-0CB9-0662231AA181'].title

# Search with a CQL query

csw.getrecords(cql='csw:AnyText like "%birds%"')

csw.transaction(ttype='insert',
                typename='gmd:MD_Metadata',
                record=open("file.xml").read())

# update ALL records
csw.transaction(ttype='update',
                typename='csw:Record',
                propertyname='dc:title',
                propertyvalue='New Title')
コード例 #15
0
class CswBaseHandler(object):
    """Context-manager wrapper around a CSW endpoint that maps ISO 19139
    records into CKAN-style package dicts."""

    def __init__(self, url, username=None, password=None):
        self.url = url
        self.username = username
        self.password = password
        try:
            self.remote = CatalogueServiceWeb(self.url,
                                              timeout=3600,
                                              lang='fr-FR',
                                              version='2.0.2',
                                              skip_caps=True,
                                              username=self.username,
                                              password=self.password)
        except Exception:
            raise CswReadError()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Parameters renamed so the builtin `type` is not shadowed.
        self.close()

    def close(self):
        # Fake: nothing to release; kept for the context-manager protocol.
        logger.info('Close CSW connection')

    @CswExceptionsHandler()
    def get_packages(self, *args, **kwargs):
        """Run a GetRecords query and return every record that converts
        cleanly to a package dict; failing records are logged and skipped."""
        self.remote.getrecords2(**kwargs)
        records = self.remote.records.copy()
        res = []
        for k in list(records.keys()):
            try:
                package = self.get_package(k)
            except CswBaseError as e:
                logger.warning(e)
            else:
                res.append(package)
        return res

    @CswExceptionsHandler()
    def get_package(self, id, *args, **kwargs):
        """Fetch one record by identifier (ISO/gmd schema) and convert it to
        a CKAN-style package dict.

        Raises:
            CswBaseError: wrong output schema, or the record is not a dataset.
        """
        self.remote.getrecordbyid(
            [id], outputschema='http://www.isotc211.org/2005/gmd')

        records = self.remote.records.copy()

        rec = records[id]

        xml = rec.xml
        if not rec.__class__.__name__ == 'MD_Metadata':
            raise CswBaseError('outputschema error')
        # if not (rec.stdname == 'ISO 19115:2003/19139' and rec.stdver == '1.0'):
        #     raise CswBaseError('outputschema error: stdname:{} stdver:{}'.format(rec.stdname, rec.stdver))
        if rec.hierarchy and not rec.hierarchy == 'dataset':  # 7218
            raise CswBaseError(
                'MD {id} is not a Dataset'.format(id=rec.identifier))

        # _encoding = rec.charset

        id = rec.identifier
        title = rec.identification.title
        name = slugify(title)
        notes = description = rec.identification.abstract
        thumbnail = None

        keywords, themes = [], []
        for item in rec.identification.keywords2:
            if not item.__class__.__name__ == 'MD_Keywords':
                continue
            if item.type == 'theme':
                themes += item.keywords
            keywords += item.keywords

        # BUG FIX: compile once, outside the loop, and use a raw string so the
        # regex escapes (\w, \s, \-) survive string literal processing.
        keyword_match = re.compile(r'[\w\s\-.]*$', re.UNICODE)
        tags = []
        for keyword in keywords:
            if not keyword:
                continue
            if keyword_match.match(keyword):
                tags.append({'display_name': keyword})

        groups = [{'name': topic} for topic in rec.identification.topiccategory]
        if themes:
            groups += [{'name': theme} for theme in themes]

        dataset_creation_date = None
        dataset_modification_date = None
        dataset_publication_date = None
        if rec.identification.date:
            for item in rec.identification.date:
                if not item.__class__.__name__ == 'CI_Date':
                    continue
                if item.type == 'creation':
                    dataset_creation_date = item.date
                elif item.type == 'publication':
                    dataset_publication_date = item.date
                elif item.type in ('modification', 'revision'):
                    dataset_modification_date = item.date

        frequency = None
        geocover = None
        granularity = None
        organisation = {
            'id': None,
            'name': None,
            'title': None,
            'description': None,
            'created': None,
            'is_organization': True,
            'state': 'active',
            'image_url': None,
            'type': 'organization',
            'approval_status': 'approved',
        }

        license_titles = rec.identification.uselimitation or []

        support = None
        data_type = None
        author = None
        author_email = None
        maintainer = None
        maintainer_email = None

        bbox = None
        spatial = None
        if hasattr(rec.identification, 'bbox'):
            xmin = rec.identification.bbox.minx
            ymin = rec.identification.bbox.miny
            xmax = rec.identification.bbox.maxx
            ymax = rec.identification.bbox.maxy

            bbox = transform(bounds_to_wkt(xmin, ymin, xmax, ymax), '4326')
            spatial = {
                'type':
                'Polygon',
                'coordinates': [[[xmin, ymin], [xmax, ymin], [xmax, ymax],
                                 [xmin, ymax], [xmin, ymin]]]
            }

        # BUG FIX: this loop previously reassigned `name`, `description` and
        # `url`, clobbering the dataset-level `name` (the slugified title)
        # that is returned below. Use distinct loop-local names.
        resources = []
        for item in rec.distribution.online:
            res_name = hasattr(item, 'name') and item.name or ''
            res_description = hasattr(
                item, 'description') and item.description or ''
            res_protocol = hasattr(item, 'protocol') and item.protocol or ''
            res_mimetype = hasattr(item, 'mimetype') and item.mimetype or ''
            res_url = hasattr(item, 'url') and item.url or ''
            resources.append({
                'name': res_name,
                'description': res_description,
                'protocol': res_protocol,
                'mimetype': res_mimetype,
                'url': res_url,
            })

        return {
            'state': 'active',
            'type': 'dataset',
            'id': id,
            'name': name,
            'title': title,
            'notes': notes,
            'thumbnail': thumbnail,
            'num_tags': len(tags),
            'tags': tags,
            'groups': groups,
            'metadata_created': dataset_creation_date,
            'metadata_modified': dataset_modification_date,
            'dataset_creation_date': dataset_creation_date,
            'dataset_modification_date': dataset_modification_date,
            'dataset_publication_date': dataset_publication_date,
            'frequency': frequency,
            'geocover': geocover,
            'granularity': granularity,
            'organization': organisation,
            'license_titles': license_titles,
            'support': support,
            'datatype': data_type,
            'author': author,
            'author_email': author_email,
            'maintainer': maintainer,
            'maintainer_email': maintainer_email,
            'num_resources': len(resources),
            'resources': resources,
            'spatial': spatial,
            'bbox': bbox,
            'xml': xml,
        }
コード例 #16
0
class CSWRepository(HarvestRepository):
    """ CSW Repository """

    def setRepoParams(self, repoParams):
        """Configure the repo and create the CSW client; on failure the
        client is left as None so crawling is skipped gracefully."""
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except Exception:  # narrowed from bare 'except:'
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        """Page through the remote catalogue and write one header row per
        record identifier."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)

        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return

        item_count = 0
        while True:
            try:
                # Subsequent pages: resume from the server-reported nextrecord.
                self.cswrepo.getrecords2(
                    startposition=self.cswrepo.results['nextrecord'])
            except Exception:  # narrowed from bare 'except:'
                # First pass: no previous results yet, start from the top.
                self.cswrepo.getrecords2()

            for rec in self.cswrepo.records:
                result = self.db.write_header(
                    self.cswrepo.records[rec].identifier, self.repository_id)
                item_count += 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} item headers after {} ({:.1f} items/sec)".
                        format(item_count, self.formatter.humanize(tdelta),
                               item_count / tdelta))
            if item_count >= self.cswrepo.results['matches']:
                break

        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Convert an OWSLib CSW record into the internal OAI-style dict."""
        record = {}

        if csw_record.language == "eng":
            record["title"] = csw_record.title
            record["title"] = record["title"].strip()
            record["title_fr"] = ""
            record["tags"] = csw_record.subjects
        elif csw_record.language == "fre":
            record["title_fr"] = csw_record.title
            record["title_fr"] = record["title_fr"].strip()
            record["title"] = ""
            record["tags_fr"] = csw_record.subjects

        record["description"] = csw_record.abstract
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["series"] = ""

        if csw_record.bbox:
            if float(csw_record.bbox.minx) > float(csw_record.bbox.maxx):
                # longitude values (minx and maxx) are switched by oswlib; switch them back
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.miny,
                    "westLon": csw_record.bbox.maxx,
                    "northLat": csw_record.bbox.maxy,
                    "eastLon": csw_record.bbox.minx
                }]
            elif float(csw_record.bbox.miny) > float(csw_record.bbox.maxy):
                # sometimes x and y values are switched, so the lats are longs and vice versa
                # we can look for the same discrepancy that happens in the longs, except it's in the y values now
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.minx,
                    "westLon": csw_record.bbox.maxy,
                    "northLat": csw_record.bbox.maxx,
                    "eastLon": csw_record.bbox.miny
                }]
            else:
                # default if nothing is wrong (this code isn't executed currently)
                record["geobboxes"] = [{
                    "southLat": csw_record.bbox.miny,
                    "westLon": csw_record.bbox.minx,
                    "northLat": csw_record.bbox.maxy,
                    "eastLon": csw_record.bbox.maxx
                }]

        return record

    @rate_limited(5)
    def _update_record(self, record):
        """Re-fetch one record and write it to the database; deletes the
        local row when the remote record is gone or unreadable."""
        if self.cswrepo is None:
            return

        try:
            self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        except Exception:  # narrowed from bare 'except:'
            self.logger.error("Unable to update record: {}".format(
                record['local_identifier']))
            self.db.delete_record(record)
            return False

        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record,
                                                record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(
                id=[record['local_identifier']],
                outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[
                record['local_identifier']].datestamp
            # BUG FIX: raw string so '\.' is a regex escape, not a (deprecated)
            # string escape. Strips a trailing time component from the date.
            oai_record["pub_date"] = re.sub(
                r"[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                oai_record["pub_date"])
            if oai_record:
                try:
                    self.db.write_record(oai_record, self)
                except Exception:  # narrowed from bare 'except:'
                    if self.dump_on_failure:
                        try:
                            print(csw_record)
                        except Exception:
                            pass
            return True

        else:
            # This record was deleted upstream
            self.db.delete_record(record)
            return True
        # (unreachable 'return False' removed)
コード例 #17
0
# Bail out unless both a request type and a target URL were supplied.
if request is None or url is None:
    usage()
    sys.exit(3)

# Map the short schema flag to the full ISO 19139 namespace URI.
if schema == 'iso':
    outputschema = 'http://www.isotc211.org/2005/gmd'

# init
# Open the CSW connection. (Python 2 script: print statements below.)
c = CatalogueServiceWeb(url, lang, version)

# Dispatch on the requested CSW operation.
if request == 'GetCapabilities':
    pass
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

if print_request is True:  # print the request
    print c.request

if validate is True:  # do XML validation
    print 'Validating request XML'
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print 'request is valid XML'
    else:
        print 'request is NOT valid XML'
コード例 #18
0
ファイル: ckan_csw.py プロジェクト: fterrier/ckanext-zhgis
class CkanMetadata(object):
    """ Provides general access to CSW for CKAN.

    Wraps an OWSLib ``CatalogueServiceWeb`` endpoint and maps CSW/ISO
    metadata onto a fixed set of CKAN attribute keys.
    """

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        # skip_caps avoids the GetCapabilities round-trip on construction
        self.catalog = CatalogueServiceWeb(
            url,
            lang,
            version,
            timeout=10,
            skip_caps=True
        )
        # CKAN attribute keys this mapper knows how to fill (values start as None)
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """ Returns the found csw dataset with the given searchterm.

        Raises DatasetNotFoundError when the catalogue reports no match.
        """
        self.catalog.getrecords(
            keywords=[searchterm],
            propertyname=propertyname
        )
        if (self.catalog.response is None or
                self.catalog.results['matches'] == 0):
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found"
                % (searchterm, propertyname)
            )
        return self.catalog.records

    def get_by_id(self, id):
        """ Returns the raw CSW response for the dataset with the given id """
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """
            Returns the id of a dataset identified by it's name.
            If there are multiple datasets with the given name,
            only the id of the first one is returned.
        """
        dataset_list = self.get_by_search(dataset_name, 'title')
        # next(iter(...)) works on both Python 2 and 3; the previous
        # .itervalues().next() raised AttributeError on Python 3 dicts.
        return next(iter(dataset_list.values())).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """
        Abstract method to define the mapping
        of a ckan attribute to a csw attribute
        """
        raise NotImplementedError

    def get_xml(self, id):
        """ Returns the metadata XML string for *id*; raises when missing. """
        dataset_xml_string = self.get_by_id(id)
        if dataset_xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return dataset_xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """ Fills self.metadata for the dataset *id* and returns it. """
        log.debug("Dataset ID: %s" % id)

        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            # each key delegates to a subclass-defined attribute mapper
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(
                xml=dataset_xml,
                lang=language
            )
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
コード例 #19
0
class CSWSource:
    """ A CSW Harvest Source.

    Wraps an OWSLib ``CatalogueServiceWeb`` connection and accumulates
    harvested records, errors and duplicate identifiers.  All state is
    per-instance: the previous version kept these containers as class
    attributes, so every instance silently shared the same lists.
    """

    def __init__(self, url):
        self.url = url
        self.csw = None  # CatalogueServiceWeb connection, set by connect_csw()
        self.csw_info = {}  # service metadata + harvested records
        self.errors = []  # human-readable error strings collected while harvesting
        self.datasets = []  # all datasets included
        self.validation_errors = []
        self.duplicates = []  # list of datasets with the same identifier

    def get_cleaned_url(self):
        """ Return self.url with params, query string and fragment removed. """
        parts = urlparse(self.url)
        return urlunparse(
            (parts.scheme, parts.netloc, parts.path, None, None, None))

    def connect_csw(self, clean_url=True, timeout=120):
        """ Connect to the CSW source; True on success, False on error. """
        url = self.get_cleaned_url() if clean_url else self.url
        try:
            self.csw = CatalogueServiceWeb(url, timeout=timeout)
        except Exception as e:
            error = f'Error connection CSW: {e}'
            self.errors.append(error)
            return False

        self.read_csw_info()
        return True

    def as_json(self):
        """ Refresh and return the service-info dict. """
        self.read_csw_info()
        return self.csw_info

    def get_records(self, page=10, outputschema='gmd', esn='brief'):
        """ Iterate all records of the source, *page* records at a time.

        Yields one plain dict per record and caches each one under
        ``self.csw_info['records']``.  Only outputschema='gmd'
        (ISO 19139 MD_Metadata) is supported.
        """
        self.csw_info['records'] = {}
        self.csw_info['pages'] = 0

        # TODO get filters from harvest source
        # https://github.com/GSA/ckanext-spatial/blob/datagov/ckanext/spatial/harvesters/csw.py#L90
        cql = None

        startposition = 0
        kwa = {
            "constraints": [],
            "typenames": 'csw:Record',
            # esn: the ElementSetName 'full', 'brief' or 'summary'
            "esn": esn,
            "startposition": startposition,
            "maxrecords": page,
            "outputschema": namespaces[outputschema],
            "cql": cql,
        }

        matches = 0
        while True:
            try:
                self.csw.getrecords2(**kwa)
            except Exception as e:
                error = f'Error getting records(2): {e}'
                self.errors.append(error)
                break
            if self.csw.exceptionreport:
                exceptions = self.csw.exceptionreport.exceptions
                error = 'Error getting records: {}'.format(exceptions)
                self.errors.append(error)
                break

            self.csw_info['pages'] += 1
            if matches == 0:
                matches = self.csw.results['matches']

            records = self.csw.records.items()

            for key, csw_record in records:
                if outputschema == 'gmd':
                    # it's a MD_Metadata object (owslib.iso)
                    value = self.md_metadata_to_dict(csw_record)
                elif outputschema == 'csw':
                    # it's a CSWRecord
                    raise Exception('Not using CSWRecords')
                else:
                    # the previous code fell through to a NameError here
                    raise ValueError(f'Unknown outputschema: {outputschema}')

                value['esn'] = esn
                self.csw_info['records'][key] = value
                yield value

            if len(records) == 0:
                break

            startposition += page
            if startposition > matches:
                break

            kwa["startposition"] = startposition

        self.csw_info['total_records'] = len(self.csw_info['records'])

    def get_record(self, identifier, esn='full', outputschema='gmd'):
        """ Get full record info for *identifier*; None on CSW error. """
        try:
            self.csw.getrecordbyid(
                [identifier], outputschema=namespaces[outputschema])
        except ExceptionReport as e:
            self.errors.append(f'Error getting record {e}')
            # 'Invalid parameter value: locator=outputSchema' is an XML error
            return None

        csw_record = self.csw.records[identifier]
        dict_csw_record = self.md_metadata_to_dict(csw_record)

        # merge with whatever a previous brief harvest already stored
        record = self.csw_info['records'].get(identifier, {})
        record.update(dict_csw_record)
        record['esn'] = esn
        record['outputschema'] = outputschema

        self.csw_info['records'][identifier] = record

        return record

    def md_metadata_to_dict(self, mdm):
        """ Flatten an OWSLib MD_Metadata object into a plain dict. """
        # mdm.xml is bytes and deliberately not included
        ret = {
            'identifier': mdm.identifier,
            'parentidentifier': mdm.parentidentifier,
            'language': mdm.language,
            'dataseturi': mdm.dataseturi,
            'languagecode': mdm.languagecode,
            'datestamp': mdm.datestamp,
            'charset': mdm.charset,
            'hierarchy': mdm.hierarchy,
            'datetimestamp': mdm.datetimestamp,
            'stdname': mdm.stdname,
            'stdver': mdm.stdver,
        }

        ret['contact'] = [{
            'name': ctc.name,
            'organization': ctc.organization,
            'city': ctc.city,
            'email': ctc.email,
            'country': ctc.country,
        } for ctc in mdm.contact]

        ret['locales'] = [{
            'id': lo.id,
            'languagecode': lo.languagecode,
            'charset': lo.charset,
        } for lo in mdm.locales]

        # title/abstract only; MD_Identification carries much more info
        ret['identificationinfo'] = [{
            'title': ii.title,
            'abstract': ii.abstract,
        } for ii in mdm.identificationinfo]

        ret['contentinfo'] = [{'xml': ci.xml} for ci in mdm.contentinfo]

        ret['distribution'] = {}
        if mdm.distribution is not None:
            ret['distribution'] = {
                'format': mdm.distribution.format,
                'version': mdm.distribution.version,
            }  # there is much more info available

        # TODO ret['dataquality'] = mdm.dataquality
        return ret

    def read_csw_info(self):
        """ Read service-level metadata from the live CSW connection. """
        service = self.csw
        # Check each service instance conforms to OWSLib interface
        service.alias = 'CSW'

        ident = service.identification
        csw_info = {
            'version': service.version,
            'identification': {
                'type': ident.type,
                'version': ident.version,
                'title': ident.title,
                'abstract': ident.abstract,
                'keywords': ident.keywords,
                'accessconstraints': ident.accessconstraints,
                'fees': ident.fees,
            },
        }

        ctc = service.provider.contact
        csw_info['provider'] = {
            'name': service.provider.name,
            'url': service.provider.url,
            'contact': {
                'name': ctc.name,
                'organization': ctc.organization,
                'site': ctc.site,
                'instructions': ctc.instructions,
                'email': ctc.email,
                'country': ctc.country,
            },
        }

        csw_info['operations'] = []
        for op in service.operations:
            methods = op.methods
            for method in methods:
                if isinstance(method, dict):
                    # normalize constraint objects into plain dicts
                    constraints = []
                    for k, v in method.items():
                        if k == 'constraints':
                            for c in v:
                                if isinstance(c, dict):
                                    constraints.append(c)
                                else:
                                    constraints.append(
                                        {'name': c.name, 'values': c.values})
                            method['constraints'] = constraints

            csw_info['operations'].append({
                'name': op.name,
                'formatOptions': op.formatOptions,
                'methods': methods,
            })

        self.csw_info.update(csw_info)
        return self.csw_info

    def get_original_url(self, harvest_id=None):
        """ Build a direct GetRecordById URL for this source. """
        parts = urlparse(self.url)

        params = {
            'SERVICE': 'CSW',
            'VERSION': '2.0.2',
            'REQUEST': 'GetRecordById',
            'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
            'OUTPUTFORMAT': 'application/xml',
        }
        if harvest_id is not None:
            params['ID'] = harvest_id

        url = urlunparse((parts.scheme, parts.netloc, parts.path, None,
                          urlencode(params), None))

        return url

    def validate(self):
        """ Validate harvested datasets. Currently a stub.

        Returns (ok, errors) -- always (True, None) for now.
        """
        return True, None

    def remove_duplicated_identifiers(self):
        """ Drop datasets whose identifier was already seen.

        Returns the accumulated list of duplicated identifiers.
        (The previous version removed elements from the list it was
        iterating, which skipped the element after each removal.)
        """
        unique_identifiers = []
        kept = []
        for dataset in self.datasets:
            idf = dataset['identifier']
            if idf not in unique_identifiers:
                unique_identifiers.append(idf)
                kept.append(dataset)
            else:
                self.duplicates.append(idf)
        self.datasets[:] = kept

        return self.duplicates

    def count_resources(self):
        """ read all datasets and count resources """
        total = 0
        for dataset in self.datasets:
            pass  # TODO count resources per dataset
        return total

    def _save_json(self, data, path):
        """ Dump *data* as indented JSON to *path* (internal helper). """
        with open(path, 'w') as f:
            f.write(json.dumps(data, indent=2))

    def save_validation_errors(self, path):
        self._save_json(self.validation_errors, path)

    def save_duplicates(self, path):
        self._save_json(self.duplicates, path)

    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()

            #TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  #adds "name": "inline"

            #FIXME identifier uses incompatible characters as paths (e.g. /).
            # could exist duplicated paths from different resources
            # use BASE64 or hashes
            idf = slugify(dataset['identifier'])

            resource_path = os.path.join(folder_path,
                                         f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(folder_path,
                                        f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
コード例 #20
0
class CkanMetadata(object):
    """ Provides general access to CSW for CKAN.

    Maps CSW/ISO metadata from an OWSLib catalogue onto a fixed set of
    CKAN attribute keys.
    """

    def __init__(self, url, schema, version='2.0.2', lang='en-US'):
        self.schema = schema
        # skip_caps avoids the GetCapabilities round-trip on construction
        self.catalog = CatalogueServiceWeb(url,
                                           lang,
                                           version,
                                           timeout=10,
                                           skip_caps=True)
        # CKAN attribute keys this mapper fills (values start as None)
        self.metadata = dict.fromkeys([
            'id',
            'name',
            'title',
            'url',
            'author',
            'maintainer',
            'maintainer_email',
            'license_url',
            'version',
            'service_url',
            'service_type',
            'notes',
            'tags',
            'metadata_url',
            'metadata_raw',
        ])

    def get_by_search(self, searchterm, propertyname='csw:AnyText'):
        """ Returns the found csw dataset with the given searchterm.

        Raises DatasetNotFoundError when the catalogue reports no match.
        """
        self.catalog.getrecords(keywords=[searchterm],
                                propertyname=propertyname)
        if (self.catalog.response is None
                or self.catalog.results['matches'] == 0):
            raise DatasetNotFoundError(
                "No dataset for the given searchterm '%s' (%s) found" %
                (searchterm, propertyname))
        return self.catalog.records

    def get_by_id(self, id):
        """ Returns the raw CSW response for the dataset with the given id """
        self.catalog.getrecordbyid(id=[id], outputschema=self.schema)
        return self.catalog.response

    def get_id_by_dataset_name(self, dataset_name):
        """
            Returns the id of a dataset identified by it's name.
            If there are multiple datasets with the given name,
            only the id of the first one is returned.
        """
        dataset_list = self.get_by_search(dataset_name, 'title')
        # next(iter(...)) is Python-3 compatible; the previous
        # .itervalues().next() raised AttributeError on Python 3 dicts.
        return next(iter(dataset_list.values())).identifier

    def get_attribute(self, ckan_attribute, dataset_name=None):
        """
        Abstract method to define the mapping
        of a ckan attribute to a csw attribute
        """
        raise NotImplementedError

    def get_xml(self, id):
        """ Returns the metadata XML string for *id*; raises when missing. """
        dataset_xml_string = self.get_by_id(id)
        if dataset_xml_string is None:
            raise DatasetNotFoundError("Dataset with id %s not found" % id)
        return dataset_xml_string

    def get_ckan_metadata_by_id(self, id, language='de'):
        """ Fills self.metadata for the dataset *id* and returns it. """
        log.debug("Dataset ID: %s" % id)

        dataset_xml = etree.fromstring(self.get_xml(id))
        for key in self.metadata:
            log.debug("Metadata key: %s" % key)
            # each key delegates to a subclass-defined attribute mapper
            attribute = self.get_attribute(key)
            self.metadata[key] = attribute.get_value(xml=dataset_xml,
                                                     lang=language)
        return self.metadata

    def get_ckan_metadata(self, dataset_name, language='de'):
        """ Returns the requested dataset mapped to CKAN attributes """
        id = self.get_id_by_dataset_name(dataset_name)
        return self.get_ckan_metadata_by_id(id, language)
コード例 #21
0
def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'onm',
        supplier_title = u"Office national de l'eau et des milieux aquatiques",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://opendata-sie-back.brgm-rec.fr/geosource/srv/eng/csw'  # Recette environment

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)

    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except:
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    formats = set()
    groups = [
        harvester.upsert_group(dict(
            title = u'Environnement',
            )),
        ]
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        csw.getrecordbyid(id = [record_id])
        record = csw.records[record_id]

        formats.add(record.format)
        temporals.add(record.temporal)
        types.add(record.type)

        if not args.dry_run:
            package = dict(
                license_id = u'fr-lo',
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        record.abstract,
                        record.source,
                        )
                    if fragment
                    ),
                resources = [
                    dict(
                        description = uri.get('description') or None,
                        format = {
                            'CSV': 'CSV',
                            'ESRI Shapefile': 'SHP',
                            'MIF / MID': 'MIF / MID',  # TODO?
                            'RDF': 'RDF',
                            'SHP': 'SHP',
                            'Txt': 'TXT',
                            'WMS': 'WMS',
                            }.get(record.format, record.format),
                        name = uri.get('name'),
                        url = uri['url'],
                        )
                    for uri in record.uris
                    ],
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in record.subjects
                    ],
#                territorial_coverage = TODO
                # Datasets have a granularity of either "commune" or "poi". Since the both are indexed the same way, use
                # "poi".
                territorial_coverage_granularity = 'poi',
                title = record.title,
#                url = u'URL TODO',
                )

            log.info(u'Harvested package: {}'.format(package['title']))
            harvester.add_package(package, harvester.supplier, record.title, package['url'], groups = groups)

    if not args.dry_run:
        harvester.update_target()

    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))

    return 0
コード例 #22
0
ファイル: csw-client.py プロジェクト: CowanSM/owslib
# Mini CSW command-line client: dispatch a single OGC CSW operation.
# NOTE(review): Python 2 only (print statements). `request`, `url`, `schema`,
# `lang`, `version`, `typename`, etc. are presumably parsed from the command
# line earlier in the script -- confirm against the full file.
if request is None or url is None:
    usage()
    sys.exit(3)

# 'iso' is shorthand for the ISO 19139 (gmd) metadata output schema
if schema == 'iso':
  outputschema = 'http://www.isotc211.org/2005/gmd'

# init
c = CatalogueServiceWeb(url, lang, version)

# Dispatch the requested operation; GetCapabilities needs no extra call
# because the constructor already fetched the capabilities document.
if request == 'GetCapabilities':
    pass
elif request == 'DescribeRecord':
    c.describerecord(typename)
elif request == 'GetRecordById':
    c.getrecordbyid([id])
elif request == 'GetDomain':
    c.getdomain(dname, dtype)
elif request == 'GetRecords':
    c.getrecords(qtype, [keyword], bbox, esn, sortby, schema)

if print_request is True: # print the request
    print c.request

if validate is True: # do XML validation
    print 'Validating request XML'
    # validate the generated request against the schema advertised by the CSW
    if util.xmlvalid(c.request, csw.schema_location.split()[1]) is True:
        print 'request is valid XML'
    else:
        print 'request is NOT valid XML'
コード例 #23
0
ファイル: geonetwork.py プロジェクト: wildintellect/geonode
 def get_by_uuid(self, uuid):
     """Fetch one catalogue record by UUID via CSW GetRecordById.

     Returns the parsed gmd record object, or None when the catalogue
     returned no record for this UUID.
     """
     csw = CatalogueServiceWeb(self.base + "srv/en/csw")
     csw.getrecordbyid([uuid], outputschema=namespaces["gmd"])
     # list(...) is needed on Python 3, where dict.values() returns a
     # non-indexable view (the previous recs.values()[0] raised TypeError).
     records = list(csw.records.values())
     return records[0] if records else None
コード例 #24
0
ファイル: csw_scanner.py プロジェクト: thriuin/od-harvester
class CswScanner:
    def __init__(self):
        self.napids = []
        self.start_pos = 0

        # Get the CSW URL, Username and Password

        ini_config = ConfigParser()
        ini_config.read('harvester.ini')
        csw_url = ini_config.get('csw', 'csw.url')
        csw_user = ini_config.get('csw', 'csw.username')
        csw_passwd = ini_config.get('csw', 'csw.password')
        if csw_user and csw_passwd:
            self.csw = CatalogueServiceWeb(csw_url,
                                           username=csw_user,
                                           password=csw_passwd,
                                           timeout=20)
        else:
            self.csw = CatalogueServiceWeb(csw_url, timeout=20)

#### The since date is currently being ignored.

    def get_all_ids(self, since=None):

        while True:
            if since is not None:
                scan_date = since.strftime('%Y-%m-%d')
                since_query = PropertyIsGreaterThanOrEqualTo(
                    'Modified', scan_date)
                self.csw.getrecords2(esn='brief',
                                     startposition=self.start_pos,
                                     typenames='gmd:MD_Metadata',
                                     constraints=[since_query])
            else:
                #self.csw.getrecords2(esn='brief', startposition=self.start_pos, typenames='gmd:MD_Metadata')
                self.csw.getrecords2(
                    xml=
                    '<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" service="CSW" version="2.0.2" resultType="results" outputSchema="csw:IsoRecord"><csw:Query typeNames="gmd:MD_Metadata"><csw:Constraint version="1.1.0"><Filter xmlns="http://www.opengis.net/ogc" xmlns:gml="http://www.opengis.net/gml"/></csw:Constraint></csw:Query></csw:GetRecords>'
                )
            if self.csw.results['returned'] == 0:
                break
            print '{0}Found {1}{2}{3} records'.format(
                Fore.GREEN, Fore.BLUE, self.csw.results['matches'], Fore.GREEN)
            print '{0}Next record: {1}{2}'.format(
                Fore.GREEN, Fore.BLUE, self.csw.results['nextrecord'])
            self.start_pos = self.csw.results['nextrecord']

            for rec in self.csw.records:
                try:
                    print u'{0}{1}{2}: {3}'.format(
                        Fore.RED, rec, Fore.CYAN,
                        self.csw.records[rec].title.decode('utf-8'))
                except UnicodeEncodeError:
                    print u'{0}Unprintable title for {1}{2}'.format(
                        Fore.GREEN, Fore.RED, rec)
                self.napids.append(rec)

    def load_naps(self):

        ns = Namespaces()
        gmd = ns.get_namespace('gmd')
        session = connect_to_database()

        for napid in self.napids:

            print '{0}Full NAP Record for {1}{2}'.format(
                Fore.GREEN, Fore.CYAN, napid)
            self.csw.getrecordbyid(id=[napid], outputschema=gmd)

            ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord)

            if ec_rec is None:
                ec_rec = ECRecord(
                    uuid=self.csw.records[napid].identifier,
                    title=self.csw.records[napid].identification.title,
                    state='active',
                    nap_record=self.csw.records[napid].xml,
                    csw_scanned=datetime.now().isoformat())
            else:
                ec_rec.title = self.csw.records[napid].identification.title,
                ec_rec.state = 'active',
                ec_rec.nap_record = self.csw.records[napid].xml,
                ec_rec.csw_scanned = datetime.now().isoformat()

            add_record(session, ec_rec)

        session.close_all()
コード例 #25
0
class GeonetUserHandler(metaclass=Singleton):
    """Authenticated client for a GeoNetwork CSW publication endpoint.

    Wraps the csw-publication service (read + write transactions) plus a
    couple of GeoNetwork-specific HTTP endpoints (q, md.publish).
    """

    def __init__(self):
        self.username = GEONETWORK_LOGIN
        self.password = GEONETWORK_PASSWORD
        # csw-publication (unlike plain csw) accepts write transactions
        self.remote = CatalogueServiceWeb(urljoin(GEONETWORK_URL,
                                                  'srv/fre/csw-publication'),
                                          timeout=GEONETWORK_TIMEOUT,
                                          lang='fr-FR',
                                          version='2.0.2',
                                          skip_caps=True,
                                          username=self.username,
                                          password=self.password)

    def _get(self, url, params):
        """GET *url* with basic auth; raises on HTTP error statuses."""
        r = requests.get(url,
                         params=params,
                         auth=(self.username, self.password))
        r.raise_for_status()
        return r

    def _q(self, identifier):
        """Resolve a metadata UUID to GeoNetwork's internal id (or None)."""
        r = self._get(urljoin(GEONETWORK_URL, 'srv/fre/q'), {
            'uuid': identifier,
            '_content_type': 'json'
        })
        metadata = r.json().get('metadata')
        if metadata \
                and len(metadata) == 1 \
                and metadata[0]['uuid'] == identifier:
            return metadata[0]['id']
        # otherwise implicitly return None -- TODO: raise an error instead?

    def _md_publish(self, identifier):
        """Make the record with internal id *identifier* publicly visible."""
        return self._get(urljoin(GEONETWORK_URL, 'srv/fre/md.publish'),
                         {'ids': identifier})

    def _transaction(self, ttype, identifier, record=None):
        """Run a CSW transaction (insert/update/delete) on one MD record."""
        return self.remote.transaction(ttype=ttype,
                                       typename='gmd:MD_Metadata',
                                       identifier=identifier,
                                       record=record)

    def is_record_exists(self, id):
        """Return True when a MD record with this identifier exists."""
        # bool() replaces the error-prone `x and True or False` idiom
        return bool(self.get_record(id))

    def get_record(self, id):
        """Return the ISO (gmd) MD record for *id*, or None on HTTP 404."""
        try:
            self.remote.getrecordbyid(
                id=[id], outputschema='http://www.isotc211.org/2005/gmd')
        except requests.exceptions.HTTPError as e:
            logger.exception(e)
            if (e.response.status_code == 404):
                logger.warning("Error 404 was ignored.")
                return None
            raise  # bare raise keeps the original traceback intact
        else:
            logger.debug("Get MD record with dc:identifier = '%s'" % id)
            return self.remote.records.get(id)

    def create_record(self, id, record):
        logger.debug("Create MD record with dc:identifier = '%s'" % id)
        return self._transaction('insert', id, record=record)

    def update_record(self, id, record):
        logger.debug("Update MD record with dc:identifier = '%s'" % id)
        return self._transaction('update', id, record=record)

    def delete_record(self, id):
        logger.debug("Delete MD record with dc:identifier = '%s'" % id)
        return self._transaction('delete', id)

    def publish(self, id):
        """Publish (make public) the record with the given UUID."""
        return self._md_publish(self._q(id))
コード例 #26
0
class CSWRepository(HarvestRepository):
    """ CSW Repository: pages through a Catalogue Service for the Web
    endpoint and mirrors record headers/metadata into the local database. """
    def setRepoParams(self, repoParams):
        """Store repo parameters and open the owslib client for self.url."""
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        self.cswrepo = CatalogueServiceWeb(self.url)
        self.domain_metadata = []

    def _crawl(self):
        """Register/refresh this repo in the DB, then walk every CSW record
        and write one header row per record identifier."""
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)

        item_count = 0
        while True:
            try:
                # Resume from the server-reported next record; on the first
                # pass `results` has no 'nextrecord' yet, so fall back to a
                # plain request. Narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate.
                self.cswrepo.getrecords2(
                    startposition=self.cswrepo.results['nextrecord'])
            except Exception:
                self.cswrepo.getrecords2()

            for rec in self.cswrepo.records:
                result = self.db.write_header(
                    self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info(
                        "Done {} item headers after {} ({:.1f} items/sec)".
                        format(item_count, self.formatter.humanize(tdelta),
                               item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break

        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map a Dublin Core CSW record onto the flat OAI-style record dict."""
        record = {}

        record["title"] = csw_record.title
        record["description"] = csw_record.abstract
        record["tags"] = csw_record.subjects
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["contact"] = self.contact
        record["series"] = ""

        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                # time.clock() was removed in Python 3.8; perf_counter() is
                # the monotonic replacement.
                elapsed = time.perf_counter() - last_time_called[0]
                left_to_wait = min_interval - elapsed

                if left_to_wait > 0:
                    time.sleep(left_to_wait)

                lock.release()

                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Refresh one record from the endpoint; delete it locally when the
        endpoint no longer has it. Returns True in both cases."""
        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record,
                                                record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(
                id=[record['local_identifier']],
                outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[
                record['local_identifier']].datestamp
            # Raw string avoids the invalid "\." escape warning; strips any
            # trailing time-of-day portion from the datestamp.
            oai_record["pub_date"] = re.sub(
                r"[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                oai_record["pub_date"])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id,
                                     self.metadataprefix.lower(),
                                     self.domain_metadata)
            return True

        else:
            # This record was deleted
            self.db.delete_record(record)
            return True
        # NOTE: the trailing `return False` of the original was unreachable
        # (both branches above return) and has been removed.
コード例 #27
0
class GeonetUserHandler(metaclass=Singleton):
    """Authenticated client for a GeoNetwork CSW-T publication endpoint.

    Singleton: one shared connection per process (enforced by the metaclass).
    """

    def __init__(self):
        self.username = GEONET_USERNAME
        self.password = GEONET_PASSWORD
        # skip_caps avoids a GetCapabilities round-trip at construction time.
        self.remote = CatalogueServiceWeb(urljoin(GEONET_URL,
                                                  'srv/fre/csw-publication'),
                                          timeout=GEONET_TIMEOUT,
                                          lang='fr-FR',
                                          version='2.0.2',
                                          skip_caps=True,
                                          username=self.username,
                                          password=self.password)

    def _get(self, url, params):
        """HTTP GET with basic auth; raises HTTPError on non-2xx responses."""
        r = requests.get(url,
                         params=params,
                         auth=(self.username, self.password))
        r.raise_for_status()
        return r

    def _q(self, identifier):
        """Resolve a metadata UUID to GeoNetwork's internal id.

        Returns None (implicitly) when the search does not yield exactly one
        matching record.
        """
        r = self._get(urljoin(GEONET_URL, 'srv/fre/q'), {
            'uuid': identifier,
            '_content_type': 'json'
        })
        metadata = r.json().get('metadata')
        if metadata \
                and len(metadata) == 1 \
                and metadata[0]['uuid'] == identifier:
            return metadata[0]['id']
        # Otherwise: should this be an error?

    def _md_publish(self, identifier):
        """Trigger publication of the record(s) with the given internal id(s)."""
        return self._get(urljoin(GEONET_URL, 'srv/fre/md.publish'),
                         {'ids': identifier})

    def _transaction(self, ttype, identifier, record):
        """Run a CSW-T transaction ('insert'/'update'/...) on gmd:MD_Metadata."""
        params = {
            'identifier': identifier,
            'record': record,
            'ttype': ttype,
            'typename': 'gmd:MD_Metadata'
        }
        return self.remote.transaction(**params)

    def is_record_exists(self, id):
        """Return True when a record with this identifier exists remotely.

        ``bool()`` replaces the legacy ``x and True or False`` idiom.
        """
        return bool(self.get_record(id))

    def get_record(self, id):
        """Fetch the ISO 19139 (gmd) record for ``id``; None when absent."""
        self.remote.getrecordbyid(
            id=[id], outputschema='http://www.isotc211.org/2005/gmd')
        return self.remote.records.get(id)

    def create_record(self, id, record):
        """Insert ``record`` under identifier ``id``."""
        return self._transaction('insert', id, record)

    def update_record(self, id, record):
        """Replace the existing record identified by ``id``."""
        return self._transaction('update', id, record)

    # def delete_record(self, id):
    #     return self.remote.transaction('delete', id)

    def publish(self, id):
        """Publish a record: resolve its UUID to the internal id, then md.publish."""
        return self._md_publish(self._q(id))
コード例 #28
0
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.

           Validates every package first (exists, was harvested, was
           harvested by the FIS-Broker harvester, has a FIS-Broker guid) and
           raises the corresponding error before touching the network.
           Returns the list of reimported CSW records on success.'''

        # package.id -> FIS-Broker guid, filled during validation below
        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if not harvester_type == HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        harvest_job.gather_started = datetime.datetime.utcnow()
        assert harvest_job

        # instantiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        # package_id stays in scope so the except clause below can report
        # which package was being processed when the connection failed.
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    # wrap the fetched XML in a HarvestObject so the regular
                    # import pipeline can process it as a 'reimport' change
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status',value='change'),
                                            HarvestObjectExtra(key='type',value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    # force_import bypasses the harvester's usual skip logic
                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages
コード例 #29
0
class CSWSource(HarvesterBaseSource):
    """ A CSW Harvest Source: wraps an owslib CatalogueServiceWeb client and
    accumulates endpoint/record information in ``csw_info``. """

    # owslib client, created by fetch()
    csw = None
    # kept for backward compatibility; __init__ shadows it per instance
    csw_info = {}

    def __init__(self, url):
        super().__init__()
        self.url = url
        # Per-instance dict: without this, every instance would share and
        # mutate the single class-level csw_info mapping.
        self.csw_info = {}

    def get_cleaned_url(self):
        """Return self.url with params, query and fragment stripped."""
        # remove all URL params
        parts = urlparse(self.url)
        return urlunparse(
            (parts.scheme, parts.netloc, parts.path, None, None, None))

    def fetch(self, clean_url=True, timeout=120):
        """Connect to the CSW endpoint and read its service metadata.

        Records the failure in self.errors and re-raises on connection error.
        """
        # connect to csw source
        url = self.get_cleaned_url() if clean_url else self.url
        try:
            self.csw = CatalogueServiceWeb(url, timeout=timeout)
        except Exception as e:
            error = f'Error connection CSW: {e}'
            self.errors.append(error)
            logger.error(error)
            raise

        self.read_csw_info()

    def as_json(self):
        """Refresh and return the accumulated endpoint info dict."""
        self.read_csw_info()
        return self.csw_info

    def get_records(self, page=10, outputschema='gmd', esn='brief'):
        """Yield every record from the endpoint, paging ``page`` at a time."""
        # iterate pages to get all records
        self.csw_info['records'] = {}
        self.csw_info['pages'] = 0

        # TODO get filters fom harvest source
        # https://github.com/GSA/ckanext-spatial/blob/datagov/ckanext/spatial/harvesters/csw.py#L90
        cql = None

        # output schema
        # outputschema: the outputSchema (default is 'http://www.opengis.net/cat/csw/2.0.2')
        # "csw" at GeoDataGovGeoportalHarvester
        # "gmd" at CSWHarvester
        # outputschema = 'gmd'  # https://github.com/geopython/OWSLib/blob/master/owslib/csw.py#L551

        startposition = 0
        kwa = {
            "constraints": [],
            "typenames": 'csw:Record',
            "esn": esn,
            # esn: the ElementSetName 'full', 'brief' or 'summary' (default is 'full')
            "startposition": startposition,
            "maxrecords": page,
            "outputschema": namespaces[outputschema],
            "cql": cql,
        }

        matches = 0
        self.csw_info['records'] = {}
        while True:
            try:
                self.csw.getrecords2(**kwa)
            except Exception as e:
                error = f'Error getting records(2): {e}'
                self.errors.append(error)
                break

            if self.csw.exceptionreport:
                exceptions = self.csw.exceptionreport.exceptions
                error = 'Error getting records: {}'.format(exceptions)
                self.errors.append(error)
                # raise Exception(error)
                break

            self.csw_info['pages'] += 1
            if matches == 0:
                matches = self.csw.results['matches']

            records = self.csw.records.items()

            for record in records:
                key, csw_record = record
                # Initialize first: the 'csw' branch below used to write into
                # ``value`` before it was ever assigned (UnboundLocalError).
                value = {}
                if outputschema == 'gmd':
                    # it's a MD_Metadata object
                    # https://github.com/geopython/OWSLib/blob/3338340e6a9c19dd3388240815d35d60a0d0cf4c/owslib/iso.py#L31
                    value = self.md_metadata_to_dict(csw_record)
                elif outputschema == 'csw':
                    # it's a CSWResource
                    error = 'Not using CSW schema, we require GMD'
                    value['error'] = error

                try:
                    value['iso_values'] = self.read_values_from_xml(
                        xml_data=value['content'])
                except Exception as e:
                    error = f'Error reading ISO values {e}'
                    value['error'] = error
                    raise  # Exception(error)

                value['esn'] = esn
                self.csw_info['records'][key] = value
                yield value

            if len(records) == 0:
                break

            startposition += page
            if startposition > matches:
                break

            kwa["startposition"] = startposition

        self.csw_info['total_records'] = len(self.csw_info['records'].keys())

    def get_record(self, identifier, esn='full', outputschema='gmd'):
        """Fetch one record's full info; None when the request errors out."""
        #  Get Full record info
        try:
            records = self.csw.getrecordbyid(
                [identifier], outputschema=namespaces[outputschema])
        except ExceptionReport as e:
            self.errors.append(f'Error getting record {e}')
            # 'Invalid parameter value: locator=outputSchema' is an XML error
            return None

        csw_record = self.csw.records[identifier]
        dict_csw_record = self.md_metadata_to_dict(csw_record)

        record = self.csw_info['records'].get(identifier, {})
        record.update(dict_csw_record)
        record['esn'] = esn
        record['outputschema'] = outputschema

        self.csw_info['records'][identifier] = record

        return record

    def read_values_from_xml(self, xml_data):
        """Parse ISO XML into the dict shape produced by ISODocument.read_values."""
        # transform the XML in a dict as ISODocument class
        # (https://github.com/GSA/ckanext-spatial/blob/2a25f8d60c31add77e155c4136f2c0d4e3b86385/ckanext/spatial/model/harvested_metadata.py#L461) did with its read_values function.

        iso_parser = ISODocument(xml_str=xml_data)
        return iso_parser.read_values()

    def process_xml(self, raw_xml):
        """Extract the MD_Metadata/MI_Metadata element from a raw XML payload.

        Raises Exception when the bytes cannot be decoded, parsed, or no
        metadata element is found.
        """
        # get the XML part we need
        # check samples at /samples folder

        try:
            str_xml = raw_xml.decode('utf-8')
        except Exception as e:
            error = f'Unable to decode bytes as UTF-8: {e}'
            raise Exception(error)
        str_xml = str_xml.replace('\\n', '\n').replace('\\t', '\t')

        try:
            mdtree = xet.fromstring(str_xml)
        except Exception as e:
            error = f'{e}\n\n - Unable to parse string. \n\n: \t{str_xml[:350]} \n\n'
            raise Exception(error)

        # check if root IS what we are looking for
        needed = [
            '{http://www.isotc211.org/2005/gmd}MD_Metadata',
            '{http://www.isotc211.org/2005/gmi}MI_Metadata'
        ]
        if mdtree.tag in needed:
            gm = mdtree
        else:
            # https://docs.python.org/3/library/xml.etree.elementtree.html#parsing-xml-with-namespaces
            ns = {
                'gmd': 'http://www.isotc211.org/2005/gmd',
                'gmi': 'http://www.isotc211.org/2005/gmi'
            }

            gm1 = mdtree.find('gmd:MD_Metadata', ns)
            gm2 = mdtree.find('gmi:MI_Metadata', ns)
            # Element objects are falsy when they have no children, so test
            # against None explicitly instead of `gm1 or gm2`.
            gm = gm1 if gm1 is not None else gm2
            # if we have not a xmlns reference the search fails
            if gm is None:
                gm1 = mdtree.find('MD_Metadata')
                gm2 = mdtree.find('MI_Metadata')
                # the original never assigned gm from this fallback
                gm = gm1 if gm1 is not None else gm2

            if gm is None:
                # removed undefined tg:"{tg}"
                error = f'Unable to find MD_Metadata. \n\n: \t{str_xml[:150]} \n\n mdtree.root: {mdtree.tag}'
                raise Exception(error)
        try:
            res = xet.tostring(gm)
        except Exception as e:
            error = f'{e}\n\n - gm1:{gm1} gm2:{gm2}\n\n Unable to string. \n\n: \t{str_xml[:150]} \n\n mdtree.root: {mdtree.tag}'
            raise Exception(error)

        if not isinstance(res, str):
            res = res.decode('utf-8')

        return res

    def md_metadata_to_dict(self, mdm):
        """Flatten an owslib MD_Metadata object into a plain dict."""
        # analyze an md_metadata object
        ret = {}

        ret['content'] = self.process_xml(raw_xml=mdm.xml)
        res = '<?xml version="1.0" encoding="UTF-8"?>\n{}'.format(
            ret['content'])
        ret['xml'] = res
        ret['identifier'] = mdm.identifier
        ret['parentidentifier'] = mdm.parentidentifier
        ret['language'] = mdm.language
        ret['dataseturi'] = mdm.dataseturi
        ret['languagecode'] = mdm.languagecode
        ret['datestamp'] = mdm.datestamp
        ret['charset'] = mdm.charset
        ret['hierarchy'] = mdm.hierarchy
        ret['contact'] = []
        for ctc in mdm.contact:
            contact = {
                'name': ctc.name,
                'organization': ctc.organization,
                'city': ctc.city,
                'email': ctc.email,
                'country': ctc.country
            }
            ret['contact'].append(contact)

        ret['datetimestamp'] = mdm.datetimestamp
        ret['stdname'] = mdm.stdname
        ret['stdver'] = mdm.stdver
        ret['locales'] = []
        for lo in mdm.locales:
            ret['locales'].append({
                'id': lo.id,
                'languagecode': lo.languagecode,
                'charset': lo.charset
            })

        # ret['referencesystem'] = mdm.referencesystem
        # this two will be reemplaced by "identificationinfo"
        #   ret['identification'] = mdm.identification
        #   ret['serviceidentification'] = mdm.serviceidentification
        ret['identificationinfo'] = []
        for ii in mdm.identificationinfo:
            iid = {
                'title': ii.title,
                'abstract': ii.abstract
            }  # there are much more info
            ret['identificationinfo'].append(iid)

        ret['contentinfo'] = []
        for ci in mdm.contentinfo:
            cid = {'xml': ci.xml}  # there are much more info
            ret['contentinfo'].append(cid)

        ret['distribution'] = {}
        if mdm.distribution is not None:
            dd = {
                'format': mdm.distribution.format,
                'version': mdm.distribution.version
            }  # there are much more info
            ret['distribution'] = dd

        # TODO ret['dataquality'] = mdm.dataquality
        return ret

    def read_csw_info(self):
        """Collect identification/provider/operations info from the client
        into self.csw_info and return it."""
        csw_info = {}
        service = self.csw
        # Check each service instance conforms to OWSLib interface
        service.alias = 'CSW'
        csw_info['version'] = service.version
        csw_info['identification'] = {}  # service.identification
        csw_info['identification']['type'] = service.identification.type
        csw_info['identification']['version'] = service.identification.version
        csw_info['identification']['title'] = service.identification.title
        csw_info['identification'][
            'abstract'] = service.identification.abstract
        csw_info['identification'][
            'keywords'] = service.identification.keywords
        csw_info['identification'][
            'accessconstraints'] = service.identification.accessconstraints
        csw_info['identification']['fees'] = service.identification.fees

        csw_info['provider'] = {}
        csw_info['provider']['name'] = service.provider.name
        csw_info['provider']['url'] = service.provider.url
        ctc = service.provider.contact
        contact = {
            'name': ctc.name,
            'organization': ctc.organization,
            'site': ctc.site,
            'instructions': ctc.instructions,
            'email': ctc.email,
            'country': ctc.country
        }
        csw_info['provider']['contact'] = contact

        csw_info['operations'] = []
        for op in service.operations:
            methods = op.methods
            for method in methods:
                if type(method) == dict:
                    constraints = []
                    for k, v in method.items():
                        if k == 'constraints':
                            for c in v:
                                if type(c) == dict:
                                    constraints.append(c)
                                else:
                                    mc = {'name': c.name, 'values': c.values}
                                    constraints.append(mc)
                            method['constraints'] = constraints

            operation = {
                'name': op.name,
                'formatOptions': op.formatOptions,
                'methods': methods
            }
            csw_info['operations'].append(operation)

        self.csw_info.update(csw_info)
        return self.csw_info

    def get_original_url(self, harvest_id=None):
        """Build the endpoint's GetRecordById URL (optionally for one id)."""
        # take the URL and add required params
        parts = urlparse(self.url)
        # urlparse('http://www.cwi.nl:80/%7Eguido/Python.html?q=90&p=881')
        # ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='q=90&p=881', fragment='')

        params = {
            'SERVICE': 'CSW',
            'VERSION': '2.0.2',
            'REQUEST': 'GetRecordById',
            'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
            'OUTPUTFORMAT': 'application/xml',
        }
        if harvest_id is not None:
            params['ID'] = harvest_id

        url = urlunparse((parts.scheme, parts.netloc, parts.path, None,
                          urlencode(params), None))

        return url

    def validate(self):
        # TODO
        return True

    def remove_duplicated_identifiers(self):
        """Drop datasets whose identifier was already seen; return duplicates.

        The original removed elements from self.datasets while iterating it,
        which silently skips the element after each removal. Filter into a
        new list and assign back in place instead.
        """
        unique_identifiers = []
        kept = []

        for dataset in self.datasets:
            idf = dataset['identifier']
            if idf not in unique_identifiers:
                unique_identifiers.append(idf)
                kept.append(dataset)
            else:
                self.duplicates.append(idf)

        # slice-assign to preserve the list object's identity for callers
        self.datasets[:] = kept
        return self.duplicates

    def count_resources(self):
        """ read all datasets and count resources """
        total = 0
        for dataset in self.datasets:
            pass  # TODO
        return total
コード例 #30
0
ファイル: geonetwork.py プロジェクト: mbertrand/cga-worldmap
 def get_by_uuid(self, uuid):
     """Fetch a single gmd:MD_Metadata record by UUID; None when not found.

     Uses next(iter(...)) because dict.values() is a non-subscriptable view
     on Python 3 (the original `recs.values()[0]` only worked on Python 2).
     """
     csw = CatalogueServiceWeb(self.base + "srv/en/csw")
     csw.getrecordbyid([uuid], outputschema=namespaces["gmd"])
     recs = csw.records
     return next(iter(recs.values())) if recs else None
コード例 #31
0
ファイル: CSWRepository.py プロジェクト: axfelix/globus_oai
class CSWRepository(HarvestRepository):
    """ CSW Repository: pages through a Catalogue Service for the Web
    endpoint and mirrors record headers/metadata into the local database. """

    def setRepoParams(self, repoParams):
        """Store repo parameters and try to open the CSW endpoint client."""
        self.metadataprefix = "csw"
        super(CSWRepository, self).setRepoParams(repoParams)
        try:
            self.cswrepo = CatalogueServiceWeb(self.url)
        except Exception:
            # Narrowed from a bare except: a failed connection leaves the
            # client unset, and _crawl/_update_record skip work accordingly.
            self.cswrepo = None
        self.domain_metadata = []

    def _crawl(self):
        """Register/refresh this repo in the DB, then walk every CSW record
        and write one header row per record identifier."""
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name,
            "repo_type": "csw",
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)

        if self.cswrepo is None:
            self.logger.error("Could not initiate this repo to crawl it")
            return

        item_count = 0
        while True:
            try:
                # Resume from the server-reported next record; on the first
                # pass `results` has no 'nextrecord' yet, so fall back to a
                # plain request. Narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate.
                self.cswrepo.getrecords2(startposition=self.cswrepo.results['nextrecord'])
            except Exception:
                self.cswrepo.getrecords2()

            for rec in self.cswrepo.records:
                result = self.db.write_header(self.cswrepo.records[rec].identifier, self.repository_id)
                item_count = item_count + 1
                if (item_count % self.update_log_after_numitems == 0):
                    tdelta = time.time() - self.tstart + 0.1
                    self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count,
                                                                                               self.formatter.humanize(
                                                                                                   tdelta),
                                                                                               item_count / tdelta))
            if item_count == self.cswrepo.results['matches']:
                break

        self.logger.info("Found {} items in feed".format(item_count))

    def format_csw_to_oai(self, csw_record, local_identifier):
        """Map a Dublin Core CSW record onto the flat OAI-style record dict."""
        record = {}

        record["title"] = csw_record.title
        record["description"] = csw_record.abstract
        record["tags"] = csw_record.subjects
        record["identifier"] = local_identifier
        record["creator"] = self.name
        record["contact"] = self.contact
        record["series"] = ""

        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        threading = __import__('threading')
        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)

        def decorate(func):
            last_time_called = [0.0]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                lock.acquire()
                # time.clock() was removed in Python 3.8; perf_counter() is
                # the monotonic replacement.
                elapsed = time.perf_counter() - last_time_called[0]
                left_to_wait = min_interval - elapsed

                if left_to_wait > 0:
                    time.sleep(left_to_wait)

                lock.release()

                ret = func(*args, **kwargs)
                last_time_called[0] = time.perf_counter()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self, record):
        """Refresh one record from the endpoint; delete it locally when the
        endpoint no longer has it. Returns True in both cases, or None when
        no client is available."""
        if self.cswrepo is None:
            return

        self.cswrepo.getrecordbyid(id=[record['local_identifier']])
        if self.cswrepo.records:
            csw_record = self.cswrepo.records[record['local_identifier']]
            oai_record = self.format_csw_to_oai(csw_record, record['local_identifier'])
            # We have to request a second schema to get valid dates, no idea if issue is Hakai-specific
            self.cswrepo.getrecordbyid(id=[record['local_identifier']], outputschema="http://www.isotc211.org/2005/gmd")
            oai_record["pub_date"] = self.cswrepo.records[record['local_identifier']].datestamp
            # Raw string avoids the invalid "\." escape warning; strips any
            # trailing time-of-day portion from the datestamp.
            oai_record["pub_date"] = re.sub(r"[T ][0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?[0-9]*[Z]?$", "",
                                            oai_record["pub_date"])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), self.domain_metadata)
            return True

        else:
            # This record was deleted
            self.db.delete_record(record)
            return True
        # NOTE: the trailing `return False` of the original was unreachable
        # (both branches above return) and has been removed.
コード例 #32
0
ファイル: harvest_node.py プロジェクト: FuhuXia/ckanext-ngds
class HarvestNode(NgdsDataObject):
    """Stores information about harvest endpoints"""
    # owslib CSW client, created lazily by setup_csw()
    csw = None

    def __init__(self, url, **kwargs):
        # A URL must be given
        p = urlparse(url)
        self.url = urlunparse((p.scheme, p.netloc, p.path, "", "",
                               ""))  # Strip URL to just domain + path
        self.frequency = kwargs.get(
            'frequency',
            'manual')  # frequency should be one of manual|daily|weekly|monthly
        self.title = kwargs.get(
            'title', 'No Title Was Given')  # A title for bookkeeping
        self.node_admin_id = kwargs.get(
            'node_admin_id', None
        )  # Foreign Key to a responsible_party who maintains the remote node
        #self.csw = CatalogueServiceWeb(self.url) # owslib CSW class provides mechanisms for making CSW requests

    def setup_csw(self):
        """Create the owslib CSW client for this node's URL."""
        self.csw = CatalogueServiceWeb(self.url)

    def do_harvest(self):
        """Perform a harvest from another CSW server"""
        if self.csw is None:  # identity check instead of `== None`
            self.setup_csw()
        self.get_records()  # Do the first GetRecords request
        # list() so ids can be extended below: on Python 3, dict.keys()
        # returns a view that does not support +=. The print statements were
        # converted to the parenthesized form, which behaves identically on
        # Python 2 and parses on Python 3.
        ids = list(self.csw.records.keys())
        print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                       self.csw.results["matches"]))

        # Once next_record > number_matched, we've gotten everything
        while self.csw.results["nextrecord"] < self.csw.results[
                "matches"] and self.csw.results["nextrecord"] != 0:
            # Get another set, starting from next_record from previous response
            self.get_records(
                self.csw.results["nextrecord"], self.csw.results["returned"])
            ids += list(self.csw.records.keys())  # Add new ids to the array
            print("next: %s, total: %s" % (self.csw.results["nextrecord"],
                                           self.csw.results["matches"]))

        self.parse_records(ids)  # Gather the records themselves

    def parse_records(self, ids):
        """Perform as many GetRecordById requests as needed"""
        print("Gathered %s IDs" % str(len(ids)))
        for record_id in ids:
            self.get_record_by_id(record_id)
            rec = HarvestedRecord.from_md_metadata(self.csw.records[record_id],
                                                   self)

    def get_record_by_id(self, record_id):
        """Get a single record, by ID"""
        params = {
            "id": [record_id],
            "outputschema": "http://www.isotc211.org/2005/gmd"
        }
        self.csw.getrecordbyid(**params)  # Puts response in self.csw.records

    def get_records(self, start_position=1, max_records=1000):
        """Perform a GetRecords request"""
        params = {
            "typenames": "gmd:MD_Metadata",
            "outputschema": "http://www.isotc211.org/2005/gmd",
            "startposition": start_position,
            "maxrecords": max_records,
            "esn": "brief"
        }
        self.csw.getrecords(**params)  # Puts results in self.csw.records
コード例 #33
0
def main():
    """Harvest the Grand Lyon CSW catalogue into a CKAN repository.

    Steps:
      1. Parse command-line arguments and the INI configuration file
         (section "Etalab-CKAN-Harvesters").
      2. Page through the source CSW endpoint collecting record short
         infos, skipping individual records that make the server error.
      3. For each record, fetch both the Dublin Core and ISO 19139 (gmd)
         representations, derive resources/frequency/license, and upsert
         the resulting package through helpers.Harvester — unless
         --dry-run was given.

    Returns 0 so callers can use it as a process exit code.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    # Parsed arguments are kept in a module-level global; only
    # args.dry_run and args.verbose are consulted below.
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    # "here" lets the INI file interpolate paths relative to its own
    # directory.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    # Validate/normalize the configuration with the "conv" converters
    # (presumably Biryani-style converters — TODO confirm); unknown keys
    # are dropped (default = 'drop'), the three listed keys are required.
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'gl',
        supplier_title = u"Grand Lyon",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_site_url = u'http://catalogue.data.grandlyon.com/geosource/srv/fr/csw'

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve short infos of packages in source.
    csw = CatalogueServiceWeb(source_site_url)

    # Page through GetRecords responses.  When a page of 50 triggers a
    # server error, the page size drops to 1 to locate the offending
    # record, which is then skipped.  The bare "except" is a deliberate
    # best-effort: any failure is treated as "bad record at this index".
    bad_indexes = []
    index = 0
    limit = 50
    record_by_id = {}
    while True:
        try:
            csw.getrecords(maxrecords = limit, startposition = index)
        except:
            if limit == 1:
                # Bad record found. Skip it.
                bad_indexes.append(index)
                index += 1
                limit = 50
            else:
                # Retry one by one to find bad record and skip it.
                limit = 1
        else:
            for id, record in csw.records.iteritems():
                record_by_id[id] = record
            next_index = csw.results['nextrecord']
            # A non-advancing 'nextrecord' means the last page was
            # fetched — stop paging.
            if next_index <= index:
                break
            index = next_index

    # Retrieve packages from source.
    # These sets only accumulate values seen across the catalogue; they
    # are logged at the end to help eyeball the source's vocabularies.
    formats = set()
    licenses_url = set()
    protocols = set()
    rights = set()
    temporals = set()
    types = set()
    for record_id in record_by_id.iterkeys():
        # Fetch both representations of the record: Dublin Core (default
        # schema) and ISO 19139 ("gmd"); the latter may be absent, hence
        # the .get() below.
        csw.getrecordbyid(id = [record_id])
        dc_record = csw.records[record_id]
        csw.getrecordbyid(id = [record_id], outputschema = 'http://www.isotc211.org/2005/gmd')
        gmd_record = csw.records.get(record_id)

        # Keep only the bare format, e.g. u"text/csv (...)" -> u"text/csv".
        format = dc_record.format
        if format is not None:
            format = format.split(u' (', 1)[0]
        formats.add(format)

        copyright = dc_record.rights
        if copyright and isinstance(copyright, list):
            # Lists are made hashable (tuple) so they can live in a set
            # and match the license_id mapping keys used further below.
            copyright = tuple(copyright)
            rights.add(copyright)

        # Extract the update frequency from the ISO metadata, if any.
        # NOTE(review): `namespaces` is defined elsewhere in this file —
        # presumably the gmd/gts XML namespace map; confirm there.
        if gmd_record is None:
            frequency = None
        else:
            for frequency_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
                    '/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation'
                    '/gmd:userDefinedMaintenanceFrequency/gts:TM_PeriodDuration',
                    namespaces = namespaces):
                frequency = frequency_xml.text
                break
            else:
                frequency = None
        if frequency is not None:
            # `frequency_by_code` is defined elsewhere in this file —
            # presumably maps ISO 8601 period codes to labels; TODO
            # confirm.  NOTE(review): assert is stripped under -O.
            assert frequency in frequency_by_code, 'Unknown frequency: {}'.format(frequency)
            frequency = frequency_by_code[frequency]

        for uri in dc_record.uris:
            if uri['url'].startswith('http://opendata.data.grandlyon.com/Licence'):
                licenses_url.add(uri['url'])
            protocols.add(uri['protocol'])

        # "OpenData" is a catalogue-wide marker subject, not a real theme.
        subjects = [
            subject
            for subject in dc_record.subjects
            if subject != 'OpenData'
            ]
        # First remaining subject becomes the record's group; every
        # record also gets the fixed "Territoires et Transports" group.
        # NOTE(review): `groups` and `tags` built here are not referenced
        # when the package dict is assembled below — possibly dead code.
        groups = [
            harvester.upsert_group(dict(
                title = subjects[0],
                )),
            ] if subjects else []
        groups.append(harvester.upsert_group(dict(
            title = u'Territoires et Transports',
            )))
        tags = [
            dict(name = strings.slugify(subject))
            for subject in subjects
            ]

        related = []
        if gmd_record is None:
            # Dublin-Core-only record: keep downloadable opendata links,
            # excluding the license PDF, and normalize common formats.
            resources = [
                dict(
                    description = uri.get('description') or None,
                    format = {
                        'application/pdf': 'PDF',
                        'application/zip': 'ZIP',
                        'pdf': 'PDF',
                        'text/csv': 'CSV',
                        'text/plain': 'TXT',
                        }.get(format, format),
                    name = uri.get('name') or None,
                    url = uri['url'],
                    )
                for uri in dc_record.uris
                if uri.get('protocol') in ('WWW:DOWNLOAD-1.0-http--download', 'WWW:LINK-1.0-http--link')
                    and uri['url'].startswith('http://opendata.data.grandlyon.com/')
                    and uri['url'] != 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf'
                ]
        else:
            # ISO record: build resources from the distribution section.
            kml_resource = False
            resources = []
            for online in gmd_record.distribution.online:
                # Skip internal/unusable links (catalogue resources,
                # local files, JDBC URLs) and the license PDF.
                if online.url.startswith((
                        'http://catalogue.data.grandlyon.com/geosource/srv/en/resources.get?id=',
                        'file:',
                        'jdbc:',
                        )) \
                        or online.url == 'http://opendata.data.grandlyon.com/Licence_ODbL_Grand_Lyon.pdf':
                    continue
                if online.protocol == 'OGC:WFS':
                    # Derive at most one KML listing resource per record,
                    # plus a GML GetFeature URL when the WFS URL has no
                    # query string yet.
                    if not kml_resource:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'KML',
                            name = online.name or None,
                            url = 'http://kml.data.grandlyon.com/grandlyon/?request=list&typename={}'.format(
                                online.name),
                            ))
                        kml_resource = True
                    if '?' not in online.url:
                        resources.append(dict(
                            description = online.description or None,
                            format = 'GML',
                            name = online.name or None,
                            url = u'{}?SERVICE={}&REQUEST=GetFeature&VERSION=1.1.0&typename={}'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name),
                            ))
                elif online.protocol == 'OGC:WMS':
                    if '?' not in online.url:
                        # Build a WMS GetMap thumbnail "related" item
                        # covering the record's bounding box.
                        bounding_box = gmd_record.identification.extent.boundingBox
                        related.append(dict(
                            image_url = u'{}?SERVICE={}&REQUEST=GetMap&VERSION=1.1.1&LAYERS={}&FORMAT=image/png'
                                u'&SRS=EPSG:4326&BBOX={},{},{},{}&WIDTH=400&HEIGHT=300'.format(online.url,
                                online.protocol.split(':', 1)[1], online.name, bounding_box.minx, bounding_box.miny,
                                bounding_box.maxx, bounding_box.maxy),
                            title = u'Vignette',
                            type = u'visualization',
                            # url = None,
                            ))
                # The online link itself always becomes a resource; an
                # unexpected protocol would raise KeyError here.
                resources.append(dict(
                    description = online.description or None,
                    format = {
                        'DB:POSTGIS': 'POSTGIS',
                        'FILE:RASTER': 'RASTER',
                        'OGC:WCS': 'WCS',
                        'OGC:WFS': 'WFS',
                        'OGC:WMS': 'WMS',
                        'WWW:DOWNLOAD-1.0-http--download': None,
                        'WWW:LINK-1.0-http--link': None,
                        }[online.protocol],
                    name = online.name or None,
                    url = online.url,
                    ))
        temporals.add(dc_record.temporal)
        types.add(dc_record.type)

        if args.dry_run:
            log.info(u'Harvested package: {}'.format(dc_record.title))
        else:
            # Assemble the CKAN package dict; frequency and license_id
            # fall back to None for unmapped codes via .get().
            package = dict(
                frequency = {
                    'P0Y0M0DT0H1M0S': u"ponctuelle",
                    }.get(frequency),
                license_id = {
                    'copyright': None,
                    ('Licence ODbL GRAND LYON', u"Pas de restriction d'accès public"): u'odc-odbl',
                    'license': None,
                    }.get(copyright),
                notes = u'\n\n'.join(
                    fragment
                    for fragment in (
                        dc_record.abstract,
                        dc_record.source,
                        )
                    if fragment
                    ),
                resources = resources,
                tags = [
                    dict(name = strings.slugify(subject))
                    for subject in dc_record.subjects
                    ],
#                territorial_coverage = TODO
                title = dc_record.title,
#                TODO: Use this URL once Grand Lyon is ready to use it. Before end of year.
#                url = u'http://smartdata.grandlyon.com/single/{}'.format(record_id),
                url = u'http://smartdata.grandlyon.com/',
                )

#            if gmd_record is not None:
#                for graphic_filename_xml in etree.fromstring(gmd_record.xml).xpath('./gmd:identificationInfo'
#                        '/gmd:MD_DataIdentification/gmd:graphicOverview'
#                        '/gmd:MD_BrowseGraphic[gmd:fileDescription/gco:CharacterString="large_thumbnail"]'
#                        '/gmd:fileName/gco:CharacterString',
#                        namespaces = namespaces):
#                    related.append(dict(
#                        image_url = urlparse.urljoin(base_url, unicode(graphic_filename_xml.text)),
#                        title = u'Vignette',
#                        type = u'visualization',
#                        # url = TODO,
#                        ))

            log.info(u'Harvested package: {}'.format(package['title']))
            harvester.add_package(package, harvester.supplier, dc_record.title, package['url'],
                related = related or None)

    if not args.dry_run:
        harvester.update_target()

    # Summaries of the vocabularies observed in the source catalogue.
    log.info(u'Formats: {}'.format(sorted(formats)))
    log.info(u'Licenses: {}'.format(sorted(licenses_url)))
    log.info(u'Protocols: {}'.format(sorted(protocols)))
    log.info(u'Rights: {}'.format(sorted(rights)))
    log.info(u'Temporals: {}'.format(sorted(temporals)))
    log.info(u'Types: {}'.format(sorted(types)))

    return 0