Beispiel #1
0
 def connect_to_csw(self, url):
     """
     Open a CSW connection to ``url`` and keep it on ``self.csw_i``.

     ``self.CSW_USER`` and ``self.CSW_PASSWORD`` are forwarded as
     ``username``/``password`` only when both are truthy; otherwise the
     catalogue is contacted without credentials.
     """
     # Optional credentials are collected as keyword arguments so the
     # constructor is invoked from a single place.
     auth_kwargs = {}
     if self.CSW_USER and self.CSW_PASSWORD:
         auth_kwargs["username"] = self.CSW_USER
         auth_kwargs["password"] = self.CSW_PASSWORD
     self.csw_i = csw.CatalogueServiceWeb(url, **auth_kwargs)
def test_csw_ngdc():
    """Rewritten from doctest/cws_ngdc.txt: query the CSW at SERVICE_URL."""
    catalogue = csw.CatalogueServiceWeb(SERVICE_URL, timeout=120)

    # Service identification and provider metadata.
    expected_keywords = [
        'Geophysical Metadata', 'NGDC', 'Ocean Metadata',
        'Space Weather Metadata'
    ]
    assert catalogue.identification.title == 'ArcGIS Server Geoportal Extension 10 - OGC CSW 2.0.2 ISO AP'
    assert catalogue.identification.version == '2.0.2'
    assert sorted(catalogue.identification.keywords) == expected_keywords
    assert catalogue.provider.name == 'NOAA NGDC'

    # Fetch every record whose sys.siteuuid equals the AOOS UUID.
    sos_scheme = 'urn:x-esri:specification:ServiceType:sos:url'
    aoos_site_uuid = '1706F520-2647-4A33-B7BF-592FAFDE4B45'
    site_filter = fes.PropertyIsEqualTo(
        propertyname='sys.siteuuid',
        literal="{%s}" % aoos_site_uuid)

    catalogue.getrecords2([site_filter], esn='full', maxrecords=999999)
    assert len(catalogue.records) > 40
    assert 'AOOS SOS' in catalogue.records

    sos_record = catalogue.records['AOOS SOS']
    assert sos_record.abstract == 'Alaska Ocean Observing System SOS'

    # The record is expected to list the same GetCapabilities URL three
    # times under the SOS scheme, while getService_urls returns it once.
    capabilities_url = 'http://sos.aoos.org/sos/sos/kvp?service=SOS&request=GetCapabilities&acceptVersions=1.0.0'
    sos_reference_urls = sorted(
        ref['url'] for ref in sos_record.references
        if ref['scheme'] == sos_scheme)
    assert sos_reference_urls == [
        capabilities_url, capabilities_url, capabilities_url
    ]
    assert catalogue.getService_urls(sos_scheme) == [capabilities_url]
Beispiel #3
0
    def __init__(self, url, username, password, input_csv_path):
        """
        Connect to the CSW at ``url`` and open the input CSV for reading.

        :param url: endpoint handed to ``csw.CatalogueServiceWeb``.
        :param username: user name for the CSW connection.
        :param password: password for the CSW connection.
        :param input_csv_path: path of the CSV file to read; a relative
            path is resolved against the current working directory.
        """
        self.INNER_DELIMITER = "###"
        self.csw = csw.CatalogueServiceWeb(
            url, username=username, password=password)
        self.records = {}

        # Anchor relative paths at the current working directory.
        if not os.path.isabs(input_csv_path):
            relative_to_cwd = os.path.relpath(input_csv_path, os.getcwd())
            input_csv_path = os.path.abspath(relative_to_cwd)

        # The handle is stored on the instance and is not closed here; the
        # DictReader below reads from it.
        self.csvfile = open(input_csv_path, "rU")
        self.reader = csv.DictReader(self.csvfile)
        self.fieldnames = self.reader.fieldnames

        self.namespaces = self.get_namespaces()

        # These are the column names that will trigger a change, mapped to
        # the method that handles each one.
        iso19139_handlers = {
            "NEW_title": self.NEW_title,
            "NEW_abstract": self.NEW_abstract,
        }
        self.field_handlers = {"iso19139": iso19139_handlers}

        # The XPaths to all of the elements accessible for changes.
        # Root is gmd:MD_Metadata.
        iso19139_xpaths = {
            "citation":
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation",
            "title":
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString",
            "md_data_identification":
            "gmd:identificationInfo/gmd:MD_DataIdentification",
            "abstract":
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString",
        }
        self.XPATHS = {"iso19139": iso19139_xpaths}
Beispiel #4
0
def reindex_services():
    """
    Rebuild the stored service entries from the NGDC Geoportal CSW.

    For each region in the hard-coded region map, the catalogue is queried
    for all records whose ``sys.siteuuid`` equals the region's UUID.  Every
    record reference whose scheme is one of the known service schemes
    (SOS, WMS, WCS, DAP) is looked up in ``db.Service`` by data provider
    and URL, created when missing, refreshed with the record's details,
    saved, and scheduled for harvest.

    When a record carries no metadata document reference, ``metadata_url``
    is stored as ``None`` rather than the text ``"None"``.
    """
    region_map = {
        'AOOS': '1706F520-2647-4A33-B7BF-592FAFDE4B45',
        'CARICOOS': '117F1684-A5E3-400E-98D8-A270BDBA1603',
        'CENCOOS': '4BA5624D-A61F-4C7E-BAEE-7F8BDDB8D9C4',
        'GCOOS': '003747E7-4818-43CD-937D-44D5B8E2F4E9',
        'GLOS': 'B664427E-6953-4517-A874-78DDBBD3893E',
        'MARACOOS': 'C664F631-6E53-4108-B8DD-EFADF558E408',
        'NANOOS': '254CCFC0-E408-4E13-BD62-87567E7586BB',
        'NERACOOS': 'E41F4FCD-0297-415D-AC53-967B970C3A3E',
        'PacIOOS': '68FF11D8-D66B-45EE-B33A-21919BB26421',
        'SCCOOS': 'B70B3E3C-3851-4BA9-8E9B-C9F195DCEAC7',
        'SECOORA': 'B3EA8869-B726-4E39-898A-299E53ABBC98'
    }
    #'NOS/CO-OPS':   '72E748DF-23B1-4E80-A2C4-81E70783094A',
    #'USACE':        '73019DFF-2E01-4800-91CD-0B3F812256A7',
    #'NAVY':         '3B94DAAE-B7E9-4789-993B-0045AD9149D9',
    #'NDBC':         '828981B0-0039-4360-9788-E788FA6B0875',
    #'USGS/CMGP':    'C6F11F00-C2BD-4AC6-8E2C-013E16F4932E' }

    services = {
        'SOS': 'urn:x-esri:specification:ServiceType:sos:url',
        'WMS': 'urn:x-esri:specification:ServiceType:wms:url',
        'WCS': 'urn:x-esri:specification:ServiceType:wcs:url',
        'DAP': 'urn:x-esri:specification:ServiceType:odp:url'
    }
    # Reverse lookup (reference scheme -> service type), built once so the
    # reference loop does not rescan ``services`` for every reference.
    service_type_by_scheme = dict(
        (scheme, service_type) for service_type, scheme in services.items())

    metadata_scheme = 'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document'

    endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw'  # NGDC Geoportal

    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    # Only referenced by the disabled contact lookup further down.
    ns = Namespaces()

    with app.app_context():
        for region, uuid in region_map.iteritems():
            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid',
                                                literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            data_provider = unicode(region)

            for name, record in c.records.iteritems():

                # @TODO: unfortunately CSW does not provide us with contact info, so
                # we must request it manually
                contact_email = ""

                # First metadata document reference of the record, if any.
                metadata_url = next(
                    (x['url'] for x in record.references
                     if x['scheme'] == metadata_scheme), None)

                # Don't query for contact info right now.  It takes WAY too long.
                #if metadata_url is not None:
                #    r = requests.get(metadata_url)
                #    r.raise_for_status()
                #    node = ET.fromstring(r.content)
                #    safe = nspath_eval("gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString", ns.get_namespaces())
                #    contact_node = node.find(".//" + safe)
                #    if contact_node is not None and contact_node.text != "":
                #        contact_email = contact_node.text
                #        if " or " in contact_email:
                #            contact_email = ",".join(contact_email.split(" or "))

                for ref in record.references:

                    # We are only interested in the 'services'
                    service_type = service_type_by_scheme.get(ref["scheme"])
                    if service_type is None:
                        continue

                    url = unicode(ref["url"])
                    s = db.Service.find_one({
                        'data_provider': data_provider,
                        'url': url
                    })
                    if s is None:
                        s = db.Service()
                        s.url = url
                        s.data_provider = data_provider

                    s.service_id = unicode(name)
                    s.name = unicode(record.title)
                    s.service_type = unicode(service_type)
                    s.interval = 3600  # 1 hour
                    s.tld = unicode(urlparse(url).netloc)
                    s.updated = datetime.utcnow()
                    s.contact = unicode(contact_email)
                    # unicode(None) would persist the text "None"; keep a
                    # real None when no metadata document was found.
                    if metadata_url is None:
                        s.metadata_url = None
                    else:
                        s.metadata_url = unicode(metadata_url)
                    s.save()
                    s.schedule_harvest()
Beispiel #5
0
 def __init__(self, url, username=None, password=None):
     """
     Initialise the parent with ``url`` and connect to the CSW there.

     The ``csw.CatalogueServiceWeb`` client is kept on ``self._csw`` and
     its ``response`` attribute is exposed as ``self.capabilities``.
     ``username`` and ``password`` are passed to the client as given.
     """
     super().__init__(url)
     self._csw = csw.CatalogueServiceWeb(
         url,
         username=username,
         password=password,
     )
     self.capabilities = self._csw.response
Beispiel #6
0
def CSWCatalog(url):
    """
    Return an ``owslib.csw.CatalogueServiceWeb`` client for ``url``.

    owslib is imported inside the function, so the import only happens
    when the function is called.
    """
    from owslib.csw import CatalogueServiceWeb

    return CatalogueServiceWeb(url)
Beispiel #7
0
def reindex_services(filter_regions=None, filter_service_types=None):
    """
    Synchronise stored services with the records returned by the CSW.

    For every region of ``region_map`` that passes ``filter_regions``, the
    catalogue at ``endpoint`` is queried for records whose
    ``sys.siteuuid`` equals the region's UUID.  Each record reference that
    either uses one of the schemes in ``services`` or looks like an ERDDAP
    griddap/tabledap page is matched against ``db.Service`` (by data
    provider and URL), created when missing, updated and saved.  Finally,
    every non-manual active service of the selected regions that was not
    matched as an existing service during this run is marked inactive.

    :param filter_regions: region names to process; a falsy value selects
        every key of ``region_map``.
    :param filter_service_types: service type names; a falsy value selects
        every key of ``services``.  NOTE(review): this value is normalised
        but never consulted afterwards -- confirm whether per-type
        filtering was intended before relying on it.
    :returns: summary string with the number of new, updated and
        deactivated services.
    """
    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    filter_regions = filter_regions or region_map.keys()
    filter_service_types = filter_service_types or services.keys()

    # Reverse lookup (reference scheme -> service type).  setdefault keeps
    # the first type seen for a scheme, matching a first-match search over
    # services.items().
    scheme_to_type = {}
    for type_name, scheme in services.items():
        scheme_to_type.setdefault(scheme, type_name)

    metadata_scheme = 'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document'

    with app.app_context():

        new_services = []
        update_services = []

        # get a set of all non-manual, active services for possible deactivation later
        current_services = set((s._id for s in db.Service.find(
            {
                'manual': False,
                'active': True,
                'data_provider': {
                    '$in': filter_regions
                }
            }, {'_id': True})))

        # FIXME: find a more robust mechanism for detecting ERDDAP instances
        # this would fail if behind a url rewriting/proxying mechanism which
        # remove the 'erddap' portion from the URL.  May want to have GeoPortal
        # use a separate 'scheme' dedicated to ERDDAP for CSW record
        # 'references'

        # workaround for matching ERDDAP endpoints
        # match griddap or tabledap endpoints with html or graph
        # discarding any query string parameters (i.e. some datasets on PacIOOS)
        # The trailing group is the non-capturing "(?:...)"; it used to be
        # spelled "(:?...)", which is a capturing group with an optional
        # literal colon.
        re_string = r'(^.*erddap/(?:grid|table)dap.*)\.(?:html|graph)(?:\?.*)?$'
        erddap_re = re.compile(re_string)

        for region, uuid in region_map.iteritems():

            if region not in filter_regions:
                app.logger.info("Skipping region %s due to filter", region)
                continue

            app.logger.info("Requesting region %s", region)

            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid',
                                                literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            for name, record in c.records.iteritems():
                try:
                    # @TODO: unfortunately CSW does not provide us with contact info, so
                    # we must request it manually
                    contact_email = ""
                    metadata_url = None

                    for ref in record.references:
                        try:
                            # TODO: Use a more robust mechanism for detecting
                            # ERDDAP instances aside from relying on the url
                            erddap_match = erddap_re.search(ref['url'])
                            # We are only interested in the 'services'
                            if ref["scheme"] in scheme_to_type:
                                metadata_url = next((
                                    r['url'] for r in record.references
                                    if r['scheme'] == metadata_scheme
                                ), None)
                                url = unicode(ref['url'])
                            elif erddap_match:
                                test_url = (erddap_match.group(1) +
                                            '.iso19115')
                                # Bounded wait so one unresponsive ERDDAP
                                # host cannot stall the whole reindex; a
                                # timeout is logged by the handler below.
                                req = requests.get(test_url, timeout=120)
                                # if we have a valid ERDDAP metadata endpoint,
                                # store it.
                                if req.status_code == 200:
                                    metadata_url = unicode(test_url)
                                else:
                                    app.logger.error('Invalid service URL %s',
                                                     ref['url'])
                                    continue

                                url = get_erddap_url_from_iso(req.content)
                                if url is None:
                                    app.logger.error(ref['url'])
                                    app.logger.error(
                                        "Failed to parse Erddap ISO for %s",
                                        test_url)
                                    continue  # Either not a valid ISO or there's not a valid endpoint

                            # next record if not one of the previously mentioned
                            else:
                                continue
                            # end metadata find block
                            s = db.Service.find_one({
                                'data_provider':
                                unicode(region),
                                'url':
                                url
                            })
                            if s is None:
                                s = db.Service()
                                s.url = unicode(url)
                                s.data_provider = unicode(region)
                                s.manual = False
                                s.active = True

                                new_services.append(s)
                            else:
                                # will run twice if erddap services have
                                # both .html and .graph, but resultant
                                # data should be the same
                                update_services.append(s)

                            s.service_id = unicode(name)
                            s.name = unicode(record.title)
                            # Any URL matching the ERDDAP pattern is
                            # recorded as DAP, whichever branch was taken.
                            if erddap_match:
                                s.service_type = unicode('DAP')
                            else:
                                s.service_type = unicode(
                                    scheme_to_type[ref["scheme"]])
                            s.interval = 3600  # 1 hour
                            s.tld = unicode(urlparse(url).netloc)
                            s.updated = datetime.utcnow()
                            s.contact = unicode(contact_email)
                            s.metadata_url = metadata_url

                            # grab opendap form url if present
                            if s.service_type == 'DAP':
                                possible_refs = [
                                    r['url'] for r in record.references
                                    if r['scheme'] == opendap_form_schema
                                ]
                                if possible_refs:
                                    # this is bad, it can grab any associated
                                    # record from the dataset
                                    s.extra_url = unicode(possible_refs[0])

                            # if we see the service, this is "Active", unless we've set manual (then we don't touch)
                            if not s.manual:
                                s.active = True

                            s.save()

                        except Exception as e:
                            # Best effort: one bad reference must not abort
                            # the rest of the record.
                            app.logger.warning("Could not save service: %s", e)

                except Exception as e:
                    app.logger.warning("Could not save region info: %s", e)

        # DEACTIVATE KNOWN SERVICES
        # Anything that was active before this run but was not matched as
        # an existing service above is switched off.
        updated_ids = set((s._id for s in update_services))
        deactivate = list(current_services.difference(updated_ids))

        # bulk update (using pymongo syntax)
        db.services.update({'_id': {
            '$in': deactivate
        }}, {'$set': {
            'active': False,
            'updated': datetime.utcnow()
        }},
                           multi=True,
                           upsert=False)

        return "New services: %s, updated services: %s, deactivated services: %s" % (
            len(new_services), len(update_services), len(deactivate))