def parse(self):
        elem = self.parser.xml
        ncml = {'variables': []}

        ncml['identifier'] = elem.attrib.get('location', '')
        for variable in extract_elems(elem, ['variable']):
            v = {}
            v['name'] = variable.attrib.get('name', '')
            v['attributes'] = []
            for att in extract_elems(variable, ['attribute']):
                a = {}
                for key, value in att.attrib.iteritems():
                    tag = extract_element_tag(key)
                    if tag == 'values':
                        continue
                    
                    a[tag] = value.strip()
                    
                if a:
                    v['attributes'] += [a]

            v = tidy_dict(v)
            if v:
                ncml['variables'].append(v)

        return tidy_dict(ncml)
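These snippets lean on shared helpers (extract_elems, extract_item, tidy_dict, extract_element_tag, and friends) whose definitions are not included here. As a point of reference only, a minimal sketch of two of them, inferred from how the call sites use them and therefore an assumption rather than the real implementations:

def tidy_dict(d):
    # assumed behaviour: drop keys whose values are empty ('', [], {}, None),
    # which is what the surrounding "if v:" / "if dist:" checks imply
    return {k: v for k, v in d.items() if v}


def extract_element_tag(tag):
    # assumed behaviour: strip the namespace prefix from an lxml tag,
    # e.g. '{namespace-uri}values' -> 'values'
    return tag.split('}')[-1]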
    def _parse_keywords(self, elem):
        '''
        for each descriptiveKeywords block
        in an identification block
        '''
        keywords = []

        for key_elem in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            terms = extract_items(
                key_elem,
                ['MD_Keywords', 'keyword', 'CharacterString'])
            key_type = extract_attrib(
                key_elem,
                ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue']
            )
            thesaurus = extract_item(
                key_elem,
                [
                    'MD_Keywords',
                    'thesaurusName',
                    'CI_Citation',
                    'title',
                    'CharacterString'
                ]
            )

            if terms:
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # add a generic set for the iso topic category
        isotopics = extract_items(
            elem, ['topicCategory', 'MD_TopicCategoryCode'])
        if isotopics:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": 'IsoTopicCategories',
                    "bcube:hasValue": isotopics
                })
            )

        return keywords
    def parse_item(self, elem):
        identifier = extract_item(self.elem, ['Entry_ID'])
        title = extract_item(self.elem, ['Entry_Title'])
        keywords = extract_items(self.elem, ['Keyword'])
        keywords += extract_items(self.elem, ['ISO_Topic_Category'])
        abstract = extract_item(self.elem, ['Summary'])
        organization = extract_item(self.elem, ['Originating_Center'])

        # temporal extent
        start_date = extract_item(
            self.elem, ['Temporal_Coverage', 'Start_Date'])
        end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
        temporal = [start_date, end_date] if start_date and end_date else []

        # spatial extent
        west = extract_item(
            self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
        east = extract_item(
            self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
        south = extract_item(
            self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
        north = extract_item(
            self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
        bbox = [west, south, east, north] if \
            west and east and north and south else []
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)

        distributions = []
        for related_url in extract_elems(self.elem, ['Related_URL']):
            url = extract_item(related_url, ['URL'])
            content_type = extract_item(
                related_url, ['URL_Content_Type', 'Type'])
            description = extract_item(related_url, ['Description'])
            dist = tidy_dict({
                "url": url,
                "description": description,
                "content_type": content_type
            })
            if dist:
                distributions.append(dist)

        return tidy_dict({
            "id": identifier,
            "title": title,
            "keywords": keywords,
            "abstract": abstract,
            "organization": organization,
            "bbox": bbox,
            "temporal": temporal,
            "distributions": distributions
        })
    def parse_item(self, elem):
        identifier = extract_item(self.elem, ['Entry_ID'])
        title = extract_item(self.elem, ['Entry_Title'])
        keywords = extract_items(self.elem, ['Keyword'])
        keywords += extract_items(self.elem, ['ISO_Topic_Category'])
        abstract = extract_item(self.elem, ['Summary'])
        organization = extract_item(self.elem, ['Originating_Center'])

        # temporal extent
        start_date = extract_item(self.elem,
                                  ['Temporal_Coverage', 'Start_Date'])
        end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
        temporal = [start_date, end_date] if start_date and end_date else []

        # spatial extent
        west = extract_item(self.elem,
                            ['Spatial_Coverage', 'Westernmost_Longitude'])
        east = extract_item(self.elem,
                            ['Spatial_Coverage', 'Easternmost_Longitude'])
        south = extract_item(self.elem,
                             ['Spatial_Coverage', 'Southernmost_Latitude'])
        north = extract_item(self.elem,
                             ['Spatial_Coverage', 'Northernmost_Latitude'])
        bbox = [west, south, east, north] if \
            west and east and north and south else []
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)

        distributions = []
        for related_url in extract_elems(self.elem, ['Related_URL']):
            url = extract_item(related_url, ['URL'])
            content_type = extract_item(related_url,
                                        ['URL_Content_Type', 'Type'])
            description = extract_item(related_url, ['Description'])
            dist = tidy_dict({
                "url": url,
                "description": description,
                "content_type": content_type
            })
            if dist:
                distributions.append(dist)

        return tidy_dict({
            "id": identifier,
            "title": title,
            "keywords": keywords,
            "abstract": abstract,
            "organization": organization,
            "bbox": bbox,
            "temporal": temporal,
            "distributions": distributions
        })
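Neither bbox_to_geom nor to_wkt is shown in these examples; the DIF parsers above hand them a [west, south, east, north] list of strings and expect WKT back. A stand-in under that assumption (using shapely, which may not be what the original helpers use):

from shapely.geometry import box


def bbox_to_wkt(bbox):
    # bbox is assumed to be [west, south, east, north] as strings;
    # return an empty string when the extent is incomplete
    if not bbox or len(bbox) != 4:
        return ''
    west, south, east, north = (float(v) for v in bbox)
    return box(west, south, east, north).wkt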
    def _parse_child(self, child):
        entry = {}

        entry["title"] = extract_item(child, ["title"])
        entry["id"] = extract_item(child, ["id"])
        entry["creator"] = extract_item(child, ["creator"])
        entry["author"] = extract_item(child, ["author", "name"])
        entry["date"] = extract_item(child, ["date"])
        entry["updated"] = extract_item(child, ["updated"])
        entry["published"] = extract_item(child, ["published"])

        entry["subjects"] = [e.attrib.get("term", "") for e in extract_elems(child, ["category"])]

        entry["contents"] = []
        contents = extract_elems(child, ["content"])
        for content in contents:
            text = content.text.strip() if content.text else ""
            content_type = content.attrib.get("type", "")
            entry["contents"].append({"content": text, "type": content_type})

        entry["links"] = []
        links = extract_elems(child, ["link"])
        for link in links:
            href = link.attrib.get("href", "")
            rel = link.attrib.get("rel", "")
            entry["links"].append({"href": href, "rel": rel})

        return tidy_dict(entry)
    def _extract_params(self, endpoint):
        def _extract_prefix(param):
            pattern = "\{{0,1}(\S*):([\S][^}]*)"

            # TODO: this is probably a bad assumption (that there's just the
            #   one item in the list, not that urlparse returns the terms as a list)
            if isinstance(param, list):
                param = param[0]

            if ":" not in param:
                return ("", param)

            m = re.search(pattern, param)
            return m.groups()

        _parameter_formats = {
            "geo:box": "west, south, east, north",
            "time:start": "YYYY-MM-DDTHH:mm:ssZ",
            "time:stop": "YYYY-MM-DDTHH:mm:ssZ",
        }
        url = endpoint.get("template", "")
        query_params = parse_url(url)

        # deal with the namespaced parameters as [query param key, prefix, type]
        query_params = [[k] + list(_extract_prefix(v)) for k, v in query_params.iteritems()]

        return [
            tidy_dict(
                {"name": qp[0], "prefix": qp[1], "type": qp[2], "format": _parameter_formats.get(":".join(qp[1:]))}
            )
            for qp in query_params
        ]
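To make the _extract_prefix regex concrete, here is an illustrative run against hypothetical OpenSearch template values (not taken from a real endpoint); note that the pattern keeps the trailing '?' of an optional template parameter in the second group:

import re

pattern = r"\{{0,1}(\S*):([\S][^}]*)"  # same pattern as above, as a raw string

print(re.search(pattern, "{geo:box?}").groups())     # ('geo', 'box?')
print(re.search(pattern, "{time:start?}").groups())  # ('time', 'start?')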
    def _parse_contact(self, elem):
        '''
        parse any CI_Contact
        '''
        contact = {}

        if elem is None:
            return contact

        contact['phone'] = extract_item(
            elem, ['phone', 'CI_Telephone', 'voice', 'CharacterString'])
        contact['addresses'] = extract_items(
            elem,
            ['address', 'CI_Address', 'deliveryPoint', 'CharacterString'])
        contact['city'] = extract_item(
            elem, ['address', 'CI_Address', 'city', 'CharacterString'])
        contact['state'] = extract_item(
            elem,
            ['address', 'CI_Address', 'administrativeArea', 'CharacterString'])
        contact['postal'] = extract_item(
            elem, ['address', 'CI_Address', 'postalCode', 'CharacterString'])
        contact['country'] = extract_item(
            elem, ['address', 'CI_Address', 'country', 'CharacterString'])
        contact['email'] = extract_item(elem, [
            'address', 'CI_Address', 'electronicMailAddress', 'CharacterString'
        ])
        return tidy_dict(contact)
    def _parse_service(self):
        output = {}

        service = {
            "object_id": generate_uuid_urn(),
            "dcterms:title": ' '.join(extract_items(
                self.parser.xml, ["Identify", "repositoryName"])),
            "rdf:type": "OAI-PMH",
            "relationships": [],
            "urls": []
        }
        url_id = generate_uuid_urn()
        dist = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": url_id,
            "dc:identifier": generate_sha_urn(self.url)
        })
        service['urls'] = [dist]
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_id
        })

        # output['version'] = extract_items(self.parser.xml, ["Identify", "protocolVersion"])
        # output['endpoints'] = [{'url': e} for e
        #                        in extract_items(self.parser.xml, ["Identify", "baseURL"])]

        output['services'] = [service]
        return tidy_dict(output)
    def parse_item(self):
        '''
        parse just the dc element (e.g. oai_dc:dc); if you're pulling this
        from an OAI-PMH response or similar, make sure it is *not* the
        full document
        '''
        # TODO: this is not correct for the overall thing
        if self.elem is None:
            return {}

        title = extract_item(self.elem, ['title'])
        creator = extract_item(self.elem, ['creator'])
        subjects = extract_items(self.elem, ['subject'])
        description = extract_item(self.elem, ['description'])
        date = extract_item(self.elem, ['date'])
        language = extract_item(self.elem, ['language'])
        publisher = extract_item(self.elem, ['publisher'])
        sources = extract_items(self.elem, ['source'])
        types = extract_items(self.elem, ['type'])

        return tidy_dict({
            'title': title,
            'creator': creator,
            'subjects': subjects,
            'abstract': description,
            'language': language,
            'date': date,
            'publisher': publisher,
            'types': types,
            'sources': sources
        })
    def _parse_item(self, elem):
        entry = {}

        entry['title'] = extract_item(elem, ['title'])
        entry['id'] = extract_item(elem, ['id'])
        entry['creator'] = extract_item(elem, ['creator'])
        entry['author'] = extract_item(elem, ['author', 'name'])
        entry['date'] = extract_item(elem, ['date'])
        entry['updated'] = extract_item(elem, ['updated'])
        entry['published'] = extract_item(elem, ['published'])

        entry['subjects'] = [e.attrib.get('term', '') for e in extract_elems(elem, ['category'])]

        entry['contents'] = []
        contents = extract_elems(elem, ['content'])
        for content in contents:
            text = content.text.strip() if content.text else ''
            content_type = content.attrib.get('type', '')
            entry['contents'].append({'content': text, 'type': content_type})

        entry['links'] = []
        links = extract_elems(elem, ['link'])
        for link in links:
            href = link.attrib.get('href', '')
            rel = link.attrib.get('rel', '')
            entry['links'].append({'href': href, 'rel': rel})

        return tidy_dict(entry)
    def _parse_contact(self, elem):
        '''
        parse any CI_Contact
        '''
        contact = {}

        if elem is None:
            return contact

        contact['phone'] = extract_item(
            elem, ['phone', 'CI_Telephone', 'voice', 'CharacterString'])
        contact['addresses'] = extract_items(
            elem,
            ['address', 'CI_Address', 'deliveryPoint', 'CharacterString'])
        contact['city'] = extract_item(
            elem, ['address', 'CI_Address', 'city', 'CharacterString'])
        contact['state'] = extract_item(
            elem,
            ['address', 'CI_Address', 'administrativeArea', 'CharacterString'])
        contact['postal'] = extract_item(
            elem, ['address', 'CI_Address', 'postalCode', 'CharacterString'])
        contact['country'] = extract_item(
            elem, ['address', 'CI_Address', 'country', 'CharacterString'])
        contact['email'] = extract_item(
            elem,
            ['address', 'CI_Address', 'electronicMailAddress', 'CharacterString'])
        return tidy_dict(contact)
    def _extract_params(self, endpoint):
        def _extract_prefix(param):
            pattern = '\{{0,1}(\S*):([\S][^}]*)'

            # TODO: this is probably a bad assumption (that there's just the
            #   one item in the list, not that urlparse returns the terms as a list)
            if isinstance(param, list):
                param = param[0]

            if ':' not in param:
                return ('', param)

            m = re.search(pattern, param)
            return m.groups()

        _parameter_formats = {
            "geo:box": "west, south, east, north",
            "time:start": "YYYY-MM-DDTHH:mm:ssZ",
            "time:stop": "YYYY-MM-DDTHH:mm:ssZ"
        }
        url = endpoint.get('template', '')
        query_params = parse_url(url)

        # deal with the namespaced parameters as [query param key, prefix, type]
        query_params = [[k] + list(_extract_prefix(v))
                        for k, v in query_params.iteritems()]

        return [
            tidy_dict({
                "name": qp[0],
                "prefix": qp[1],
                "type": qp[2],
                "format": _parameter_formats.get(':'.join(qp[1:]))
            }) for qp in query_params
        ]
    def _parse_item(self, elem):
        entry = {}

        entry['title'] = extract_item(elem, ['title'])
        entry['id'] = extract_item(elem, ['id'])
        entry['creator'] = extract_item(elem, ['creator'])
        entry['author'] = extract_item(elem, ['author', 'name'])
        entry['date'] = extract_item(elem, ['date'])
        entry['updated'] = extract_item(elem, ['updated'])
        entry['published'] = extract_item(elem, ['published'])

        entry['subjects'] = [
            e.attrib.get('term', '')
            for e in extract_elems(elem, ['category'])
        ]

        entry['contents'] = []
        contents = extract_elems(elem, ['content'])
        for content in contents:
            text = content.text.strip() if content.text else ''
            content_type = content.attrib.get('type', '')
            entry['contents'].append({'content': text, 'type': content_type})

        entry['links'] = []
        links = extract_elems(elem, ['link'])
        for link in links:
            href = link.attrib.get('href', '')
            rel = link.attrib.get('rel', '')
            entry['links'].append({'href': href, 'rel': rel})

        return tidy_dict(entry)
    def parse(self):
        if self.elem is None:
            self.description = self.output
            return

        self.description = parse_identification_info(self.elem)

        self.description['operations'] = self._handle_operations()

        self.description = tidy_dict(self.description)
    def _parse_keywords(self, elem):
        '''
        for each descriptiveKeywords block
        in an identification block
        '''
        keywords = []

        for key_elem in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            terms = extract_items(
                key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
            key_type = extract_attrib(key_elem, [
                'MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'
            ])
            thesaurus = extract_item(key_elem, [
                'MD_Keywords', 'thesaurusName', 'CI_Citation', 'title',
                'CharacterString'
            ])

            if terms:
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    }))

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # add a generic set for the iso topic category
        isotopics = extract_items(elem,
                                  ['topicCategory', 'MD_TopicCategoryCode'])
        if isotopics:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": 'IsoTopicCategories',
                    "bcube:hasValue": isotopics
                }))

        return keywords
    def _parse_endpoint(self, elem):
        endpoint = {}
        endpoint['mimetype'] = elem.attrib.get('type', '')
        endpoint['template'] = elem.attrib.get('template', '')
        endpoint['parameters'] = self._extract_params(elem)
        endpoint['actionable'] = 'NOPE'
        # endpoint['url'] = self._generate_url(
        #   endpoint['mimetype'], endpoint['template'])

        osl = OpenSearchLink(elem)
        endpoint['url'] = osl.url

        return tidy_dict(endpoint)
    def parse(self):
        self.description = {}
        if 'parent_url' in self.harvest_details:
            self.description['childOf'] = self.harvest_details['parent_url']

        if 'service' in self.identify:
            self.description = self._parse_service()

        if 'resultset' in self.identify:
            self.description['children'] = self._parse_children(
                self.identify['resultset'].get('dialect', ''))

        self.description = tidy_dict(self.description)
    def _parse_endpoint(self, elem):
        endpoint = {}
        endpoint["mimetype"] = elem.attrib.get("type", "")
        endpoint["template"] = elem.attrib.get("template", "")
        endpoint["parameters"] = self._extract_params(elem)
        endpoint["actionable"] = "NOPE"
        # endpoint['url'] = self._generate_url(
        #   endpoint['mimetype'], endpoint['template'])

        osl = OpenSearchLink(elem)
        endpoint["url"] = osl.url

        return tidy_dict(endpoint)
    def parse(self):
        output = {}
        urls = set()

        if 'service' in self.identify:
            service = {
                "object_id": generate_uuid_urn(),
                "dcterms:title": extract_attrib(self.parser.xml, ['@name']),
                "rdf:type": "UNIDATA:THREDDS {0}".format(
                    extract_attrib(self.parser.xml, ['@version'])),
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', ''),
                "relationships": [],
                "urls": []
            }
            url_sha = generate_sha_urn(self.url)
            urls.add(url_sha)
            original_url = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_sha
            })
            service['urls'].append(original_url)
            # NOTE: this is not the sha from the url
            service['relationships'].append(
                {
                    "relate": "bcube:originatedFrom",
                    "object_id": url_sha
                }
            )

        # deal with the "dataset"
        service_bases = self.parser.xml.xpath(
            '//*[local-name()="service" and @base != ""]'
        )
        self.service_bases = {
            s.attrib.get('name'): s.attrib.get('base') for s in service_bases
        }

        # if 'dataset' in self.identify:
        #     # TODO: this is really not right but it is not
        #     # a proper web service so meh
        #     datasets = self._parse_datasets()

        # # if 'metadata' in self.identify:
        # #     self.description['metadata'] = self._parse_metadata()
        output['services'] = [service]
        self.description = tidy_dict(output)
    def parse(self):
        self.description = {}
        self._parse_results_set_info()
        self.description['total'] = self.total
        self.description['subtotal'] = self.subtotal
        self.description['schema'] = self.schema

        if self.parent_url:
            # TODO: consider making this a sha
            self.description['childOf'] = self.parent_url

        if 'resultset' in self.identify:
            self.description['children'] = self._parse_children(self.schema)

        self.description = tidy_dict(self.description)
 def _generate_harvest_manifest(self, **kwargs):
     harvest = {
         "vcard:hasURL": self.url,
         "bcube:atTime": self.harvest_details.get('harvest_date'),
         "bcube:HTTPStatusCodeValue": 200,
         "http:reasonPhrase": "OK",
         "bcube:HTTPStatusFamilyCode": 200,
         "bcube:HTTPStatusFamilyType": "Success message",
         "bcube:hasUrlSource": "",
         "bcube:hasConfidence": "",
         "bcube:validatedOn": self.harvest_details.get('harvest_date'),
         "dc:identifier": generate_sha_urn(self.url)
     }
     harvest.update(kwargs)
     return tidy_dict(harvest)
    def parse(self):
        self.description = {}

        if "parent_url" in self.harvest_details:
            # TODO: consider making this a sha
            self.description["childOf"] = self.harvest_details["parent_url"]

        if "service" in self.identify:
            self.description = self._parse_service()

        if "resultset" in self.identify:
            # TODO: get the root stats
            self.description["children"] = self._parse_children(self.identify["resultset"].get("dialect", ""))

        self.description = tidy_dict(self.description)
 def _generate_harvest_manifest(self, **kwargs):
     # NOTE: for iso, you have to include the dc:identifier sha256
     #       in the kwargs
     harvest = {
         "vcard:hasURL": "",
         "bcube:atTime": self.harvest_details.get('harvest_date'),
         "bcube:HTTPStatusCodeValue": 200,
         "http:reasonPhrase": "OK",
         "bcube:HTTPStatusFamilyCode": 200,
         "bcube:HTTPStatusFamilyType": "Success message",
         "bcube:hasUrlSource": "",
         "bcube:hasConfidence": "",
         "bcube:validatedOn": self.harvest_details.get('harvest_date')
     }
     harvest.update(kwargs)
     return tidy_dict(harvest)
    def parse_service(self):
        '''
        main service parsing method: pull all defined elements,
            pull anything else text/attribute related

        returns:
            dict {service: 'anything ontology-driven'}
        '''
        service = {
            "service": self.return_service_descriptors(),
            "dataset": self.return_dataset_descriptors(),
            "metadata": self.return_metadata_descriptors()
        }
        self.service = tidy_dict(service)

        return self.service
    def parse(self):
        self.description = {}

        if "parent_url" in self.harvest_details:
            # TODO: consider making this a sha
            self.description['childOf'] = self.harvest_details['parent_url']

        if 'service' in self.identify:
            self.description = self._parse_service()

        if 'resultset' in self.identify:
            # TODO: get the root stats
            self.description['children'] = self._parse_children(
                self.identify['resultset'].get('dialect', ''))

        self.description = tidy_dict(self.description)
    def _parse_item(self, elem):
        item = {}
        item['title'] = extract_item(elem, ['title'])
        item['language'] = extract_item(elem, ['language'])
        item['author'] = extract_item(elem, ['author'])
        # TODO: go sort out what this is: http://purl.org/rss/1.0/modules/content/
        item['encoded'] = extract_item(elem, ['encoded'])
        item['id'] = extract_item(elem, ['guid'])
        item['creator'] = extract_item(elem, ['creator'])

        item['subjects'] = extract_items(elem, ['category'])
        item['published'] = extract_item(elem, ['pubDate'])
        item['timestamp'] = extract_item(elem, ['date'])

        item['links'] = extract_items(elem, ['link'])
        item['links'] += extract_items(elem, ['docs'])

        return tidy_dict(item)
    def _parse_child(self, child):
        item = {}
        item["title"] = extract_item(child, ["title"])
        item["language"] = extract_item(child, ["language"])
        item["author"] = extract_item(child, ["author"])
        # TODO: go sort out what this is: http://purl.org/rss/1.0/modules/content/
        item["encoded"] = extract_item(child, ["encoded"])
        item["id"] = extract_item(child, ["guid"])
        item["creator"] = extract_item(child, ["creator"])

        item["subjects"] = extract_items(child, ["category"])
        item["published"] = extract_item(child, ["pubDate"])
        item["timestamp"] = extract_item(child, ["date"])

        item["links"] = extract_items(child, ["link"])
        item["links"] += extract_items(child, ["docs"])

        return tidy_dict(item)
    def _parse_responsibleparty(self, elem):
        '''
        parse any CI_ResponsibleParty
        '''
        individual_name = extract_item(elem,
                                       ['individualName', 'CharacterString'])
        organization_name = extract_item(
            elem, ['organisationName', 'CharacterString'])
        position_name = extract_item(elem, ['positionName', 'CharacterString'])

        e = extract_elem(elem, ['contactInfo', 'CI_Contact'])
        contact = self._parse_contact(e)

        return tidy_dict({
            "individual": individual_name,
            "organization": organization_name,
            "position": position_name,
            "contact": contact
        })
    def _parse_responsibleparty(self, elem):
        '''
        parse any CI_ResponsibleParty
        '''
        individual_name = extract_item(
            elem, ['individualName', 'CharacterString'])
        organization_name = extract_item(
            elem, ['organisationName', 'CharacterString'])
        position_name = extract_item(
            elem, ['positionName', 'CharacterString'])

        e = extract_elem(elem, ['contactInfo', 'CI_Contact'])
        contact = self._parse_contact(e)

        return tidy_dict({
            "individual": individual_name,
            "organization": organization_name,
            "position": position_name,
            "contact": contact
        })
    def parse(self):
        # get the series
        self.description = {}
        md = extract_elem(self.elem, ['seriesMetadata', 'MD_Metadata'])
        if md is None:
            return

        md_parser = MxParser(md)
        md_parser.parse()
        self.description = md_parser.description
        self.description['children'] = []

        # get the children
        children = extract_elems(
            self.elem, ['composedOf', 'DS_DataSet', 'has', 'MD_Metadata'])
        for child in children:
            child_parser = MxParser(child)
            child_parser.parse()
            if child_parser.description:
                self.description['children'].append(child_parser.description)

        self.description = tidy_dict(self.description)
    def _parse_identification_info(self, elem):
        # ignoring the larger get all the identifiers above
        # in favor of, hopefully, getting a better dataset id
        dataset_identifier = extract_item(elem, [
            'citation',
            'CI_Citation',
            'identifier',
            'MD_Identifier',
            'code',
            'CharacterString'
        ])

        dataset = {
            "object_id": generate_uuid_urn(),
            "dc:identifier": dataset_identifier,
            "dc:description": extract_item(
                elem, ['abstract', 'CharacterString']),
            "dcterms:title": extract_item(elem, [
                'citation', 'CI_Citation', 'title', 'CharacterString']),
            "relationships": []
        }

        # TODO: i think the rights blob is not in the ontology prototypes
        # the rights information from MD_Constraints or MD_LegalConstraints
        # rights = extract_item(elem, ['resourceConstraints', '*',
        #   'useLimitation', 'CharacterString'])

        # deal with the extent
        extents = self._parse_extent(elem)
        dataset.update(extents)

        keywords = self._parse_keywords(elem)
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })
        return tidy_dict(dataset), keywords
    def _parse_identification_info(self, elem):
        # ignoring the larger get all the identifiers above
        # in favor of, hopefully, getting a better dataset id
        dataset_identifier = extract_item(elem, [
            'citation', 'CI_Citation', 'identifier', 'MD_Identifier', 'code',
            'CharacterString'
        ])

        dataset = {
            "object_id": generate_uuid_urn(),
            "dc:identifier": dataset_identifier,
            "dc:description": extract_item(
                elem, ['abstract', 'CharacterString']),
            "dcterms:title": extract_item(
                elem, ['citation', 'CI_Citation', 'title', 'CharacterString']),
            "relationships": []
        }

        # TODO: i think the rights blob is not in the ontology prototypes
        # the rights information from MD_Constraints or MD_LegalConstraints
        # rights = extract_item(elem, ['resourceConstraints', '*',
        #   'useLimitation', 'CharacterString'])

        # deal with the extent
        extents = self._parse_extent(elem)
        dataset.update(extents)

        keywords = self._parse_keywords(elem)
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })
        return tidy_dict(dataset), keywords
    def _parse_service(self):
        output = {}

        service = {
            "object_id": generate_uuid_urn(),
            "dcterms:title": ' '.join(extract_items(
                self.parser.xml, ["Identify", "repositoryName"])),
            "rdf:type": "OAI-PMH",
            "relationships": [],
            "urls": []
        }
        url_id = generate_uuid_urn()
        dist = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_id,
                "dc:identifier": generate_sha_urn(self.url)
            })
        service['urls'] = [dist]
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_id
        })

        # output['version'] = extract_items(self.parser.xml, ["Identify", "protocolVersion"])
        # output['endpoints'] = [{'url': e} for e
        #                        in extract_items(self.parser.xml, ["Identify", "baseURL"])]

        output['services'] = [service]
        return tidy_dict(output)
    def identify(self):
        '''
        a response is considered within a protocol if *any* set of filters matches
        '''
        def _test_option(filters):
            '''where filters is the set of filters as booleans'''
            for i, j in filters.iteritems():
                if self._evaluate({i: self._filter(i, j, [])}, 0):
                    return True

            return False

        def _extract_option(filters):
            '''
            where filters is the set of things to return a value
            this assumes that you have concatenated the defaults and/or checks set
            '''
            items = []
            for check in filters:
                for c in check[1]:
                    item = ''
                    if c['type'] == 'simple':
                        # TODO: this is still not a safe assumption re: casing
                        filter_value = c['value'].upper()
                        filter_object = self.source_content if c['object'] == 'content' \
                            else self.source_url
                        filter_object = filter_object.upper()

                        if filter_value in filter_object:
                            item = [c.get('text', '')]  # just for the xpath handling later
                    elif c['type'] == 'xpath':
                        if self.parser.xml is None:
                            print 'Parser FAIL'
                            continue

                        try:
                            values = self.parser.xml.xpath(c['value'])
                            values = values if isinstance(values, list) else [values]
                            item = [' '.join(v.strip().split()) for v in values if v is not None]
                        except Exception as ex:
                            print 'XPATH FAIL: ', ex
                            continue

                    if item:
                        items += item

            return items

        def _chain(source_dict, keys):
            try:
                return list(chain.from_iterable(
                    [source_dict.get(key, {}).items() for key in keys]
                ))
            except:
                print source_dict
                return []

        matches = []
        for protocol in self.yaml:
            protocol_name = protocol['name']
            # print protocol_name

            for k, v in protocol.iteritems():
                if k in ['name'] or v is None:
                    continue

                for option in v:
                    if 'filters' not in option or option['filters'] is None:
                        continue

                    is_match = _test_option(option['filters'])

                    # check the error filters
                    errors = option.get('errors', {})
                    is_error = _test_option(errors.get('filters', {})) if errors else False

                    # check the language filters
                    language_filters = option.get('language', {})
                    _filters = _chain(language_filters, ["defaults", "checks"])
                    languages = _extract_option(_filters)

                    # check the version filters
                    version_filters = option.get('versions', {})
                    _filters = _chain(version_filters, ["defaults", "checks"])
                    versions = _extract_option(_filters)

                    # and the dialect if there's a key
                    dialect_filters = option.get('dialect', {})
                    if dialect_filters:
                        if 'text' in dialect_filters:
                            dialect = dialect_filters.get('text')
                        else:
                            # it's in the response somewhere
                            _filters = _chain(dialect_filters, ["defaults", "checks"])
                            dialect = _extract_option(_filters)
                    else:
                        dialect = []

                    # dump it out
                    if is_match:
                        matches.append({
                            "protocol": protocol_name,
                            k: tidy_dict({
                                "name": option.get('name', ''),
                                "request": option.get('request', ''),
                                "dialect": dialect,
                                "version": versions,
                                "error": is_error,
                                "language": languages
                            })
                        })

        return matches
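The structure of self.yaml is not included in these examples. Purely as a hypothetical illustration of the shapes identify() expects (defaults/checks sub-dicts whose values are lists of simple or xpath checks) and of what the nested _chain helper does with them:

from itertools import chain


def _chain(source_dict, keys):
    # flatten the (key, value) pairs of several sub-dicts into one list
    return list(chain.from_iterable(
        [source_dict.get(key, {}).items() for key in keys]
    ))


# hypothetical version filters; the real YAML schema may differ
version_filters = {
    "defaults": {"static": [{"type": "simple", "object": "content",
                             "value": "1.1", "text": "1.1"}]},
    "checks": {"found": [{"type": "xpath",
                          "value": "//*[local-name()='version']/text()"}]},
}
print(_chain(version_filters, ["defaults", "checks"]))
# -> [('static', [...]), ('found', [...])]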
    def parse_item(self):
        output = {}

        urls = set()

        catalog_object_id = generate_uuid_urn()

        output['catalog_record'] = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # a sha is not strictly necessary here; it is only used for set inclusion
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn(),
            "dc:identifier": url_sha
        })
        output['catalog_record']['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        output['catalog_record']['relationships'].append(
            {
                "relate": "bcube:originatedFrom",
                "object_id": original_url['object_id']
            }
        )

        datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
        dataset_object_id = generate_uuid_urn()

        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": datsetid,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            "relationships": []
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # that's not even valid
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])
            bbox = [west, south, east, north]
            bbox = bbox_to_geom(bbox)
            bbox = to_wkt(bbox)

            dataset.update({
                "dc:spatial": bbox,
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates

        dataset['relationships'] = [
            {
                "relate": "bcube:hasMetadataRecord",
                "object_id": catalog_object_id
            }
        ]

        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(
            self.elem, ['distinfo', 'stdorder', 'digform'])

        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": link,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
                dataset['urls'].append(dist)
                # this is a distribution link so
                # we are assuming it is to data
                dataset['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": url_id
                })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": link,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
                dataset['urls'].append(dist)
                webpages.append({
                    "object_id": generate_uuid_urn(),
                    "relationships": [
                        {
                            "relate": "dcterms:references",
                            "object_id": url_id
                        }
                    ]}
                )

        output['catalog_record']['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc), a set may not have a thesaurus name
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append(
                {
                    "relate": "dc:conformsTo",
                    "object_id": keyword['object_id']
                }
            )

        output['datasets'] = [dataset]

        # add the metadata relate
        output['catalog_record']['relationships'].append(
            {
                "relate": "foaf:primaryTopic",
                "object_id": dataset_object_id
            }
        )

        output['catalog_records'] = [output['catalog_record']]
        del output['catalog_record']
        self.description = tidy_dict(output)
    def _parse_service(self):
        output = {}
        urls = set()

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "rdf:type": 'OpenSearch1.1:Description',
            "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
            "dc:description": ' '.join(
                extract_items(self.parser.xml, ["LongName"]) +
                extract_items(self.parser.xml, ["Description"])),
            "urls": [],
            "webpages": [],
            "relationships": []
        }
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn(),
                "dc:identifier": url_sha
            })
        service['urls'].append(original_url)
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        # output['source'] = extract_items(
        #   self.parser.xml, ["Attribution"])
        # output['contact'] = extract_items(
        #     self.parser.xml, ["Developer"])
        # output['rights'] = extract_items(
        #   self.parser.xml, ["SyndicationRight"])

        key_id = generate_uuid_urn()
        output['keywords'] = [{
            "object_id": key_id,
            "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
        }]
        service['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": key_id
        })

        for t in extract_elems(self.parser.xml, ['Url']):
            ep = self._parse_endpoint(t)
            url_sha = generate_sha_urn(ep['url'])
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Generated",
                        "bcube:hasConfidence": "Not Sure",
                        "vcard:hasURL": ep['url'],
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                service['urls'].append(dist)
                wb_id = generate_uuid_urn()
                service['webpages'].append({
                    "object_id": wb_id,
                    "relationships": [{
                        "relate": "dcterms:references",
                        "object_id": url_id
                    }]
                })
                service['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": wb_id
                })

        output['services'] = [service]

        return tidy_dict(output)
 def parse(self):
     output = {}
     output['items'] = [
         child for child in self.parse_children(tags=['//*', 'item'])
     ]
     return tidy_dict(output)
 def parse(self):
     output = {}
     output["items"] = [child for child in self.parse_children(tags=["//*", "item"])]
     return tidy_dict(output)
    def parse(self):
        # for ogc, a catalog record is the getcapabilities rsp
        output = {
            "layers": [],
            "catalog_records": []
        }

        urls = set()

        if not self.reader:
            self.description = {}
            return

        if 'service' in self.identify:
            service_id = generate_uuid_urn()
            service = {
                "object_id": service_id,
                "bcube:dateCreated": self.harvest_details.get(
                    'harvest_date', ''),
                "bcube:lastUpdated": self.harvest_details.get(
                    'harvest_date', ''),
                "relationships": [],
                "urls": [],
                "rdf:type": self.urn
            }

            url_sha = generate_sha_urn(self.url)
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            service['urls'].append(
                self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": self.url,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
            )
            service['relationships'].append({
                "relate": "bcube:originatedFrom",
                "object_id": url_id
            })

            # self._get_service_config(service_name, version)
            service_reader = self._parse_service(
                self.reader, self.service_name, self.version)

            # map to triples
            service.update({
                "dc:description": service_reader.get('abstract')
            })

            keywords = service_reader.get('subject', [])
            if keywords:
                output['keywords'] = [{
                    "object_id": generate_uuid_urn(),
                    "bcube:hasValue": keywords
                }]
                for k in output['keywords']:
                    service['relationships'].append(
                        {
                            "relate": "dc:conformsTo",
                            "object_id": k['object_id']
                        }
                    )
            if self.identify['service'].get('request', '') == 'GetCapabilities':
                # this is also awkward. meh. needs must.
                layers = []
                listed_layers = self._parse_getcap_datasets(self.reader)

                for ld in listed_layers:
                    layer = {
                        "object_id": generate_uuid_urn(),
                        "bcube:dateCreated":
                            self.harvest_details.get('harvest_date', ''),
                        "bcube:lastUpdated":
                            self.harvest_details.get('harvest_date', ''),
                        "dc:description": ld.get('abstract', ''),
                        "dc:title": ld.get('title', ''),
                        "relationships": []
                    }
                    service['relationships'].append({
                        "relate": "bcube:contains",
                        "object_id": layer['object_id']
                    })

                    # add the generated url for the service
                    generated_url = self._generate_url(
                        self.url,
                        ld.get('name'),
                        ld.get('bbox'),
                        self.service_name,
                        self.version
                    )
                    if generated_url:
                        url_sha = generate_sha_urn(generated_url)
                        if url_sha not in urls:
                            urls.add(url_sha)
                            url_id = generate_uuid_urn()
                            layer_url = self._generate_harvest_manifest(**{
                                "vcard:hasURL": generated_url,
                                "bcube:hasUrlSource": "Generated",
                                "bcube:hasConfidence": "Not Sure",
                                "object_id": url_id,
                                "dc:identifier": url_sha
                            })
                            service['urls'].append(layer_url)
                        # don't add to the larger set, but do
                        # include the reference within the layer
                        layer['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

                    # add each as a dataset with just a url for now
                    for mu in ld.get('metadata_urls', []):
                        url_link = generate_uuid_urn()
                        url_sha = generate_sha_urn(mu.get('url'))

                        if url_sha not in urls:
                            urls.add(url_sha)
                            url_id = generate_uuid_urn()
                            output['catalog_records'] += [
                                {
                                    "object_id": url_link,
                                    "urls": [
                                        self._generate_harvest_manifest(**{
                                            "vcard:hasURL": mu.get('url'),
                                            "bcube:hasUrlSource": "Harvested",
                                            "bcube:hasConfidence": "Good",
                                            "object_id": url_id,
                                            "dc:identifier": url_sha
                                        })
                                    ],
                                    "relationships": [
                                        {
                                            "relate": "dc:describes",
                                            "object_id": layer['object_id']
                                        },
                                        {
                                            "relate": "bcube:originatedFrom",
                                            "object_id": url_id
                                        }
                                    ]
                                }
                            ]

                    if 'temporal_extent' in ld:
                        temporal = tidy_dict(
                            {
                                "esip:startDate":
                                    ld['temporal_extent'].get('begin', ''),
                                "esip:endDate":
                                    ld['temporal_extent'].get('end', '')
                            }
                        )
                        if temporal:
                            layer.update(temporal)

                    if 'bbox' in ld:
                        layer.update(ld['bbox'])

                    layers.append(layer)

                service['layers'] = layers
                # if layers:
                #     service['layers'] = layers
                #     for layer in layers:
                #         service['relationships'].append({
                #             "relate": "bcube:contains",
                #             "object_id": layer['object_id']
                #         })

            output['services'] = [service]

        self.description = tidy_dict(output)
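Every branch of the method above repeats the same de-duplication pattern: hash the URL, skip it if the hash is already in the local set, otherwise mint an object id and emit a harvest manifest. A small illustrative helper (hypothetical, not part of the original class) that captures that pattern under the same assumed helper functions:

def add_url_once(urls, url, source, confidence, manifest_factory):
    '''Return (manifest, object_id) for a URL not seen before, else (None, None).

    urls             -- set of sha URNs already emitted
    source           -- "Harvested" or "Generated"
    confidence       -- "Good" or "Not Sure"
    manifest_factory -- e.g. self._generate_harvest_manifest (assumed signature)
    '''
    url_sha = generate_sha_urn(url)
    if url_sha in urls:
        return None, None
    urls.add(url_sha)
    url_id = generate_uuid_urn()
    manifest = manifest_factory(**{
        "vcard:hasURL": url,
        "bcube:hasUrlSource": source,
        "bcube:hasConfidence": confidence,
        "object_id": url_id,
        "dc:identifier": url_sha
    })
    return manifest, url_id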
Example #51
    def parse_item(self):
        output = {}

        urls = set()

        catalog_object_id = generate_uuid_urn()

        output['catalog_record'] = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # the sha isn't strictly necessary here; it's only used for set inclusion
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn(),
                "dc:identifier": url_sha
            })
        output['catalog_record']['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        output['catalog_record']['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
        dataset_object_id = generate_uuid_urn()

        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": datsetid,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            "relationships": []
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # that's not even valid
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])
            bbox = [west, south, east, north]
            bbox = bbox_to_geom(bbox)
            bbox = to_wkt(bbox)

            dataset.update({
                "dc:spatial": bbox,
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates

        dataset['relationships'] = [{
            "relate": "bcube:hasMetadataRecord",
            "object_id": catalog_object_id
        }]

        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(self.elem,
                                      ['distinfo', 'stdorder', 'digform'])

        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": link,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                dataset['urls'].append(dist)
                # this is a distribution link so
                # we are assuming it is to data
                dataset['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": url_id
                })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": link,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                dataset['urls'].append(dist)
                webpages.append({
                    "object_id": generate_uuid_urn(),
                    "relationships": [{
                        "relate": "dcterms:references",
                        "object_id": url_id
                    }]
                })

        output['catalog_record']['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        # guard against records without a keywords block
        children = key_elem.iterchildren() if key_elem is not None else []
        for child in children:
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc.) there may not be a thesaurus
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    }))
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })

        output['datasets'] = [dataset]

        # add the metadata relate
        output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset_object_id
        })

        output['catalog_records'] = [output['catalog_record']]
        del output['catalog_record']
        self.description = tidy_dict(output)
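bbox_to_geom and to_wkt are imported from the project's geo utilities and are not shown in these examples. A minimal sketch of what the call site above implies, assuming the bbox arrives as a [west, south, east, north] list of coordinate strings and using shapely purely as an illustration (the originals may be implemented differently):

from shapely.geometry import box


def bbox_to_geom(bbox):
    # bbox is [west, south, east, north] as text from the FGDC bounding element
    west, south, east, north = [float(b) for b in bbox]
    return box(west, south, east, north)


def to_wkt(geom):
    # serialize the geometry for the dc:spatial value (WKT string)
    return geom.wkt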
    def parse(self):
        '''
        from the root node, parse:
            identification (title, abstract, point of contact, keywords,
            extent) if identificationInfo contains SV_ServiceIdentification,
            add as child distribution info
        '''
        # set up the url set
        urls = set()
        # the set is keyed by sha identifiers elsewhere, so use the dc:identifier
        urls.add(self.output['catalog_record']['urls'][0]['dc:identifier'])

        for id_elem in extract_elems(
                self.elem,
                ['//*', 'identificationInfo', 'MD_DataIdentification']):
            dataset, keywords = self._parse_identification_info(id_elem)
            dataset['relationships'].append({
                "relate": "bcube:hasMetadataRecord",
                "object_id": self.output['catalog_record']['object_id']
            })
            dataset.update({
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', '')
            })
            self.output['catalog_record']['relationships'].append({
                "relate": "foaf:primaryTopic",
                "object_id": dataset['object_id']
            })

            # point of contact comes from the root node, which can be an issue
            # for things like the -1/-3 records from ngdc, so try for an idinfo blob
            poc_elem = extract_elem(id_elem, [
                'identificationInfo',
                'MD_DataIdentification',
                'pointOfContact',
                'CI_ResponsibleParty'])
            # if poc_elem is None:
            #     # and if that fails try for the root-level contact
            #     poc_elem = extract_elem(
            #         self.elem,
            #         ['contact', 'CI_ResponsibleParty'])

            # TODO: point of contact is not necessarily the publisher
            if poc_elem is not None:
                poc = self._parse_responsibleparty(poc_elem)
                location = (
                    ' '.join(
                        [poc['contact'].get('city', ''),
                         poc['contact'].get('country', '')])
                ).strip() if poc.get('contact', {}) else ''

                publisher = tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "name": poc.get('organization', ''),
                    "location": location
                })
                self.output['publishers'].append(publisher)
                dataset['relationships'].append({
                    "relate": "dcterms:publisher",
                    "object_id": publisher['object_id']
                })

            dataset['urls'] = []
            dist_elems = extract_elems(self.elem, ['distributionInfo'])
            for dist_elem in dist_elems:
                for d in self._parse_distribution(dist_elem):
                    if not d:
                        continue
                    url_sha = generate_sha_urn(d)
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        dist = self._generate_harvest_manifest(**{
                            "bcube:hasUrlSource": "Harvested",
                            "bcube:hasConfidence": "Good",
                            "vcard:hasURL": d,
                            "object_id": url_id,
                            "dc:identifier": url_sha
                        })
                        dataset['urls'].append(dist)
                        dataset['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

            self.output['datasets'].append(dataset)
            self.output['keywords'] += keywords

        # TODO: removing this until we have a definition for SERVICE
        # # check for the service elements
        # service_elems = extract_elems(self.elem,
        #     ['identificationInfo', 'SV_ServiceIdentification'])
        # self.description['services'] = []
        # for service_elem in service_elems:
        #     sv = SrvParser(service_elem)
        #     self.description['services'].append(sv.parse())

        # switch the catalog record to a list for conformity. eep.
        self.output['catalog_records'] = [self.output['catalog_record']]
        del self.output['catalog_record']
        self.description = tidy_dict(self.output)
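The extract_elem/extract_elems/extract_item/extract_items helpers take a chain of element local names and walk the heavily namespaced ISO (and FGDC) documents without spelling out namespace prefixes. A rough, namespace-agnostic sketch of the idea using lxml; this is an assumption about their behaviour, not the project's actual implementation, and the '//*' handling in particular is simplified:

def extract_items(elem, tags):
    # build a namespace-agnostic xpath from the chain of local names;
    # a leading '//*' means "match anywhere below the current element"
    parts = ['*[local-name()="%s"]' % t for t in tags if t != '//*']
    prefix = './/' if tags and tags[0] == '//*' else './'
    found = elem.xpath(prefix + '/'.join(parts))
    return [e.text.strip() for e in found if e.text and e.text.strip()]


def extract_item(elem, tags):
    # first match or empty string
    items = extract_items(elem, tags)
    return items[0] if items else ''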
    def _get_operations(self, reader, service, version):
        '''
        each operation can have more than one endpoint (get and post, ex)

        formatOptions = output format of the response method that can be some
                        controlled vocab value (XMLSCHEMA, etc)

                        this is incorrectly handled in wfs 1.1.0 (hardcoded value)

                        later versions can have an outputFormat parameter instead
                        BUT it comes down to whether that is a proper query parameter
                        or simply the delivered response (no choice in the spec)

        parameters = {name: {values: list}}
        '''
        _vocabs = {
            "XMLSCHEMA": "application/xml",
            "GML2": "text/xml; subtype=gml/2.1.2"
        }

        def _check_controlled_vocabs(term):
            if term in _vocabs:
                return _vocabs[term]
            return term

        def _replace_nones(to_check):
            return '' if to_check is None else to_check

        def _append_params(base_url, operation):
            if not base_url.endswith('?'):
                base_url += '?'
            return base_url + 'SERVICE=%s&VERSION=%s&REQUEST=%s' % (service,
                                                                    version,
                                                                    operation)

        def _merge_params(op_name, found_params):
            '''
            for some parameter structure:
                {'resultType': {'values': ['results', 'hits']}}
            integrate into the config params with the common elements

            '''
            # TODO: how to handle aliases (if necessary)
            req_methods = self.config.get('methods', [])
            req_params = next(
                iter(
                    [d for d in req_methods if d['name'] == op_name.upper()]
                ), {}
            ).get('params', [])
            req_params = [] if not req_params else req_params
            defaults = self.config.get('common', []) + req_params

            if not found_params:
                return defaults

            for k, v in found_params.iteritems():
                param = next(
                    iter(d for d in defaults if d['name'] == k.lower()), [])
                if not param:
                    continue

                found_index = defaults.index(param)
                param['values'] = [
                    _check_controlled_vocabs(a) for a in v['values']
                ]
                defaults[found_index] = param

            return defaults

        def _return_parameter(param):
            # return a parameter dict without empty values
            parameter = {}
            for key in ['name', 'type', 'format', 'values']:
                if key in param and param[key]:
                    parameter[key] = param[key]
            return parameter

        operations = []
        for o in reader.operations:
            # TODO: handle the differing formatOptions

            # get the parameter values if supported by the service
            try:
                params = o.parameters
            except AttributeError:
                params = {}

            # merge with defaults (either the whole element
            #   or parts of the element can be added)
            params = _merge_params(o.name, params)

            # get the formatOptions
            try:
                formats = [
                    _check_controlled_vocabs(fo) for fo in o.formatOptions
                ]
            except AttributeError:
                formats = []

            endpoints = [
                tidy_dict({
                    "name": o.name,
                    "protocol": remap_http_method(m.get('type', '')),
                    "url": _append_params(m.get('url', ''), o.name),
                    "constraints": m.get('constraints', []),
                    "mimeType": formats,
                    "actionable": 1 if o.name == 'GetCapabilities' else 2,
                    "parameters": [_return_parameter(p) for p in params]
                }) for m in o.methods
            ]

            operations += endpoints

        return operations
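For orientation, _append_params just tacks the three mandatory OGC query parameters onto each method URL, and _merge_params folds service-reported parameters (e.g. {'resultType': {'values': ['results', 'hits']}} from its docstring) into the configured defaults. An illustrative reconstruction of the URL it generates, with hypothetical values:

base_url = 'http://example.com/wfs'
service, version, operation = 'WFS', '1.1.0', 'GetCapabilities'
if not base_url.endswith('?'):
    base_url += '?'
# -> 'http://example.com/wfs?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetCapabilities'
url = base_url + 'SERVICE=%s&VERSION=%s&REQUEST=%s' % (service, version, operation)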