def _parse_service(self):
        '''
        describe the OAI-PMH Identify response as one service

        the service is titled with the space-joined
        Identify/repositoryName values, carries the manifest of the
        harvested url and points at that url through a
        bcube:originatedFrom relationship

        returns the output dict (a "services" list with that one
        service) after it has been passed through tidy_dict
        '''
        service_id = generate_uuid_urn()
        repository_names = extract_items(
            self.parser.xml, ["Identify", "repositoryName"])

        # the url entry gets its own uuid; the sha of the url is
        # only kept as its dc:identifier
        harvested_id = generate_uuid_urn()
        manifest_args = {
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": harvested_id,
            "dc:identifier": generate_sha_urn(self.url)
        }
        harvested_url = self._generate_harvest_manifest(**manifest_args)

        # NOTE: Identify/protocolVersion and Identify/baseURL are
        #       not mapped to the output (yet)
        service = {
            "object_id": service_id,
            "dcterms:title": ' '.join(repository_names),
            "rdf:type": "OAI-PMH",
            "relationships": [{
                "relate": "bcube:originatedFrom",
                "object_id": harvested_id
            }],
            "urls": [harvested_url]
        }

        return tidy_dict({'services': [service]})
    def _parse_keywords(self, elem):
        '''
        gather the keyword sets of an identification block

        one set is built for each descriptiveKeywords child that
        has CharacterString terms, plus one generic set for the iso
        topic categories when any are found; every set is passed
        through tidy_dict and the list of sets is returned
        '''
        keyword_sets = []

        for block in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            values = extract_items(
                block, ['MD_Keywords', 'keyword', 'CharacterString'])
            type_code = extract_attrib(
                block,
                ['MD_Keywords', 'type', 'MD_KeywordTypeCode',
                 '@codeListValue'])
            source = extract_item(
                block,
                ['MD_Keywords', 'thesaurusName', 'CI_Citation', 'title',
                 'CharacterString'])

            # a block without any term does not produce a set
            if not values:
                continue

            entry = {
                "object_id": generate_uuid_urn(),
                "dc:partOf": source,
                "bcube:hasType": type_code,
                "bcube:hasValue": values
            }
            keyword_sets.append(tidy_dict(entry))

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # the iso topic categories end up in one generic set
        topics = extract_items(
            elem, ['topicCategory', 'MD_TopicCategoryCode'])
        if topics:
            topic_entry = {
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": topics
            }
            keyword_sets.append(tidy_dict(topic_entry))

        return keyword_sets
Ejemplo n.º 3
0
    def _parse_keywords(self, elem):
        '''
        build the keyword dicts for an identification block

        each descriptiveKeywords element with at least one
        CharacterString keyword yields a dict (thesaurus title, type
        code, terms); the iso topic categories, if present, yield one
        more dict flagged as 'IsoTopicCategories'

        returns the list of dicts, each run through tidy_dict
        '''
        collected = []

        for keyword_block in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            found_terms = extract_items(
                keyword_block,
                ['MD_Keywords', 'keyword', 'CharacterString'])
            found_type = extract_attrib(
                keyword_block,
                ['MD_Keywords', 'type', 'MD_KeywordTypeCode',
                 '@codeListValue'])
            found_thesaurus = extract_item(
                keyword_block,
                ['MD_Keywords', 'thesaurusName', 'CI_Citation', 'title',
                 'CharacterString'])

            # skip the blocks that list no terms at all
            if not found_terms:
                continue

            keyword_set = tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": found_thesaurus,
                "bcube:hasType": found_type,
                "bcube:hasValue": found_terms
            })
            collected.append(keyword_set)

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # one extra, generic set for the iso topic category codes
        topic_codes = extract_items(
            elem, ['topicCategory', 'MD_TopicCategoryCode'])
        if topic_codes:
            topic_set = tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": topic_codes
            })
            collected.append(topic_set)

        return collected
    def parse(self):
        '''
        build the description of a THREDDS catalog response

        when the response was identified as a service, one service
        entry is generated (catalog @name as title, @version in the
        rdf:type, harvest dates, the harvested url and a
        bcube:originatedFrom relationship to it)

        self.service_bases is always set: name -> base for every
        "service" element with a non-empty @base

        nothing is returned; the output, passed through tidy_dict,
        is stored in self.description
        '''
        output = {}

        if 'service' in self.identify:
            service = {
                "object_id": generate_uuid_urn(),
                "dcterms:title": extract_attrib(self.parser.xml, ['@name']),
                "rdf:type": "UNIDATA:THREDDS {0}".format(
                    extract_attrib(self.parser.xml, ['@version'])),
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', ''),
                "relationships": [],
                "urls": []
            }
            # NOTE: here the url entry is identified by the sha of
            #       the url itself (no separate uuid)
            url_sha = generate_sha_urn(self.url)
            original_url = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_sha
            })
            service['urls'].append(original_url)
            service['relationships'].append(
                {
                    "relate": "bcube:originatedFrom",
                    "object_id": url_sha
                }
            )
            # only add the services when one was built; referencing
            # the service outside of this branch raised an
            # UnboundLocalError for non-service responses
            output['services'] = [service]

        # deal with the "dataset"
        service_bases = self.parser.xml.xpath(
            '//*[local-name()="service" and @base != ""]'
        )
        self.service_bases = {
            s.attrib.get('name'): s.attrib.get('base') for s in service_bases
        }

        # TODO: the dataset and metadata entries are not parsed (yet);
        #       a catalog is not a proper web service

        self.description = tidy_dict(output)
    def _parse_service(self):
        '''
        describe the OAI-PMH Identify response as one service

        the title is the space-joined Identify/repositoryName
        values; the harvested url is added as the only url entry and
        referenced with a bcube:originatedFrom relationship

        returns the output dict (a "services" list holding that
        service) after it has been passed through tidy_dict
        '''
        service_id = generate_uuid_urn()
        names = extract_items(self.parser.xml, ["Identify", "repositoryName"])

        # the url entry is keyed by a fresh uuid, the sha of the url
        # is kept as the dc:identifier
        source_id = generate_uuid_urn()
        source_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": source_id,
            "dc:identifier": generate_sha_urn(self.url)
        })
        origin = {
            "relate": "bcube:originatedFrom",
            "object_id": source_id
        }

        # NOTE: Identify/protocolVersion and Identify/baseURL are
        #       not mapped to the output (yet)
        service = {
            "object_id": service_id,
            "dcterms:title": ' '.join(names),
            "rdf:type": "OAI-PMH",
            "relationships": [origin],
            "urls": [source_url]
        }

        return tidy_dict({'services': [service]})
    def _parse_identification_info(self, elem):
        '''
        build the dataset dict of one identification block

        the dataset takes its identifier from the citation's
        MD_Identifier code, its description from the abstract and its
        title from the citation; whatever _parse_extent returns is
        merged in and every keyword set from _parse_keywords is linked
        with a dc:conformsTo relationship

        returns a tuple: (dataset passed through tidy_dict, keywords)
        '''
        # ignoring the larger get all the identifiers above
        # in favor of, hopefully, getting a better dataset id
        identifier = extract_item(
            elem,
            ['citation', 'CI_Citation', 'identifier', 'MD_Identifier',
             'code', 'CharacterString'])

        dataset = {
            "object_id": generate_uuid_urn(),
            "dc:identifier": identifier,
            "dc:description": extract_item(
                elem, ['abstract', 'CharacterString']),
            "dcterms:title": extract_item(
                elem,
                ['citation', 'CI_Citation', 'title', 'CharacterString']),
            "relationships": []
        }

        # TODO: i think the rights blob is not in the ontology prototypes
        # the rights information from MD_Constraints or MD_LegalConstraints
        # rights = extract_item(elem, ['resourceConstraints', '*',
        #   'useLimitation', 'CharacterString'])

        # merge the extent into the dataset
        dataset.update(self._parse_extent(elem))

        # tie every keyword set to the dataset
        keywords = self._parse_keywords(elem)
        for keyword_set in keywords:
            link = {
                "relate": "dc:conformsTo",
                "object_id": keyword_set['object_id']
            }
            dataset['relationships'].append(link)

        return tidy_dict(dataset), keywords
Ejemplo n.º 7
0
    def _parse_identification_info(self, elem):
        '''
        turn one identification block into a dataset dict

        identifier, abstract and title are read from the block, the
        result of _parse_extent is merged into the dataset and a
        dc:conformsTo relationship is added for each keyword set
        returned by _parse_keywords

        returns a tuple: (dataset passed through tidy_dict, keywords)
        '''
        # ignoring the larger get all the identifiers above
        # in favor of, hopefully, getting a better dataset id
        dataset_code = extract_item(elem, [
            'citation',
            'CI_Citation',
            'identifier',
            'MD_Identifier',
            'code',
            'CharacterString'
        ])

        new_id = generate_uuid_urn()
        abstract = extract_item(elem, ['abstract', 'CharacterString'])
        title = extract_item(
            elem, ['citation', 'CI_Citation', 'title', 'CharacterString'])

        dataset = {
            "object_id": new_id,
            "dc:identifier": dataset_code,
            "dc:description": abstract,
            "dcterms:title": title,
            "relationships": []
        }

        # TODO: i think the rights blob is not in the ontology prototypes
        # the rights information from MD_Constraints or MD_LegalConstraints
        # rights = extract_item(elem, ['resourceConstraints', '*',
        #   'useLimitation', 'CharacterString'])

        # the extent values are added straight to the dataset
        extent_values = self._parse_extent(elem)
        dataset.update(extent_values)

        # one relationship per keyword set
        keywords = self._parse_keywords(elem)
        for entry in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": entry['object_id']
            })

        return tidy_dict(dataset), keywords
Ejemplo n.º 8
0
    def parse(self):
        '''
        from the root node, parse:
            identification (title, abstract, point of contact, keywords,
            extent) for every MD_DataIdentification block; each one
            becomes a dataset tied to the catalog record, to its
            publisher (when a point of contact is found) and to the
            distribution urls that were not emitted before

        nothing is returned: the catalog record is moved into the
        catalog_records list and the output, passed through
        tidy_dict, is stored in self.description
        '''
        catalog_record = self.output['catalog_record']

        # set up the url set
        # NOTE(review): the set is seeded with the object_id of the
        #   catalog record url, while the checks below use the sha urn
        #   of a url; confirm that object_id is a sha urn, otherwise
        #   the harvested url is never treated as a duplicate
        urls = set()
        urls.add(catalog_record['urls'][0]['object_id'])

        for id_elem in extract_elems(
                self.elem,
                ['//*', 'identificationInfo', 'MD_DataIdentification']):
            dataset, keywords = self._parse_identification_info(id_elem)
            dataset['relationships'].append({
                "relate": "bcube:hasMetadataRecord",
                "object_id": catalog_record['object_id']
            })
            dataset.update({
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', '')
            })
            catalog_record['relationships'].append({
                "relate": "foaf:primaryTopic",
                "object_id": dataset['object_id']
            })

            # point of contact from the root node and this might be an issue
            # in things like the -1/-3 from ngdc so try for an idinfo blob
            poc_elem = extract_elem(id_elem, [
                'identificationInfo', 'MD_DataIdentification',
                'pointOfContact', 'CI_ResponsibleParty'
            ])
            # NOTE: there is no fallback to the root-level
            #       contact/CI_ResponsibleParty

            # TODO: point of contact is not necessarily the publisher
            if poc_elem is not None:
                poc = self._parse_responsibleparty(poc_elem)
                contact = poc.get('contact', {})
                location = ''
                if contact:
                    location = ' '.join([
                        contact.get('city', ''),
                        contact.get('country', '')
                    ]).strip()

                # keep the id of the publisher that is added here: the
                # relationship used to read self.output['publisher'],
                # which is not the list the publisher is appended to
                publisher_id = generate_uuid_urn()
                self.output['publishers'].append(
                    tidy_dict({
                        "object_id": publisher_id,
                        "name": poc.get('organization', ''),
                        "location": location
                    }))
                dataset['relationships'].append({
                    "relate": "dcterms:publisher",
                    "object_id": publisher_id
                })

            dataset['urls'] = []
            dist_elems = extract_elems(self.elem, ['distributionInfo'])
            for dist_elem in dist_elems:
                for d in self._parse_distribution(dist_elem):
                    if not d:
                        continue
                    url_sha = generate_sha_urn(d)
                    if url_sha in urls:
                        # already emitted, do not repeat the url
                        continue
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(
                        **{
                            "bcube:hasUrlSource": "Harvested",
                            "bcube:hasConfidence": "Good",
                            "vcard:hasURL": d,
                            "object_id": url_id,
                            "dc:identifier": url_sha
                        })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

            self.output['datasets'].append(dataset)
            self.output['keywords'] += keywords

        # TODO: removing this until we have a definition for SERVICE
        #       (the SV_ServiceIdentification blocks are not parsed here)

        # switch the catalog record to a list for conformity. eep.
        self.output['catalog_records'] = [self.output['catalog_record']]
        del self.output['catalog_record']
        self.description = tidy_dict(self.output)
    def _parse_service(self):
        '''
        describe an OpenSearch description document as one service

        the service takes its title from ShortName and its
        description from the LongName and Description values; the
        Tags become one keyword set; the harvested url is added as the
        origin and every Url endpoint whose url was not seen before is
        added as a generated url wrapped in a webpage entry

        returns the output dict ("keywords" and "services") after it
        has been passed through tidy_dict
        '''
        xml = self.parser.xml
        result = {}
        # sha urns of the urls that are already part of the service
        seen = set()

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "rdf:type": 'OpenSearch1.1:Description',
            "dcterms:title": extract_item(xml, ["ShortName"]),
            "dc:description": ' '.join(
                extract_items(xml, ["LongName"]) +
                extract_items(xml, ["Description"])),
            "urls": [],
            "webpages": [],
            "relationships": []
        }

        # the document itself, as harvested
        source_sha = generate_sha_urn(self.url)
        seen.add(source_sha)
        source_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn(),
            "dc:identifier": source_sha
        })
        service['urls'].append(source_url)
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": source_url['object_id']
        })

        # NOTE: Attribution, Developer and SyndicationRight are not
        #       mapped to the output (yet)

        # the tags, as a single keyword set
        keyword_id = generate_uuid_urn()
        result['keywords'] = [{
            "object_id": keyword_id,
            "bcube:hasValue": extract_items(xml, ["Tags"])
        }]
        service['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword_id
        })

        for url_elem in extract_elems(xml, ['Url']):
            endpoint = self._parse_endpoint(url_elem)
            endpoint_sha = generate_sha_urn(endpoint['url'])
            if endpoint_sha in seen:
                # same url as before, nothing new to add
                continue
            seen.add(endpoint_sha)

            endpoint_id = generate_uuid_urn()
            service['urls'].append(self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": endpoint['url'],
                "object_id": endpoint_id,
                "dc:identifier": endpoint_sha
            }))

            # the service references a webpage, the webpage the url
            webpage_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": webpage_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": endpoint_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage_id
            })

        result['services'] = [service]

        return tidy_dict(result)
    def parse(self):
        '''
        run the routing

        a catalog record is built for the harvested url and handed
        to the reader that matches the identified metadata name
        ('Data Series', '19119' or '19115')

        returns {} (leaving self.description untouched) when the
        identity has no metadata or no metadata name; otherwise
        nothing is returned and self.description is set to the output
        of the selected reader, or to {} when no reader was selected
        '''

        if not self.identity:
            # we're going to have to sort it out
            self.identity = {}

        metadata = self.identity.get('metadata', {})
        if not metadata:
            return {}

        metadata_type = metadata.get('name', '')
        if not metadata_type:
            return {}

        # TODO: this is unlikely to be correct, given the ds record
        #       but we're not going there just yet
        # TODO: deal with conformsTo (multiple schemaLocations, etc)
        catalog_record = {
            "object_id": generate_uuid_urn(),
            "rdf:type": self._version_to_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "relationships": [],
            "urls": []
        }
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn()
        })
        catalog_record['urls'].append(original_url)
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        # the reader selected for this record, if any
        reader = None

        if metadata_type == 'Data Series':
            # run the set
            # NOTE(review): this reader is built but not parsed here,
            #   unlike the other two types; confirm that is intended
            reader = DsParser(self.parser.xml, catalog_record)
        elif metadata_type == '19119':
            # run that; every service block is parsed against the same
            # catalog record and the last reader is the one kept
            for srv in extract_elems(
                    self.parser.xml,
                    ['identificationInfo', 'SV_ServiceIdentification']):
                reader = SrvParser(srv, catalog_record)
                reader.parse()
        elif metadata_type == '19115':
            # it's a mi/md so run that
            reader = MxParser(
                self.parser.xml,
                catalog_record,
                self.harvest_details
            )
            reader.parse()

        if reader is None:
            # unsupported metadata name or a 19119 record without
            # service blocks: there is no reader output to pass on
            # (this used to fail on the self.reader lookup)
            self.description = {}
            return

        # pass it back up the chain a bit
        self.reader = reader
        self.description = reader.output
    def parse(self):
        '''
        from the root node, parse:
            identification (title, abstract, point of contact, keywords,
            extent) for every MD_DataIdentification block; each one
            becomes a dataset tied to the catalog record, to its
            publisher (when a point of contact is found) and to the
            distribution urls that were not emitted before

        nothing is returned: the catalog record is moved into the
        catalog_records list and the output, passed through
        tidy_dict, is stored in self.description
        '''
        record = self.output['catalog_record']

        # set up the url set
        # NOTE(review): the set is seeded with the object_id of the
        #   catalog record url, while the checks below use the sha urn
        #   of a url; confirm that object_id is a sha urn, otherwise
        #   the harvested url is never treated as a duplicate
        urls = set()
        urls.add(record['urls'][0]['object_id'])

        for id_elem in extract_elems(
                self.elem,
                ['//*', 'identificationInfo', 'MD_DataIdentification']):
            dataset, keywords = self._parse_identification_info(id_elem)
            dataset['relationships'].append({
                "relate": "bcube:hasMetadataRecord",
                "object_id": record['object_id']
            })
            dataset.update({
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', '')
            })
            record['relationships'].append({
                "relate": "foaf:primaryTopic",
                "object_id": dataset['object_id']
            })

            # point of contact from the root node and this might be an issue
            # in things like the -1/-3 from ngdc so try for an idinfo blob
            poc_elem = extract_elem(id_elem, [
                'identificationInfo',
                'MD_DataIdentification',
                'pointOfContact',
                'CI_ResponsibleParty'])
            # NOTE: there is no fallback to the root-level
            #       contact/CI_ResponsibleParty

            # TODO: point of contact is not necessarily the publisher
            if poc_elem is not None:
                poc = self._parse_responsibleparty(poc_elem)
                contact = poc.get('contact', {})
                location = ''
                if contact:
                    location = ' '.join(
                        [contact.get('city', ''),
                         contact.get('country', '')]
                    ).strip()

                # keep the id of the publisher that is added here: the
                # relationship used to read self.output['publisher'],
                # which is not the list the publisher is appended to
                publisher_id = generate_uuid_urn()
                self.output['publishers'].append(tidy_dict({
                    "object_id": publisher_id,
                    "name": poc.get('organization', ''),
                    "location": location
                }))
                dataset['relationships'].append({
                    "relate": "dcterms:publisher",
                    "object_id": publisher_id
                })

            dataset['urls'] = []
            dist_elems = extract_elems(self.elem, ['distributionInfo'])
            for dist_elem in dist_elems:
                for d in self._parse_distribution(dist_elem):
                    if not d:
                        continue
                    url_sha = generate_sha_urn(d)
                    if url_sha in urls:
                        # already emitted, do not repeat the url
                        continue
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(**{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": d,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

            self.output['datasets'].append(dataset)
            self.output['keywords'] += keywords

        # TODO: removing this until we have a definition for SERVICE
        #       (the SV_ServiceIdentification blocks are not parsed here)

        # switch the catalog record to a list for conformity. eep.
        self.output['catalog_records'] = [self.output['catalog_record']]
        del self.output['catalog_record']
        self.description = tidy_dict(self.output)
    def parse(self):
        '''
        build the description of an OGC service response

        without a reader the description is set to {} and nothing
        else happens. when the response was identified as a service,
        one service entry is generated (harvested url, abstract,
        keywords) and, for a GetCapabilities request, one layer per
        listed dataset with its generated url, its metadata urls (as
        catalog records), its temporal extent and its bbox

        nothing is returned; the output, passed through tidy_dict,
        is stored in self.description
        '''
        # for ogc, a catalog record is the getcapabilities rsp
        output = {
            "layers": [],
            "catalog_records": []
        }

        # sha urn of every url emitted so far -> object_id of its entry
        # (a layer can then reference a url that was emitted earlier)
        url_ids = {}

        if not self.reader:
            self.description = {}
            return

        if 'service' in self.identify:
            harvest_date = self.harvest_details.get('harvest_date', '')
            service = {
                "object_id": generate_uuid_urn(),
                "bcube:dateCreated": harvest_date,
                "bcube:lastUpdated": harvest_date,
                "relationships": [],
                "urls": [],
                "rdf:type": self.urn
            }

            url_sha = generate_sha_urn(self.url)
            url_id = generate_uuid_urn()
            url_ids[url_sha] = url_id
            service['urls'].append(
                self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": self.url,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
            )
            service['relationships'].append({
                "relate": "bcube:originatedFrom",
                "object_id": url_id
            })

            service_reader = self._parse_service(
                self.reader, self.service_name, self.version)

            # map to triples
            service.update({
                "dc:description": service_reader.get('abstract')
            })

            keywords = service_reader.get('subject', [])
            if keywords:
                keyword_id = generate_uuid_urn()
                output['keywords'] = [{
                    "object_id": keyword_id,
                    "bcube:hasValue": keywords
                }]
                service['relationships'].append({
                    "relate": "dc:conformsTo",
                    "object_id": keyword_id
                })

            if self.identify['service'].get('request', '') == 'GetCapabilities':
                # this is also awkward. meh. needs must.
                layers = []

                for ld in self._parse_getcap_datasets(self.reader):
                    layer = {
                        "object_id": generate_uuid_urn(),
                        "bcube:dateCreated": harvest_date,
                        "bcube:lastUpdated": harvest_date,
                        "dc:description": ld.get('abstract', ''),
                        "dc:title": ld.get('title', ''),
                        "relationships": []
                    }
                    service['relationships'].append({
                        "relate": "bcube:contains",
                        "object_id": layer['object_id']
                    })

                    # add the generated url for the service
                    generated_url = self._generate_url(
                        self.url,
                        ld.get('name'),
                        ld.get('bbox'),
                        self.service_name,
                        self.version
                    )
                    if generated_url:
                        url_sha = generate_sha_urn(generated_url)
                        if url_sha not in url_ids:
                            url_ids[url_sha] = generate_uuid_urn()
                            service['urls'].append(
                                self._generate_harvest_manifest(**{
                                    "vcard:hasURL": generated_url,
                                    "bcube:hasUrlSource": "Generated",
                                    "bcube:hasConfidence": "Not Sure",
                                    "object_id": url_ids[url_sha],
                                    "dc:identifier": url_sha
                                })
                            )
                        # don't add to the larger set twice, but do
                        # include the reference within the layer; the
                        # id is looked up by sha so that a repeated
                        # url points at its own entry and not at the
                        # most recently created one
                        layer['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_ids[url_sha]
                        })

                    # add each as a dataset with just a url for now
                    for mu in ld.get('metadata_urls', []):
                        url_sha = generate_sha_urn(mu.get('url'))
                        if url_sha in url_ids:
                            continue

                        url_id = generate_uuid_urn()
                        url_ids[url_sha] = url_id
                        output['catalog_records'].append({
                            "object_id": generate_uuid_urn(),
                            "urls": [
                                self._generate_harvest_manifest(**{
                                    "vcard:hasURL": mu.get('url'),
                                    "bcube:hasUrlSource": "Harvested",
                                    "bcube:hasConfidence": "Good",
                                    "object_id": url_id,
                                    "dc:identifier": url_sha
                                })
                            ],
                            "relationships": [
                                {
                                    "relate": "dc:describes",
                                    "object_id": layer['object_id']
                                },
                                {
                                    "relate": "bcube:originatedFrom",
                                    "object_id": url_id
                                }
                            ]
                        })

                    if 'temporal_extent' in ld:
                        temporal = tidy_dict(
                            {
                                "esip:startDate":
                                    ld['temporal_extent'].get('begin', ''),
                                "esip:endDate":
                                    ld['temporal_extent'].get('end', '')
                            }
                        )
                        if temporal:
                            layer.update(temporal)

                    if 'bbox' in ld:
                        layer.update(ld['bbox'])

                    layers.append(layer)

                service['layers'] = layers

            # only add the services when one was built; referencing
            # the service outside of this branch raised an
            # UnboundLocalError for non-service responses
            output['services'] = [service]

        self.description = tidy_dict(output)
Ejemplo n.º 13
0
    def parse_item(self):
        '''
        Describe one FGDC CSDGM record as linked catalog/dataset objects.

        Reads self.elem, self.url and self.harvest_details and stores the
        tidied result on self.description; nothing is returned. The
        description carries a catalog record, a single dataset, its
        publisher and the keyword sets, tied together by "relationships"
        entries that point at "object_id" values.
        '''
        output = {}

        # sha urns of the urls already emitted, so a link that shows up
        # more than once in the record is only described once
        urls = set()

        harvest_date = self.harvest_details.get('harvest_date', '')

        def harvested_url(link, link_id, link_sha):
            # manifest entry for a link taken directly from the record
            return self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": link_id,
                "dc:identifier": link_sha
            })

        catalog_object_id = generate_uuid_urn()
        catalog_record = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # this is not necessary as a sha just for set inclusion
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = harvested_url(self.url, generate_uuid_urn(), url_sha)
        catalog_record['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        dataset_object_id = generate_uuid_urn()
        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": extract_item(
                self.elem, ['idinfo', 'datsetid']),
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            # the dataset always points back at its metadata record
            "relationships": [{
                "relate": "bcube:hasMetadataRecord",
                "object_id": catalog_object_id
            }]
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # that's not even valid
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])

            dataset.update({
                "dc:spatial": to_wkt(
                    bbox_to_geom([west, south, east, north])),
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            # a date range, when present, overrides the single date
            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates

        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(
            self.elem, ['distinfo', 'stdorder', 'digform'])
        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            if not link:
                # a digital form without an online address has no url
                # to hash or reference
                continue
            url_sha = generate_sha_urn(link)
            if url_sha in urls:
                continue
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dataset['urls'].append(harvested_url(link, url_id, url_sha))
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha in urls:
                continue
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dataset['urls'].append(harvested_url(link, url_id, url_sha))
            # the citation link is modelled as a webpage that references
            # the url; the dataset then references the webpage
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

        catalog_record['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        # a record without a keywords section gives no element to walk
        key_children = key_elem.iterchildren() if key_elem is not None else []
        for child in key_children:
            key_type = extract_element_tag(child.tag)
            # the child tags abbreviate two of the types:
            # stratum -> stratkt/stratkey, temporal -> tempkt/tempkey
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    }))
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })

        output['datasets'] = [dataset]

        # add the metadata relate
        catalog_record['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset_object_id
        })

        output['catalog_records'] = [catalog_record]
        self.description = tidy_dict(output)
Ejemplo n.º 14
0
    def parse(self):
        '''
        run the routing

        Picks the reader matching the identified metadata type
        ('Data Series', '19119' or '19115'), hands it a fresh catalog
        record and stores the reader's output on self.description.

        Returns an empty dict when the identity carries no metadata type
        or when no reader could be built for it; otherwise nothing is
        returned and the result is only available on self.description.
        '''

        if not self.identity:
            # we're going to have to sort it out
            self.identity = {}

        metadata = self.identity.get('metadata', {})
        if not metadata:
            return {}

        metadata_type = metadata.get('name', '')
        if not metadata_type:
            return {}

        harvest_date = self.harvest_details.get('harvest_date', '')

        # TODO: this is unlikely to be correct, given the ds record
        #       but we're not going there just yet
        # TODO: deal with conformsTo (multiple schemaLocations, etc)
        catalog_record = {
            "object_id": generate_uuid_urn(),
            "rdf:type": self._version_to_urn(),
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            # "dc:conformsTo": extract_attrib(
            #     self.parser.xml, ['@schemaLocation']).split(),
            "relationships": [],
            "urls": []
        }
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn()
        })
        catalog_record['urls'].append(original_url)
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        # the reader chosen for this record; stays None when the type is
        # not one we route or when a 19119 record has no service block
        reader = None

        if metadata_type == 'Data Series':
            # run the set
            # NOTE(review): parse() is not called for this reader here;
            # presumably DsParser fills its output itself - confirm
            reader = DsParser(self.parser.xml, catalog_record)
        elif metadata_type == '19119':
            # run that; every service identification is parsed against
            # the same catalog record and the last reader is the one kept
            service_elems = extract_elems(
                self.parser.xml,
                ['identificationInfo', 'SV_ServiceIdentification'])
            for srv in service_elems:
                reader = SrvParser(srv, catalog_record)
                reader.parse()
        elif metadata_type == '19115':
            # it's a mi/md so run that
            reader = MxParser(
                self.parser.xml, catalog_record, self.harvest_details)
            reader.parse()

        if reader is None:
            # without a reader there is no output to pass along; reading
            # self.reader here would fail or pick up a stale reader
            self.description = {}
            return {}

        # pass it back up the chain a bit
        self.reader = reader
        self.description = reader.output
Ejemplo n.º 15
0
    def parse(self):
        '''
        Describe an OGC service response as a service with its layers.

        Reads self.reader, self.identify, self.url, self.urn,
        self.service_name, self.version and self.harvest_details and
        stores the tidied result on self.description; nothing is
        returned. The description is left empty when there is no reader
        or when no service was identified.
        '''
        # for ogc, a catalog record is the getcapabilities rsp
        output = {"layers": [], "catalog_records": []}

        if not self.reader:
            self.description = {}
            return

        if 'service' not in self.identify:
            # there is no service to describe; building the output
            # anyway would reference a service that was never created
            self.description = {}
            return

        harvest_date = self.harvest_details.get('harvest_date', '')

        # sha urn -> object id of the url entry already emitted for it,
        # so a repeated url is described once and later references
        # still resolve to the right entry
        url_ids = {}

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            "relationships": [],
            "urls": [],
            "rdf:type": self.urn
        }

        url_sha = generate_sha_urn(self.url)
        url_id = generate_uuid_urn()
        url_ids[url_sha] = url_id
        service['urls'].append(
            self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_id,
                "dc:identifier": url_sha
            }))
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_id
        })

        # self._get_service_config(service_name, version)
        service_reader = self._parse_service(
            self.reader, self.service_name, self.version)

        # map to triples
        service.update({"dc:description": service_reader.get('abstract')})

        keywords = service_reader.get('subject', [])
        if keywords:
            output['keywords'] = [{
                "object_id": generate_uuid_urn(),
                "bcube:hasValue": keywords
            }]
            for k in output['keywords']:
                service['relationships'].append({
                    "relate": "dc:conformsTo",
                    "object_id": k['object_id']
                })

        if self.identify['service'].get('request', '') == 'GetCapabilities':
            # this is also awkward. meh. needs must.
            layers = []
            listed_layers = self._parse_getcap_datasets(self.reader)

            for ld in listed_layers:
                layer = {
                    "object_id": generate_uuid_urn(),
                    "bcube:dateCreated": harvest_date,
                    "bcube:lastUpdated": harvest_date,
                    "dc:description": ld.get('abstract', ''),
                    "dc:title": ld.get('title', ''),
                    "relationships": []
                }
                service['relationships'].append({
                    "relate": "bcube:contains",
                    "object_id": layer['object_id']
                })

                # add the generated url for the service
                generated_url = self._generate_url(
                    self.url,
                    ld.get('name'),
                    ld.get('bbox'),
                    self.service_name,
                    self.version)
                if generated_url:
                    url_sha = generate_sha_urn(generated_url)
                    if url_sha not in url_ids:
                        url_ids[url_sha] = generate_uuid_urn()
                        service['urls'].append(
                            self._generate_harvest_manifest(**{
                                "vcard:hasURL": generated_url,
                                "bcube:hasUrlSource": "Generated",
                                "bcube:hasConfidence": "Not Sure",
                                "object_id": url_ids[url_sha],
                                "dc:identifier": url_sha
                            }))
                    # don't add to the larger set, but do
                    # include the reference within the layer; look the
                    # id up by sha so a url seen before references its
                    # own entry, not whichever id was generated last
                    layer['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_ids[url_sha]
                    })

                # add each as a dataset with just a url for now
                for mu in ld.get('metadata_urls', []):
                    metadata_url = mu.get('url')
                    if not metadata_url:
                        # nothing to hash or reference
                        continue
                    url_sha = generate_sha_urn(metadata_url)
                    if url_sha in url_ids:
                        continue
                    url_id = generate_uuid_urn()
                    url_ids[url_sha] = url_id
                    output['catalog_records'].append({
                        "object_id": generate_uuid_urn(),
                        "urls": [
                            self._generate_harvest_manifest(**{
                                "vcard:hasURL": metadata_url,
                                "bcube:hasUrlSource": "Harvested",
                                "bcube:hasConfidence": "Good",
                                "object_id": url_id,
                                "dc:identifier": url_sha
                            })
                        ],
                        "relationships": [
                            {
                                "relate": "dc:describes",
                                "object_id": layer['object_id']
                            },
                            {
                                "relate": "bcube:originatedFrom",
                                "object_id": url_id
                            }
                        ]
                    })

                if 'temporal_extent' in ld:
                    temporal = tidy_dict({
                        "esip:startDate":
                            ld['temporal_extent'].get('begin', ''),
                        "esip:endDate":
                            ld['temporal_extent'].get('end', '')
                    })
                    if temporal:
                        layer.update(temporal)

                if 'bbox' in ld:
                    # the bbox entries are merged straight into the layer
                    layer.update(ld['bbox'])

                layers.append(layer)

            service['layers'] = layers
            # if layers:
            #     service['layers'] = layers
            #     for layer in layers:
            #         service['relationships'].append({
            #             "relate": "bcube:contains",
            #             "object_id": layer['object_id']
            #         })

        output['services'] = [service]
        self.description = tidy_dict(output)
    def _parse_service(self):
        '''
        Describe an OpenSearch description document as a service.

        Reads self.parser.xml, self.url and self.harvest_details and
        returns the tidied output holding the service and, when the
        document carries Tags, one keyword set. Each Url element becomes
        a generated url on the service plus a webpage referencing it.
        '''
        output = {}

        # sha urns of the urls already emitted, so a repeated endpoint
        # is only described once
        urls = set()

        harvest_date = self.harvest_details.get("harvest_date", "")
        description_parts = (
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])
        )

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            "rdf:type": "OpenSearch1.1:Description",
            "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
            "dc:description": " ".join(description_parts),
            "urls": [],
            "webpages": [],
            "relationships": [],
        }

        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn(),
            "dc:identifier": url_sha,
        })
        service["urls"].append(original_url)
        service["relationships"].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url["object_id"],
        })

        # output['source'] = extract_items(
        #   self.parser.xml, ["Attribution"])
        # output['contact'] = extract_items(
        #     self.parser.xml, ["Developer"])
        # output['rights'] = extract_items(
        #   self.parser.xml, ["SyndicationRight"])

        tags = extract_items(self.parser.xml, ["Tags"])
        if tags:
            # only emit the keyword set (and the relationship to it)
            # when there are terms to put in it
            key_id = generate_uuid_urn()
            output["keywords"] = [
                {"object_id": key_id, "bcube:hasValue": tags}
            ]
            service["relationships"].append({
                "relate": "dc:conformsTo",
                "object_id": key_id,
            })

        for t in extract_elems(self.parser.xml, ["Url"]):
            ep = self._parse_endpoint(t)
            endpoint_url = ep["url"]
            if not endpoint_url:
                # an endpoint without a url has nothing to reference
                continue
            url_sha = generate_sha_urn(endpoint_url)
            if url_sha in urls:
                continue
            urls.add(url_sha)

            url_id = generate_uuid_urn()
            service["urls"].append(
                self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Generated",
                    "bcube:hasConfidence": "Not Sure",
                    "vcard:hasURL": endpoint_url,
                    "object_id": url_id,
                    "dc:identifier": url_sha,
                })
            )

            # the service references a webpage, which in turn
            # references the url entry added above
            wb_id = generate_uuid_urn()
            service["webpages"].append({
                "object_id": wb_id,
                "relationships": [
                    {"relate": "dcterms:references", "object_id": url_id}
                ],
            })
            service["relationships"].append({
                "relate": "dcterms:references",
                "object_id": wb_id,
            })

        output["services"] = [service]

        return tidy_dict(output)
    def parse_item(self):
        '''
        Describe one FGDC CSDGM record as linked catalog/dataset objects.

        Reads self.elem, self.url and self.harvest_details and stores the
        tidied result on self.description; nothing is returned. The
        description carries a catalog record, a single dataset, its
        publisher and the keyword sets, tied together by "relationships"
        entries that point at "object_id" values.
        '''
        output = {}

        # sha urns of the urls already emitted, so a link that shows up
        # more than once in the record is only described once
        urls = set()

        harvest_date = self.harvest_details.get('harvest_date', '')

        def harvested_url(link, link_id, link_sha):
            # manifest entry for a link taken directly from the record
            return self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": link_id,
                "dc:identifier": link_sha
            })

        catalog_object_id = generate_uuid_urn()
        catalog_record = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # this is not necessary as a sha just for set inclusion
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = harvested_url(self.url, generate_uuid_urn(), url_sha)
        catalog_record['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        dataset_object_id = generate_uuid_urn()
        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": extract_item(
                self.elem, ['idinfo', 'datsetid']),
            "bcube:dateCreated": harvest_date,
            "bcube:lastUpdated": harvest_date,
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            # the dataset always points back at its metadata record
            "relationships": [{
                "relate": "bcube:hasMetadataRecord",
                "object_id": catalog_object_id
            }]
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # that's not even valid
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])

            dataset.update({
                "dc:spatial": to_wkt(
                    bbox_to_geom([west, south, east, north])),
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            # a date range, when present, overrides the single date
            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates

        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(
            self.elem, ['distinfo', 'stdorder', 'digform'])
        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            if not link:
                # a digital form without an online address has no url
                # to hash or reference
                continue
            url_sha = generate_sha_urn(link)
            if url_sha in urls:
                continue
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dataset['urls'].append(harvested_url(link, url_id, url_sha))
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha in urls:
                continue
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dataset['urls'].append(harvested_url(link, url_id, url_sha))
            # the citation link is modelled as a webpage that references
            # the url; the dataset then references the webpage
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

        catalog_record['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        # a record without a keywords section gives no element to walk
        key_children = key_elem.iterchildren() if key_elem is not None else []
        for child in key_children:
            key_type = extract_element_tag(child.tag)
            # the child tags abbreviate two of the types:
            # stratum -> stratkt/stratkey, temporal -> tempkt/tempkey
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    }))
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })

        output['datasets'] = [dataset]

        # add the metadata relate
        catalog_record['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset_object_id
        })

        output['catalog_records'] = [catalog_record]
        self.description = tidy_dict(output)