def _parse_distribution(self, elem):
        ''' from the distributionInfo element '''
        for dist_elem in extract_elems(elem, ['MD_Distribution']):
            # this is going to get ugly.
            # super ugly
            # get the transferoptions block
            # get the url, the name, the description, the size
            # get the format from a parent node
            # but where the transferoptions can be in some nested
            # distributor thing or at the root of the element (NOT
            # the root of the file)
            transfer_elems = extract_elems(
                dist_elem, ['//*', 'MD_DigitalTransferOptions'])
            for transfer_elem in transfer_elems:
                # _transfer = {}
                # transfer['url'] = extract_item(
                #     transfer_elem,
                #     ['onLine', 'CI_OnlineResource', 'linkage', 'URL'])
                # transfer['objectid'] = generate_sha_urn(transfer['url'])

                # xp = generate_localname_xpath(
                #     ['..', '..', 'distributorFormat', 'MD_Format'])
                # format_elem = next(iter(transfer_elem.xpath(xp)), None)
                # if format_elem is not None:
                #     transfer['format'] = ' '.join([
                #         extract_item(format_elem,
                #                      ['name', 'CharacterString']),
                #         extract_item(
                #             format_elem, ['version', 'CharacterString'])])
                link = extract_item(
                    transfer_elem,
                    ['onLine', 'CI_OnlineResource', 'linkage', 'URL'])
                yield link
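
# a standalone sketch (not part of the parser above) of the same extraction
# with bare lxml, assuming extract_elems/extract_item resolve their paths by
# local-name(); the sample document and namespace prefix are illustrative only
from lxml import etree

_sample_dist = etree.fromstring(b'''
<gmd:MD_Distribution xmlns:gmd="http://www.isotc211.org/2005/gmd">
  <gmd:transferOptions>
    <gmd:MD_DigitalTransferOptions>
      <gmd:onLine>
        <gmd:CI_OnlineResource>
          <gmd:linkage><gmd:URL>http://example.com/data.nc</gmd:URL></gmd:linkage>
        </gmd:CI_OnlineResource>
      </gmd:onLine>
    </gmd:MD_DigitalTransferOptions>
  </gmd:transferOptions>
</gmd:MD_Distribution>
''')

for _transfer in _sample_dist.xpath(
        './/*[local-name()="MD_DigitalTransferOptions"]'):
    # prints ['http://example.com/data.nc']
    print(_transfer.xpath(
        './/*[local-name()="linkage"]/*[local-name()="URL"]/text()'))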
    def _parse_item(self, elem):
        entry = {}

        entry['title'] = extract_item(elem, ['title'])
        entry['id'] = extract_item(elem, ['id'])
        entry['creator'] = extract_item(elem, ['creator'])
        entry['author'] = extract_item(elem, ['author', 'name'])
        entry['date'] = extract_item(elem, ['date'])
        entry['updated'] = extract_item(elem, ['updated'])
        entry['published'] = extract_item(elem, ['published'])

        entry['subjects'] = [
            e.attrib.get('term', '')
            for e in extract_elems(elem, ['category'])
        ]

        entry['contents'] = []
        contents = extract_elems(elem, ['content'])
        for content in contents:
            text = content.text.strip() if content.text else ''
            content_type = content.attrib.get('type', '')
            entry['contents'].append({'content': text, 'type': content_type})

        entry['links'] = []
        links = extract_elems(elem, ['link'])
        for link in links:
            href = link.attrib.get('href', '')
            rel = link.attrib.get('rel', '')
            entry['links'].append({'href': href, 'rel': rel})

        return tidy_dict(entry)
    def parse(self):
        elem = self.parser.xml
        ncml = {'variables': []}

        ncml['identifier'] = elem.attrib.get('location', '')
        for variable in extract_elems(elem, ['variable']):
            v = {}
            v['name'] = variable.attrib.get('name', '')
            v['attributes'] = []
            for att in extract_elems(variable, ['attribute']):
                a = {}
                for key, value in att.attrib.iteritems():
                    tag = extract_element_tag(key)
                    if tag == 'values':
                        continue
                    
                    a[tag] = value.strip()
                    
                if a:
                    v['attributes'] += [a]

            v = tidy_dict(v)
            if v:
                ncml['variables'].append(v)

        return tidy_dict(ncml)
    def _parse_child(self, child):
        entry = {}

        entry["title"] = extract_item(child, ["title"])
        entry["id"] = extract_item(child, ["id"])
        entry["creator"] = extract_item(child, ["creator"])
        entry["author"] = extract_item(child, ["author", "name"])
        entry["date"] = extract_item(child, ["date"])
        entry["updated"] = extract_item(child, ["updated"])
        entry["published"] = extract_item(child, ["published"])

        entry["subjects"] = [e.attrib.get("term", "") for e in extract_elems(child, ["category"])]

        entry["contents"] = []
        contents = extract_elems(child, ["content"])
        for content in contents:
            text = content.text.strip() if content.text else ""
            content_type = content.attrib.get("type", "")
            entry["contents"].append({"content": text, "type": content_type})

        entry["links"] = []
        links = extract_elems(child, ["link"])
        for link in links:
            href = link.attrib.get("href", "")
            rel = link.attrib.get("rel", "")
            entry["links"].append({"href": href, "rel": rel})

        return tidy_dict(entry)
    def parse_endpoints(self):
        '''
        JUST THE SERVICE ENDPOINTS (service and catalogRef elements
            at the root level)
        if the catalog contains service elements, a dataset element,
        or catalogRef elements, parse those as endpoints
        (relative paths and all of the tagging issues included)
        '''
        endpoints = []

        services = extract_elems(self.parser.xml, ['service'])
        # ffs, services can be nested too
        for service in services:
            description, child_endpoints = self._handle_elem(
                service,
                ['service'],
                self.url,
                {}
            )
            endpoints += [description]
            if child_endpoints:
                endpoints += child_endpoints

        catrefs = extract_elems(self.parser.xml, ['catalogRef'])
        for catref in catrefs:
            description, child_endpoints = self._handle_elem(
                catref,
                ['catalogRef', 'metadata'],
                self.url,
                {}  # TODO: so dap or file base path only? (not the full set,
                    # that makes no sense)
            )
            endpoints += [description] + child_endpoints

        return endpoints
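
# for context, a minimal THREDDS catalog of the shape parse_endpoints walks
# (service and catalogRef at the root); the extract_elems calls above are
# assumed to behave like the local-name() xpath shown here -- this sample is
# illustrative, not taken from the harvested data
from lxml import etree

_sample_catalog = etree.fromstring(b'''
<catalog xmlns="http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
         xmlns:xlink="http://www.w3.org/1999/xlink" name="demo">
  <service name="all" serviceType="Compound" base="">
    <service name="odap" serviceType="OPENDAP" base="/thredds/dodsC/"/>
  </service>
  <catalogRef xlink:href="sub/catalog.xml" xlink:title="sub catalog"/>
</catalog>
''')

# one root-level compound service (with a nested child) and one catalogRef
print(len(_sample_catalog.xpath('./*[local-name()="service"]')),
      len(_sample_catalog.xpath('./*[local-name()="catalogRef"]')))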
    def _parse_item(self, elem):
        entry = {}

        entry['title'] = extract_item(elem, ['title'])
        entry['id'] = extract_item(elem, ['id'])
        entry['creator'] = extract_item(elem, ['creator'])
        entry['author'] = extract_item(elem, ['author', 'name'])
        entry['date'] = extract_item(elem, ['date'])
        entry['updated'] = extract_item(elem, ['updated'])
        entry['published'] = extract_item(elem, ['published'])

        entry['subjects'] = [e.attrib.get('term', '') for e in extract_elems(elem, ['category'])]

        entry['contents'] = []
        contents = extract_elems(elem, ['content'])
        for content in contents:
            text = content.text.strip() if content.text else ''
            content_type = content.attrib.get('type', '')
            entry['contents'].append({'content': text, 'type': content_type})

        entry['links'] = []
        links = extract_elems(elem, ['link'])
        for link in links:
            href = link.attrib.get('href', '')
            rel = link.attrib.get('rel', '')
            entry['links'].append({'href': href, 'rel': rel})

        return tidy_dict(entry)
    def _handle_elem(self, elem, child_tags, base_url, service_bases):
        description = self._get_items(
            extract_element_tag(elem.tag), elem, base_url, service_bases
        )
        description['source'] = extract_element_tag(elem.tag)

        endpoints = []

        for child_tag in child_tags:
            elems = extract_elems(elem, [child_tag])

            for e in elems:
                e_desc = self._get_items(
                    extract_element_tag(e.tag), e, base_url, service_bases
                )

                e_desc['childOf'] = description.get('ID', '')
                e_desc['source'] = extract_element_tag(child_tag)

                parents = description.get('parentOf', [])
                parents += [ep['ID'] for ep in endpoints if 'childOf' in ep]
                description['parentOf'] = parents

                endpoints.append(e_desc)

        return description, endpoints
    def _handle_operations(self):
        elems = extract_elems(self.elem,
                              ['containsOperations', 'SV_OperationMetadata'])

        ops = []
        for e in elems:
            op = {}
            op['name'] = extract_item(e, ['operationName', 'CharacterString'])
            op['method'] = extract_attrib(e,
                                          ['DCP', 'DCPList', '@codeListValue'])
            op['url'] = extract_item(
                e, ['connectPoint', 'CI_OnlineResource', 'linkage', 'URL'])
            op['parameters'] = [
                self._handle_parameter(pe)
                for pe in extract_elems(e, ['parameters', 'SV_Parameter'])
            ]
            ops.append(op)

        return ops
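
# roughly the ISO 19119 fragment _handle_operations is aimed at (an assumed,
# illustrative record, not one taken from this codebase):
#
#   <srv:containsOperations>
#     <srv:SV_OperationMetadata>
#       <srv:operationName>
#         <gco:CharacterString>GetCapabilities</gco:CharacterString>
#       </srv:operationName>
#       <srv:DCP><srv:DCPList codeListValue="HTTPGet"/></srv:DCP>
#       <srv:connectPoint>
#         <gmd:CI_OnlineResource>
#           <gmd:linkage><gmd:URL>http://example.com/wms</gmd:URL></gmd:linkage>
#         </gmd:CI_OnlineResource>
#       </srv:connectPoint>
#     </srv:SV_OperationMetadata>
#   </srv:containsOperations>
#
# which would come back as something like:
#   {'name': 'GetCapabilities', 'method': 'HTTPGet',
#    'url': 'http://example.com/wms', 'parameters': []}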
    def _parse_keywords(self, elem):
        '''
        for each descriptiveKeywords block
        in an identification block
        '''
        keywords = []

        for key_elem in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            terms = extract_items(
                key_elem,
                ['MD_Keywords', 'keyword', 'CharacterString'])
            key_type = extract_attrib(
                key_elem,
                ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue']
            )
            thesaurus = extract_item(
                key_elem,
                [
                    'MD_Keywords',
                    'thesaurusName',
                    'CI_Citation',
                    'title',
                    'CharacterString'
                ]
            )

            if terms:
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # add a generic set for the iso topic category
        isotopics = extract_items(
            elem, ['topicCategory', 'MD_TopicCategoryCode'])
        if isotopics:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": 'IsoTopicCategories',
                    "bcube:hasValue": isotopics
                })
            )

        return keywords
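
# as a concrete (assumed, illustrative) mapping: a descriptiveKeywords block
# whose MD_Keywords holds two keyword/CharacterString values, an
# MD_KeywordTypeCode with codeListValue="theme" and a thesaurus CI_Citation
# titled "GCMD Science Keywords" would be emitted by _parse_keywords as:
#   {
#       "object_id": "urn:uuid:...",   # from generate_uuid_urn()
#       "dc:partOf": "GCMD Science Keywords",
#       "bcube:hasType": "theme",
#       "bcube:hasValue": ["Oceans", "Ocean Temperature"]
#   }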
    def parse_item(self, elem):
        identifier = extract_item(self.elem, ['Entry_ID'])
        title = extract_item(self.elem, ['Entry_Title'])
        keywords = extract_items(self.elem, ['Keyword'])
        keywords += extract_items(self.elem, ['ISO_Topic_Category'])
        abstract = extract_item(self.elem, ['Summary'])
        organization = extract_item(self.elem, ['Originating_Center'])

        # temporal extent
        start_date = extract_item(self.elem,
                                  ['Temporal_Coverage', 'Start_Date'])
        end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
        temporal = [start_date, end_date] if start_date and end_date else []

        # spatial extent
        west = extract_item(self.elem,
                            ['Spatial_Coverage', 'Westernmost_Longitude'])
        east = extract_item(self.elem,
                            ['Spatial_Coverage', 'Easternmost_Longitude'])
        south = extract_item(self.elem,
                             ['Spatial_Coverage', 'Southernmost_Latitude'])
        north = extract_item(self.elem,
                             ['Spatial_Coverage', 'Northernmost_Latitude'])
        bbox = [west, south, east, north] if \
            west and east and north and south else []
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
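        # e.g. west/south/east/north of -180/-90/180/90 should come back as a
        # 'POLYGON ((...))' WKT string, assuming bbox_to_geom/to_wkt wrap a
        # shapely-style box -> geometry -> wkt conversion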

        distributions = []
        for related_url in extract_elems(self.elem, ['Related_URL']):
            url = extract_item(related_url, ['URL'])
            content_type = extract_item(related_url,
                                        ['URL_Content_Type', 'Type'])
            description = extract_item(related_url, ['Description'])
            dist = tidy_dict({
                "url": url,
                "description": description,
                "content_type": content_type
            })
            if dist:
                distributions.append(dist)

        return tidy_dict({
            "id": identifier,
            "title": title,
            "keywords": keywords,
            "abstract": abstract,
            "organization": organization,
            "bbox": bbox,
            "temporal": temporal,
            "distributions": distributions
        })
    def parse_item(self, elem):
        identifier = extract_item(self.elem, ['Entry_ID'])
        title = extract_item(self.elem, ['Entry_Title'])
        keywords = extract_items(self.elem, ['Keyword'])
        keywords += extract_items(self.elem, ['ISO_Topic_Category'])
        abstract = extract_item(self.elem, ['Summary'])
        organization = extract_item(self.elem, ['Originating_Center'])

        # temporal extent
        start_date = extract_item(
            self.elem, ['Temporal_Coverage', 'Start_Date'])
        end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
        temporal = [start_date, end_date] if start_date and end_date else []

        # spatial extent
        west = extract_item(
            self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
        east = extract_item(
            self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
        south = extract_item(
            self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
        north = extract_item(
            self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
        bbox = [west, south, east, north] if \
            west and east and north and south else []
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)

        distributions = []
        for related_url in extract_elems(self.elem, ['Related_URL']):
            url = extract_item(related_url, ['URL'])
            content_type = extract_item(
                related_url, ['URL_Content_Type', 'Type'])
            description = extract_item(related_url, ['Description'])
            dist = tidy_dict({
                "url": url,
                "description": description,
                "content_type": content_type
            })
            if dist:
                distributions.append(dist)

        return tidy_dict({
            "id": identifier,
            "title": title,
            "keywords": keywords,
            "abstract": abstract,
            "organization": organization,
            "bbox": bbox,
            "temporal": temporal,
            "distributions": distributions
        })
    def _parse_metadata(self):
        endpoints = []
        # metadatas = self.parser.find(metadata_xpath)
        metadatas = extract_elems(self.parser.xml, ['metadata'])
        for metadata in metadatas:
            description, child_endpoints = self._handle_elem(
                metadata,
                [],
                self.url,
                self.service_bases
            )
            endpoints += [description] + child_endpoints

        return {"endpoints": endpoints}
    def _handle_operations(self):
        elems = extract_elems(
            self.elem,
            ['containsOperations', 'SV_OperationMetadata'])

        ops = []
        for e in elems:
            op = {}
            op['name'] = extract_item(
                e,
                ['operationName', 'CharacterString'])
            op['method'] = extract_attrib(
                e,
                ['DCP', 'DCPList', '@codeListValue'])
            op['url'] = extract_item(
                e,
                ['connectPoint', 'CI_OnlineResource', 'linkage', 'URL'])
            op['parameters'] = [
                self._handle_parameter(pe) for pe in
                extract_elems(e, ['parameters', 'SV_Parameter'])]
            ops.append(op)

        return ops
    def _parse_datasets(self):

        # get the level-one children (catalog->child)
        endpoints = []

        datasets = extract_elems(self.parser.xml, ['dataset'])
        # datasets = self.parser.find(dataset_xpath)
        for dataset in datasets:
            description, child_endpoints = self._handle_elem(
                dataset, ['dataset', 'metadata', 'catalogRef'],
                self.url,
                self.service_bases
            )
            endpoints += [description] + child_endpoints

        return {"endpoints": endpoints}
    def parse_children(self, elem=None, tags=None):
        '''
        elem is the parent node for the set and tags is the
        un-namespaced list of explicit items to parse; if tags
        is not specified, run over the children one level down
        '''
        elem = self.parser.xml if elem is None else elem
        if tags:
            children = extract_elems(elem, tags)
        else:
            children = list(elem.iterchildren())

        for child in children:
            parsed = self._parse_child(child)
            if parsed:
                yield parsed
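
# a hedged usage sketch: `reader` is a hypothetical instance of the class this
# generator belongs to, with parser.xml already loaded
#
#   for entry in reader.parse_children(tags=['entry']):
#       print(entry.get('title'))
#
# with no tags, the generator walks the immediate children of the feed root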
    def parse(self):
        '''
        key = entry for atom and item for rss
        '''
        key = 'entry' if self.dialect == 'atom' else 'item'
        elems = extract_elems(self.parser.xml, ['//*', key])
        items = [self.item_class(elem).item for elem in elems]

        # TODO: add the root level parsing, ie the difference btwn atom and rss
        title = extract_item(self.parser.xml, ['title'])
        updated = extract_item(self.parser.xml, ['updated'])
        author_name = extract_item(self.parser.xml, ['author', 'name'])

        return {
            "title": title,
            "updated": updated,
            "author": author_name,
            "items": items
        }
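
# the ['//*', 'entry'] style path used above is assumed to expand to a
# namespace-agnostic lookup; the bare-lxml equivalent for an Atom feed would
# be roughly:
#   self.parser.xml.xpath('//*[local-name()="entry"]')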
    def _parse_keywords(self, elem):
        '''
        for each descriptiveKeywords block
        in an identification block
        '''
        keywords = []

        for key_elem in extract_elems(elem, ['descriptiveKeywords']):
            # TODO: split these up (if *-delimited in some way)
            terms = extract_items(
                key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
            key_type = extract_attrib(key_elem, [
                'MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'
            ])
            thesaurus = extract_item(key_elem, [
                'MD_Keywords', 'thesaurusName', 'CI_Citation', 'title',
                'CharacterString'
            ])

            if terms:
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    }))

        # TODO: add the Anchor element handling
        #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

        # add a generic set for the iso topic category
        isotopics = extract_items(elem,
                                  ['topicCategory', 'MD_TopicCategoryCode'])
        if isotopics:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": 'IsoTopicCategories',
                    "bcube:hasValue": isotopics
                }))

        return keywords
    def parse(self):
        # get the series
        self.description = {}
        md = extract_elem(self.elem, ['seriesMetadata', 'MD_Metadata'])
        if md is None:
            return

        md_parser = MxParser(md)
        md_parser.parse()
        self.description = md_parser.description
        self.description['children'] = []

        # get the children
        children = extract_elems(
            self.elem, ['composedOf', 'DS_DataSet', 'has', 'MD_Metadata'])
        for child in children:
            child_parser = MxParser(child)
            child_parser.parse()
            if child_parser.description:
                self.description['children'].append(child_parser.description)

        self.description = tidy_dict(self.description)
    def _parse_children(self, dialect):
        elems = extract_elems(self.parser.xml, ['ListRecords', 'record'])
        return [self._parse_child(child, dialect) for child in elems]
    def _get_items(self, tag, elem, base_url, service_bases):
        '''
        return any structure not part of the
        current element's attributes
        '''

        def _normalize_key(key):
            '''
            standardize the url (or other) xml tags to the desired
            json key

            as source key: endpoint key
            '''
            remaps = {
                "serviceType": "type",
                "href": "url",
                "base": "url",
                "urlPath": "url"
            }

            if key in remaps:
                return remaps[key]
            return key

        def _run_element(elem, service_bases):
            '''
            for a given element, return any text() and any attribute value
            '''
            # run a generated xpath on the given element
            children = elem.xpath('./node()[local-name()!="metadata" '
                                  'and local-name()!="dataset" and '
                                  'local-name()!="catalogRef"]')

            element = {_normalize_key(extract_element_tag(k)): v for k, v
                       in elem.attrib.iteritems()}
            element = self._manage_id(element)

            for child in children:
                value = child.text
                # xp = generate_qualified_xpath(child, True)
                tag = _normalize_key(extract_element_tag(child.tag))

                if value:
                    element[tag] = value

                for k, v in child.attrib.iteritems():
                    if v:
                        element[tag + '_' + _normalize_key(extract_element_tag(k))] = v

            # get the service bases in case
            if [g for g in element.keys() if g.endswith('serviceName')]:
                sbs = [v for k, v in service_bases.iteritems() if k == element.get('serviceName')]
            else:
                sbs = service_bases.values()

            # send a unique list of base relative paths
            sbs = list(set(sbs))

            url_key = next(iter([g for g in element.keys() if g.endswith('url')]), '')
            if url_key:
                # for service urls, if catalog.xml isn't appended it will resolve to
                # the html endpoint (not desired). so if the path equals the/a path in
                # the service bases, append catalog.xml to the path

                # elem_url = element[url_key]
                # if elem_url in sbs or not sbs:
                #     elem_url += ('' if elem_url.endswith('/') else '/') + 'catalog.xml'
                # # element['url'] = intersect_url(base_url, elem_url, sbs)
                # # element['url'] = base_url

                # let's generate the link
                tl = ThreddsLink(elem, self.url, sbs)
                element['url'] = tl.urls

                element['actionable'] = 2

            return element

        children = extract_elems(elem, ['metadata']) + \
            extract_elems(elem, ['dataset']) + \
            extract_elems(elem, ['catalogRef'])

        # elem.xpath('./node()[local-name()="metadata" or ' +
        #                       'local-name()="dataset" or local-name()="catalogRef"]')

        element = _run_element(elem, service_bases)
        element_children = []
        for c in children:
            element_desc = _run_element(c, service_bases)
            element_children.append(element_desc)

        if element_children:
            element['children'] = element_children

        return element
    def _parse_service(self):
        output = {}
        urls = set()

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "rdf:type": 'OpenSearch1.1:Description',
            "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
            "dc:description": ' '.join(
                extract_items(self.parser.xml, ["LongName"]) +
                extract_items(self.parser.xml, ["Description"])),
            "urls": [],
            "webpages": [],
            "relationships": []
        }
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn(),
                "dc:identifier": url_sha
            })
        service['urls'].append(original_url)
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        # output['source'] = extract_items(
        #   self.parser.xml, ["Attribution"])
        # output['contact'] = extract_items(
        #     self.parser.xml, ["Developer"])
        # output['rights'] = extract_items(
        #   self.parser.xml, ["SyndicationRight"])

        key_id = generate_uuid_urn()
        output['keywords'] = [{
            "object_id": key_id,
            "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
        }]
        service['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": key_id
        })

        for t in extract_elems(self.parser.xml, ['Url']):
            ep = self._parse_endpoint(t)
            url_sha = generate_sha_urn(ep['url'])
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Generated",
                        "bcube:hasConfidence": "Not Sure",
                        "vcard:hasURL": ep['url'],
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                service['urls'].append(dist)
                wb_id = generate_uuid_urn()
                service['webpages'].append({
                    "object_id": wb_id,
                    "relationships": [{
                        "relate": "dcterms:references",
                        "object_id": url_id
                    }]
                })
                service['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": wb_id
                })

        output['services'] = [service]

        return tidy_dict(output)
    def parse(self):
        '''
        run the routing
        '''

        if not self.identity:
            # we're going to have to sort it out
            self.identity = {}

        metadata = self.identity.get('metadata', {})
        if not metadata:
            return {}

        metadata_type = metadata.get('name', '')
        if not metadata_type:
            return {}

        # TODO: this is unlikely to be correct, given the ds record
        #       but we're not going there just yet
        # TODO: deal with conformsTo (multiple schemaLocations, etc)
        catalog_record = {
            "object_id": generate_uuid_urn(),
            "rdf:type": self._version_to_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.parser.xml, ['@schemaLocation']).split(),
            "relationships": [],
            "urls": []
        }
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn()
        })
        catalog_record['urls'].append(original_url)
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        if metadata_type == 'Data Series':
            # run the set
            self.reader = DsParser(self.parser.xml, catalog_record)
        elif metadata_type == '19119':
            # run that
            for srv in extract_elems(
                self.parser.xml,
                    ['identificationInfo', 'SV_ServiceIdentification']):
                reader = SrvParser(srv, catalog_record)
                reader.parse()
        elif metadata_type == '19115':
            # it's a mi/md so run that
            self.reader = MxParser(
                self.parser.xml,
                catalog_record,
                self.harvest_details
            )
            self.reader.parse()

        # self.reader.parse()
        # # pass it back up the chain a bit
        self.description = self.reader.output
    def _parse_service(self):
        output = {}
        urls = set()

        service = {
            "object_id": generate_uuid_urn(),
            "bcube:dateCreated": self.harvest_details.get("harvest_date", ""),
            "bcube:lastUpdated": self.harvest_details.get("harvest_date", ""),
            "rdf:type": "OpenSearch1.1:Description",
            "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
            "dc:description": " ".join(
                extract_items(self.parser.xml, ["LongName"]) + extract_items(self.parser.xml, ["Description"])
            ),
            "urls": [],
            "webpages": [],
            "relationships": [],
        }
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn(),
                "dc:identifier": url_sha,
            }
        )
        service["urls"].append(original_url)
        service["relationships"].append({"relate": "bcube:originatedFrom", "object_id": original_url["object_id"]})

        # output['source'] = extract_items(
        #   self.parser.xml, ["Attribution"])
        # output['contact'] = extract_items(
        #     self.parser.xml, ["Developer"])
        # output['rights'] = extract_items(
        #   self.parser.xml, ["SyndicationRight"])

        key_id = generate_uuid_urn()
        output["keywords"] = [{"object_id": key_id, "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])}]
        service["relationships"].append({"relate": "dc:conformsTo", "object_id": key_id})

        for t in extract_elems(self.parser.xml, ["Url"]):
            ep = self._parse_endpoint(t)
            url_sha = generate_sha_urn(ep["url"])
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Generated",
                        "bcube:hasConfidence": "Not Sure",
                        "vcard:hasURL": ep["url"],
                        "object_id": url_id,
                        "dc:identifier": url_sha,
                    }
                )
                service["urls"].append(dist)
                wb_id = generate_uuid_urn()
                service["webpages"].append(
                    {"object_id": wb_id, "relationships": [{"relate": "dcterms:references", "object_id": url_id}]}
                )
                service["relationships"].append({"relate": "dcterms:references", "object_id": wb_id})

        output["services"] = [service]

        return tidy_dict(output)
    def parse(self):
        '''
        from the root node, parse:
            identification (title, abstract, point of contact, keywords, extent)
            if identificationInfo contains SV_ServiceIdentification, add as child
            distribution info
        '''
        # set up the url set
        urls = set()
        urls.add(self.output['catalog_record']['urls'][0]['object_id'])

        for id_elem in extract_elems(
                self.elem,
                ['//*', 'identificationInfo', 'MD_DataIdentification']):
            dataset, keywords = self._parse_identification_info(id_elem)
            dataset['relationships'].append({
                "relate": "bcube:hasMetadataRecord",
                "object_id": self.output['catalog_record']['object_id']
            })
            dataset.update({
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', '')
            })
            self.output['catalog_record']['relationships'].append({
                "relate": "foaf:primaryTopic",
                "object_id": dataset['object_id']
            })

            # point of contact from the root node and this might be an issue
            # in things like the -1/-3 from ngdc so try for an idinfo blob
            poc_elem = extract_elem(id_elem, [
                'identificationInfo',
                'MD_DataIdentification',
                'pointOfContact',
                'CI_ResponsibleParty'])
            # if poc_elem is None:
            #     # and if that fails try for the root-level contact
            #     poc_elem = extract_elem(
            #         self.elem,
            #         ['contact', 'CI_ResponsibleParty'])

            # TODO: point of contact is not necessarily the publisher
            if poc_elem is not None:
                poc = self._parse_responsibleparty(poc_elem)
                location = (
                    ' '.join(
                        [poc['contact'].get('city', ''),
                         poc['contact'].get('country', '')])
                ).strip() if poc.get('contact', {}) else ''

                publisher = tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "name": poc.get('organization', ''),
                    "location": location
                })
                self.output['publishers'].append(publisher)
                dataset['relationships'].append({
                    "relate": "dcterms:publisher",
                    "object_id": publisher['object_id']
                })

            dataset['urls'] = []
            dist_elems = extract_elems(self.elem, ['distributionInfo'])
            for dist_elem in dist_elems:
                for d in self._parse_distribution(dist_elem):
                    if not d:
                        continue
                    url_sha = generate_sha_urn(d)
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        dist = self._generate_harvest_manifest(**{
                            "bcube:hasUrlSource": "Harvested",
                            "bcube:hasConfidence": "Good",
                            "vcard:hasURL": d,
                            "object_id": url_id,
                            "dc:identifier": url_sha
                        })
                        dataset['urls'].append(dist)
                        dataset['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

            self.output['datasets'].append(dataset)
            self.output['keywords'] += keywords

        # TODO: removing this until we have a definition for SERVICE
        # # check for the service elements
        # service_elems = extract_elems(self.elem,
        #     ['identificationInfo', 'SV_ServiceIdentification'])
        # self.description['services'] = []
        # for service_elem in service_elems:
        #     sv = SrvParser(service_elem)
        #     self.description['services'].append(sv.parse())

        # switch the catalog record to a list for conformity. eep.
        self.output['catalog_records'] = [self.output['catalog_record']]
        del self.output['catalog_record']
        self.description = tidy_dict(self.output)
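
# after parse(), self.description (given the output dict assumed to be set up
# in the class constructor) has roughly this shape:
#   {
#       "catalog_records": [{"object_id": ..., "urls": [...],
#                            "relationships": [...]}],
#       "datasets": [{"object_id": ..., "urls": [...],
#                     "relationships": [...], ...}],
#       "keywords": [...],
#       "publishers": [...]
#   }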
    def parse_item(self):
        output = {}

        urls = set()

        catalog_object_id = generate_uuid_urn()

        output['catalog_record'] = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # this doesn't need to be a sha; it's just for set inclusion
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": generate_uuid_urn(),
            "dc:identifier": url_sha
        })
        output['catalog_record']['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        output['catalog_record']['relationships'].append(
            {
                "relate": "bcube:originatedFrom",
                "object_id": original_url['object_id']
            }
        )

        datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
        dataset_object_id = generate_uuid_urn()

        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": datsetid,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            "relationships": []
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # that's not even valid
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])
            bbox = [west, south, east, north]
            bbox = bbox_to_geom(bbox)
            bbox = to_wkt(bbox)

            dataset.update({
                "dc:spatial": bbox,
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates

        dataset['relationships'] = [
            {
                "relate": "bcube:hasMetadataRecord",
                "object_id": catalog_object_id
            }
        ]

        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(
            self.elem, ['distinfo', 'stdorder', 'digform'])

        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": link,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
                dataset['urls'].append(dist)
                # this is a distribution link so
                # we are assuming it is to data
                dataset['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": url_id
                })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(**{
                    "bcube:hasUrlSource": "Harvested",
                    "bcube:hasConfidence": "Good",
                    "vcard:hasURL": link,
                    "object_id": url_id,
                    "dc:identifier": url_sha
                })
                dataset['urls'].append(dist)
                webpages.append({
                    "object_id": generate_uuid_urn(),
                    "relationships": [
                        {
                            "relate": "dcterms:references",
                            "object_id": url_id
                        }
                    ]}
                )

        output['catalog_record']['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        children = key_elem.iterchildren() if key_elem is not None else []
        for child in children:
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) we may not
                # have a thesaurus name
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append(
                {
                    "relate": "dc:conformsTo",
                    "object_id": keyword['object_id']
                }
            )

        output['datasets'] = [dataset]

        # add the metadata relate
        output['catalog_record']['relationships'].append(
            {
                "relate": "foaf:primaryTopic",
                "object_id": dataset_object_id
            }
        )

        output['catalog_records'] = [output['catalog_record']]
        del output['catalog_record']
        self.description = tidy_dict(output)
    def parse(self):
        '''
        from the root node, parse:
            identification (title, abstract, point of contact, keywords, extent)
            if identificationInfo contains SV_ServiceIdentification, add as child
            distribution info
        '''
        # set up the url set
        urls = set()
        urls.add(self.output['catalog_record']['urls'][0]['object_id'])

        for id_elem in extract_elems(
                self.elem,
                ['//*', 'identificationInfo', 'MD_DataIdentification']):
            dataset, keywords = self._parse_identification_info(id_elem)
            dataset['relationships'].append({
                "relate": "bcube:hasMetadataRecord",
                "object_id": self.output['catalog_record']['object_id']
            })
            dataset.update({
                "bcube:dateCreated":
                    self.harvest_details.get('harvest_date', ''),
                "bcube:lastUpdated":
                    self.harvest_details.get('harvest_date', '')
            })
            self.output['catalog_record']['relationships'].append({
                "relate": "foaf:primaryTopic",
                "object_id": dataset['object_id']
            })

            # point of contact from the root node and this might be an issue
            # in things like the -1/-3 from ngdc so try for an idinfo blob
            poc_elem = extract_elem(id_elem, [
                'identificationInfo', 'MD_DataIdentification',
                'pointOfContact', 'CI_ResponsibleParty'
            ])
            # if poc_elem is None:
            #     # and if that fails try for the root-level contact
            #     poc_elem = extract_elem(
            #         self.elem,
            #         ['contact', 'CI_ResponsibleParty'])

            # TODO: point of contact is not necessarily the publisher
            if poc_elem is not None:
                poc = self._parse_responsibleparty(poc_elem)
                location = (' '.join([
                    poc['contact'].get('city', ''), poc['contact'].get(
                        'country', '')
                ])).strip() if poc.get('contact', {}) else ''

                publisher = tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "name": poc.get('organization', ''),
                    "location": location
                })
                self.output['publishers'].append(publisher)
                dataset['relationships'].append({
                    "relate": "dcterms:publisher",
                    "object_id": publisher['object_id']
                })

            dataset['urls'] = []
            dist_elems = extract_elems(self.elem, ['distributionInfo'])
            for dist_elem in dist_elems:
                for d in self._parse_distribution(dist_elem):
                    if not d:
                        continue
                    url_sha = generate_sha_urn(d)
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        dist = self._generate_harvest_manifest(
                            **{
                                "bcube:hasUrlSource": "Harvested",
                                "bcube:hasConfidence": "Good",
                                "vcard:hasURL": d,
                                "object_id": url_id,
                                "dc:identifier": url_sha
                            })
                        dataset['urls'].append(dist)
                        dataset['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

            self.output['datasets'].append(dataset)
            self.output['keywords'] += keywords

        # TODO: removing this until we have a definition for SERVICE
        # # check for the service elements
        # service_elems = extract_elems(self.elem,
        #     ['identificationInfo', 'SV_ServiceIdentification'])
        # self.description['services'] = []
        # for service_elem in service_elems:
        #     sv = SrvParser(service_elem)
        #     self.description['services'].append(sv.parse())

        # switch the catalog record to a list for conformity. eep.
        self.output['catalog_records'] = [self.output['catalog_record']]
        del self.output['catalog_record']
        self.description = tidy_dict(self.output)
    def parse(self):
        '''
        run the routing
        '''

        if not self.identity:
            # we're going to have to sort it out
            self.identity = {}

        metadata = self.identity.get('metadata', {})
        if not metadata:
            return {}

        metadata_type = metadata.get('name', '')
        if not metadata_type:
            return {}

        # TODO: this is unlikely to be correct, given the ds record
        #       but we're not going there just yet
        # TODO: deal with conformsTo (multiple schemaLocations, etc)
        catalog_record = {
            "object_id": generate_uuid_urn(),
            "rdf:type": self._version_to_urn(),
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.parser.xml, ['@schemaLocation']).split(),
            "relationships": [],
            "urls": []
        }
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn()
            })
        catalog_record['urls'].append(original_url)
        catalog_record['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        if metadata_type == 'Data Series':
            # run the set
            self.reader = DsParser(self.parser.xml, catalog_record)
        elif metadata_type == '19119':
            # run that
            for srv in extract_elems(
                    self.parser.xml,
                    ['identificationInfo', 'SV_ServiceIdentification']):
                reader = SrvParser(srv, catalog_record)
                reader.parse()
        elif metadata_type == '19115':
            # it's a mi/md so run that
            self.reader = MxParser(self.parser.xml, catalog_record,
                                   self.harvest_details)
            self.reader.parse()

        # self.reader.parse()
        # # pass it back up the chain a bit
        self.description = self.reader.output
    def parse_item(self):
        output = {}

        urls = set()

        catalog_object_id = generate_uuid_urn()

        output['catalog_record'] = {
            "object_id": catalog_object_id,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            # "dc:conformsTo": extract_attrib(
            #     self.elem, ['@noNamespaceSchemaLocation']).split(),
            "rdf:type": "FGDC:CSDGM",
            "relationships": [],
            "urls": []
        }
        output['urls'] = []

        # add the harvest info
        # the sha is only used here for set-based deduplication of urls
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(
            **{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": generate_uuid_urn(),
                "dc:identifier": url_sha
            })
        output['catalog_record']['urls'].append(original_url)
        # NOTE: relate by the url record's object_id (a uuid), not its sha
        output['catalog_record']['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": original_url['object_id']
        })

        datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
        dataset_object_id = generate_uuid_urn()

        dataset = {
            "object_id": dataset_object_id,
            "dcterms:identifier": datsetid,
            "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
            "dc:description": extract_item(
                self.elem, ['idinfo', 'descript', 'abstract']),
            "dcterms:title": extract_item(
                self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
            "urls": [],
            "relationships": []
        }

        bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
        if bbox_elem is not None:
            # extract the bounds and convert the bbox to wkt for dc:spatial
            west = extract_item(bbox_elem, ['westbc'])
            east = extract_item(bbox_elem, ['eastbc'])
            north = extract_item(bbox_elem, ['northbc'])
            south = extract_item(bbox_elem, ['southbc'])
            bbox = [west, south, east, north]
            bbox = bbox_to_geom(bbox)
            bbox = to_wkt(bbox)

            dataset.update({
                "dc:spatial": bbox,
                "esip:westBound": west,
                "esip:eastBound": east,
                "esip:northBound": north,
                "esip:southBound": south
            })

        time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
        if time_elem is not None:
            caldate = extract_item(time_elem, ['sngdate', 'caldate'])
            if caldate:
                # TODO: we should see if it's at least a valid date
                dataset['esip:startDate'] = self._convert_date(caldate)

            rngdate = extract_elem(time_elem, ['rngdates'])
            if rngdate is not None:
                dataset['esip:startDate'] = self._convert_date(
                    extract_item(rngdate, ['begdate']))
                dataset['esip:endDate'] = self._convert_date(
                    extract_item(rngdate, ['enddate']))
            # TODO: add the min/max of the list of dates
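            # note: FGDC caldate/begdate/enddate are typically YYYYMMDD
            # strings; _convert_date is assumed to normalize them to ISO dates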

        dataset['relationships'] = [{
            "relate": "bcube:hasMetadataRecord",
            "object_id": catalog_object_id
        }]

        # publisher pulled from the citation's pubinfo block
        publisher = {
            "object_id": generate_uuid_urn(),
            "name": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
            "location": extract_item(
                self.elem,
                ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
        }
        output['publisher'] = publisher
        dataset['relationships'].append({
            "relate": "dcterms:publisher",
            "object_id": publisher['object_id']
        })

        distrib_elems = extract_elems(self.elem,
                                      ['distinfo', 'stdorder', 'digform'])

        for distrib_elem in distrib_elems:
            link = extract_item(
                distrib_elem,
                ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
            # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": link,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                dataset['urls'].append(dist)
                # this is a distribution link so
                # we are assuming it is to data
                dataset['relationships'].append({
                    "relate": "dcterms:references",
                    "object_id": url_id
                })

        webpages = []
        onlink_elems = extract_elems(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
        for onlink_elem in onlink_elems:
            link = onlink_elem.text.strip() if onlink_elem.text else ''
            if not link:
                continue
            url_sha = generate_sha_urn(link)
            if url_sha not in urls:
                urls.add(url_sha)
                url_id = generate_uuid_urn()
                dist = self._generate_harvest_manifest(
                    **{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": link,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                dataset['urls'].append(dist)
                webpages.append({
                    "object_id": generate_uuid_urn(),
                    "relationships": [{
                        "relate": "dcterms:references",
                        "object_id": url_id
                    }]
                })

        output['catalog_record']['webpages'] = webpages
        for webpage in webpages:
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": webpage['object_id']
            })

        # retain the keyword sets with type, thesaurus name and split
        # the terms as best we can
        keywords = []
        key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
        if key_elem is not None:
            for child in key_elem.iterchildren():
                key_type = extract_element_tag(child.tag)
                # the thesaurus/term tags use abbreviated forms of the type
                key_tag = 'strat' if key_type == 'stratum' else key_type
                key_tag = 'temp' if key_tag == 'temporal' else key_tag
                thesaurus = extract_item(child, ['%skt' % key_tag])

                # TODO: split these up
                terms = extract_items(child, ['%skey' % key_tag])

                if terms:
                    # if there's a parsing error (bad cdata, etc) we may
                    # not have any terms
                    # TODO: add something for a set without a thesaurus name
                    keywords.append(
                        tidy_dict({
                            "object_id": generate_uuid_urn(),
                            "dc:partOf": thesaurus,
                            "bcube:hasType": key_type,
                            "bcube:hasValue": terms
                        }))
        output['keywords'] = keywords
        for keyword in keywords:
            dataset['relationships'].append({
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            })

        output['datasets'] = [dataset]

        # add the metadata relate
        output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset_object_id
        })

        output['catalog_records'] = [output['catalog_record']]
        del output['catalog_record']
        self.description = tidy_dict(output)
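
# `_generate_harvest_manifest` is called with keyword triples throughout but
# isn't defined in this snippet. A minimal sketch of the assumed behavior --
# echo the supplied triples as a url record, dropping empties -- not the
# actual implementation:
def _generate_harvest_manifest_sketch(**kwargs):
    '''sketch: build a url record from the supplied triples'''
    # the callers above always provide object_id, vcard:hasURL,
    # bcube:hasUrlSource and bcube:hasConfidence (plus an optional
    # dc:identifier sha)
    return dict((k, v) for k, v in kwargs.items() if v)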