def parse(self):
    elem = self.parser.xml
    ncml = {'variables': []}
    ncml['identifier'] = elem.attrib.get('location', '')

    for variable in extract_elems(elem, ['variable']):
        v = {}
        v['name'] = variable.attrib.get('name', '')
        v['attributes'] = []
        for att in extract_elems(variable, ['attribute']):
            a = {}
            for key, value in att.attrib.iteritems():
                tag = extract_element_tag(key)
                if tag == 'values':
                    continue
                a[tag] = value.strip()
            if a:
                v['attributes'] += [a]
        v = tidy_dict(v)
        if v:
            ncml['variables'].append(v)

    return tidy_dict(ncml)
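# Illustrative sketch only (not part of the parser), assuming extract_elems and
# extract_element_tag behave as their names suggest: for an NcML fragment like
#   <netcdf location="sst.nc">
#     <variable name="sst"><attribute name="units" value="K"/></variable>
#   </netcdf>
# parse() would return roughly
#   {'identifier': 'sst.nc',
#    'variables': [{'name': 'sst',
#                   'attributes': [{'name': 'units', 'value': 'K'}]}]}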
def _parse_keywords(self, elem):
    '''
    for each descriptiveKeywords block in an identification block
    '''
    keywords = []

    for key_elem in extract_elems(elem, ['descriptiveKeywords']):
        # TODO: split these up (if *-delimited in some way)
        terms = extract_items(
            key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
        key_type = extract_attrib(
            key_elem,
            ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue']
        )
        thesaurus = extract_item(
            key_elem,
            ['MD_Keywords', 'thesaurusName', 'CI_Citation', 'title',
             'CharacterString']
        )

        if terms:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )

    # TODO: add the Anchor element handling
    # ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

    # add a generic set for the iso topic category
    isotopics = extract_items(
        elem, ['topicCategory', 'MD_TopicCategoryCode'])
    if isotopics:
        keywords.append(
            tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": isotopics
            })
        )

    return keywords
def parse_item(self, elem):
    identifier = extract_item(self.elem, ['Entry_ID'])
    title = extract_item(self.elem, ['Entry_Title'])
    keywords = extract_items(self.elem, ['Keyword'])
    keywords += extract_items(self.elem, ['ISO_Topic_Category'])
    abstract = extract_item(self.elem, ['Summary'])
    organization = extract_item(self.elem, ['Originating_Center'])

    # temporal extent
    start_date = extract_item(
        self.elem, ['Temporal_Coverage', 'Start_Date'])
    end_date = extract_item(
        self.elem, ['Temporal_Coverage', 'End_Date'])
    temporal = [start_date, end_date] if start_date and end_date else []

    # spatial extent
    west = extract_item(
        self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
    east = extract_item(
        self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
    south = extract_item(
        self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
    north = extract_item(
        self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
    bbox = [west, south, east, north] if \
        west and east and north and south else []
    bbox = bbox_to_geom(bbox)
    bbox = to_wkt(bbox)

    distributions = []
    for related_url in extract_elems(self.elem, ['Related_URL']):
        url = extract_item(related_url, ['URL'])
        content_type = extract_item(
            related_url, ['URL_Content_Type', 'Type'])
        description = extract_item(related_url, ['Description'])
        dist = tidy_dict({
            "url": url,
            "description": description,
            "content_type": content_type
        })
        if dist:
            distributions.append(dist)

    return tidy_dict({
        "id": identifier,
        "title": title,
        "keywords": keywords,
        "abstract": abstract,
        "organization": organization,
        "bbox": bbox,
        "temporal": temporal,
        "distributions": distributions
    })
def _parse_child(self, child):
    entry = {}
    entry["title"] = extract_item(child, ["title"])
    entry["id"] = extract_item(child, ["id"])
    entry["creator"] = extract_item(child, ["creator"])
    entry["author"] = extract_item(child, ["author", "name"])
    entry["date"] = extract_item(child, ["date"])
    entry["updated"] = extract_item(child, ["updated"])
    entry["published"] = extract_item(child, ["published"])
    entry["subjects"] = [e.attrib.get("term", "")
                         for e in extract_elems(child, ["category"])]

    entry["contents"] = []
    contents = extract_elems(child, ["content"])
    for content in contents:
        text = content.text.strip() if content.text else ""
        content_type = content.attrib.get("type", "")
        entry["contents"].append({"content": text, "type": content_type})

    entry["links"] = []
    links = extract_elems(child, ["link"])
    for link in links:
        href = link.attrib.get("href", "")
        rel = link.attrib.get("rel", "")
        entry["links"].append({"href": href, "rel": rel})

    return tidy_dict(entry)
def _extract_params(self, endpoint):
    def _extract_prefix(param):
        pattern = "\{{0,1}(\S*):([\S][^}]*)"
        # TODO: this is probably a bad assumption (that there's just the
        # one item in the list, not that urlparse returns the terms as a list)
        if isinstance(param, list):
            param = param[0]
        if ":" not in param:
            return ("", param)
        m = re.search(pattern, param)
        return m.groups()

    _parameter_formats = {
        "geo:box": "west, south, east, north",
        "time:start": "YYYY-MM-DDTHH:mm:ssZ",
        "time:stop": "YYYY-MM-DDTHH:mm:ssZ",
    }

    url = endpoint.get("template", "")
    query_params = parse_url(url)

    # deal with the namespaced parameters as [query param key, prefix, type]
    query_params = [[k] + list(_extract_prefix(v))
                    for k, v in query_params.iteritems()]

    return [
        tidy_dict(
            {"name": qp[0],
             "prefix": qp[1],
             "type": qp[2],
             "format": _parameter_formats.get(":".join(qp[1:]))}
        )
        for qp in query_params
    ]
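# Illustrative sketch of the prefix handling above (standalone, not part of the
# parser): the same pattern splits an OpenSearch template value such as
# "{geo:box?}" into its namespace prefix and bare name. Note the optional "?"
# marker appears to stay attached to the name.
#
#   import re
#   re.search(r"\{{0,1}(\S*):([\S][^}]*)", "{geo:box?}").groups()
#   # -> ('geo', 'box?')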
def _parse_contact(self, elem):
    '''
    parse any CI_Contact
    '''
    contact = {}

    if elem is None:
        return contact

    contact['phone'] = extract_item(
        elem, ['phone', 'CI_Telephone', 'voice', 'CharacterString'])
    contact['addresses'] = extract_items(
        elem, ['address', 'CI_Address', 'deliveryPoint', 'CharacterString'])
    contact['city'] = extract_item(
        elem, ['address', 'CI_Address', 'city', 'CharacterString'])
    contact['state'] = extract_item(
        elem, ['address', 'CI_Address', 'administrativeArea',
               'CharacterString'])
    contact['postal'] = extract_item(
        elem, ['address', 'CI_Address', 'postalCode', 'CharacterString'])
    contact['country'] = extract_item(
        elem, ['address', 'CI_Address', 'country', 'CharacterString'])
    contact['email'] = extract_item(
        elem, ['address', 'CI_Address', 'electronicMailAddress',
               'CharacterString'])

    return tidy_dict(contact)
def _parse_service(self):
    output = {}

    service = {
        "object_id": generate_uuid_urn(),
        "dcterms:title": ' '.join(extract_items(
            self.parser.xml, ["Identify", "repositoryName"])),
        "rdf:type": "OAI-PMH",
        "relationships": [],
        "urls": []
    }

    url_id = generate_uuid_urn()
    dist = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": url_id,
        "dc:identifier": generate_sha_urn(self.url)
    })
    service['urls'] = [dist]
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": url_id
    })

    # output['version'] = extract_items(
    #     self.parser.xml, ["Identify", "protocolVersion"])
    # output['endpoints'] = [{'url': e} for e
    #     in extract_items(self.parser.xml, ["Identify", "baseURL"])]

    output['services'] = [service]

    return tidy_dict(output)
def parse_item(self):
    '''
    parse just the dc element (like oai_dc:dc) so if you're pulling
    this from an oai-pmh service, etc, make sure that it's *not* the
    full document
    '''
    # TODO: this is not correct for the overall thing
    if self.elem is None:
        return {}

    title = extract_item(self.elem, ['title'])
    creator = extract_item(self.elem, ['creator'])
    subjects = extract_items(self.elem, ['subject'])
    description = extract_item(self.elem, ['description'])
    date = extract_item(self.elem, ['date'])
    language = extract_item(self.elem, ['language'])
    publisher = extract_item(self.elem, ['publisher'])
    sources = extract_items(self.elem, ['source'])
    types = extract_items(self.elem, ['type'])

    return tidy_dict({
        'title': title,
        'creator': creator,
        'subjects': subjects,
        'abstract': description,
        'language': language,
        'date': date,
        'publisher': publisher,
        'types': types,
        'sources': sources
    })
def _parse_item(self, elem):
    entry = {}
    entry['title'] = extract_item(elem, ['title'])
    entry['id'] = extract_item(elem, ['id'])
    entry['creator'] = extract_item(elem, ['creator'])
    entry['author'] = extract_item(elem, ['author', 'name'])
    entry['date'] = extract_item(elem, ['date'])
    entry['updated'] = extract_item(elem, ['updated'])
    entry['published'] = extract_item(elem, ['published'])
    entry['subjects'] = [e.attrib.get('term', '')
                         for e in extract_elems(elem, ['category'])]

    entry['contents'] = []
    contents = extract_elems(elem, ['content'])
    for content in contents:
        text = content.text.strip() if content.text else ''
        content_type = content.attrib.get('type', '')
        entry['contents'].append({'content': text, 'type': content_type})

    entry['links'] = []
    links = extract_elems(elem, ['link'])
    for link in links:
        href = link.attrib.get('href', '')
        rel = link.attrib.get('rel', '')
        entry['links'].append({'href': href, 'rel': rel})

    return tidy_dict(entry)
def parse(self):
    if self.elem is None:
        self.description = self.output
        return

    self.description = parse_identification_info(self.elem)
    self.description['operations'] = self._handle_operations()

    self.description = tidy_dict(self.description)
def _parse_endpoint(self, elem):
    endpoint = {}
    endpoint['mimetype'] = elem.attrib.get('type', '')
    endpoint['template'] = elem.attrib.get('template', '')
    endpoint['parameters'] = self._extract_params(elem)
    endpoint['actionable'] = 'NOPE'

    # endpoint['url'] = self._generate_url(
    #     endpoint['mimetype'], endpoint['template'])
    osl = OpenSearchLink(elem)
    endpoint['url'] = osl.url

    return tidy_dict(endpoint)
def parse(self):
    output = {}
    urls = set()

    if 'service' in self.identify:
        service = {
            "object_id": generate_uuid_urn(),
            "dcterms:title": extract_attrib(self.parser.xml, ['@name']),
            "rdf:type": "UNIDATA:THREDDS {0}".format(
                extract_attrib(self.parser.xml, ['@version'])),
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', ''),
            "relationships": [],
            "urls": []
        }
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": url_sha
        })
        service['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_sha
        })

        # deal with the "dataset"
        service_bases = self.parser.xml.xpath(
            '//*[local-name()="service" and @base != ""]'
        )
        self.service_bases = {
            s.attrib.get('name'): s.attrib.get('base')
            for s in service_bases
        }

        # if 'dataset' in self.identify:
        #     # TODO: this is really not right but it is not
        #     #       a proper web service so meh
        #     datasets = self._parse_datasets()
        #
        # if 'metadata' in self.identify:
        #     self.description['metadata'] = self._parse_metadata()

        output['services'] = [service]

    self.description = tidy_dict(output)
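# The xpath above collects every THREDDS <service> element that declares a base
# path, e.g. <service name="odap" serviceType="OPeNDAP" base="/thredds/dodsC/"/>,
# so self.service_bases ends up as a {name: base} lookup (here,
# {'odap': '/thredds/dodsC/'}), presumably for building dataset access URLs later.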
def parse(self):
    self.description = {}

    self._parse_results_set_info()
    self.description['total'] = self.total
    self.description['subtotal'] = self.subtotal
    self.description['schema'] = self.schema

    if self.parent_url:
        # TODO: consider making this a sha
        self.description['childOf'] = self.parent_url

    if 'resultset' in self.identify:
        self.description['children'] = self._parse_children(self.schema)

    self.description = tidy_dict(self.description)
def _generate_harvest_manifest(self, **kwargs):
    harvest = {
        "vcard:hasURL": self.url,
        "bcube:atTime": self.harvest_details.get('harvest_date'),
        "bcube:HTTPStatusCodeValue": 200,
        "http:reasonPhrase": "OK",
        "bcube:HTTPStatusFamilyCode": 200,
        "bcube:HTTPStatusFamilyType": "Success message",
        "bcube:hasUrlSource": "",
        "bcube:hasConfidence": "",
        "bcube:validatedOn": self.harvest_details.get('harvest_date'),
        "dc:identifier": generate_sha_urn(self.url)
    }
    harvest.update(kwargs)
    return tidy_dict(harvest)
def parse(self): self.description = {} if "parent_url" in self.harvest_details: # TODO: consider making this a sha self.description["childOf"] = self.harvest_details["parent_url"] if "service" in self.identify: self.description = self._parse_service() if "resultset" in self.identify: # TODO: get the root stats self.description["children"] = self._parse_children(self.identify["resultset"].get("dialect", "")) self.description = tidy_dict(self.description)
def _generate_harvest_manifest(self, **kwargs):
    # NOTE: for iso, you have to include the dc:identifier sha256
    #       in the kwargs
    harvest = {
        "vcard:hasURL": "",
        "bcube:atTime": self.harvest_details.get('harvest_date'),
        "bcube:HTTPStatusCodeValue": 200,
        "http:reasonPhrase": "OK",
        "bcube:HTTPStatusFamilyCode": 200,
        "bcube:HTTPStatusFamilyType": "Success message",
        "bcube:hasUrlSource": "",
        "bcube:hasConfidence": "",
        "bcube:validatedOn": self.harvest_details.get('harvest_date')
    }
    harvest.update(kwargs)
    return tidy_dict(harvest)
def parse_service(self):
    '''
    main service parsing method: pull all defined elements,
    pull anything else text/attribute related

    returns:
        dict {service: 'anything ontology-driven'}
    '''
    service = {
        "service": self.return_service_descriptors(),
        "dataset": self.return_dataset_descriptors(),
        "metadata": self.return_metadata_descriptors()
    }
    self.service = tidy_dict(service)
    return self.service
def parse(self): self.description = {} if "parent_url" in self.harvest_details: # TODO: consider making this a sha self.description['childOf'] = self.harvest_details['parent_url'] if 'service' in self.identify: self.description = self._parse_service() if 'resultset' in self.identify: # TODO: get the root stats self.description['children'] = self._parse_children( self.identify['resultset'].get('dialect', '')) self.description = tidy_dict(self.description)
def _parse_item(self, elem):
    item = {}
    item['title'] = extract_item(elem, ['title'])
    item['language'] = extract_item(elem, ['language'])
    item['author'] = extract_item(elem, ['author'])
    # TODO: go sort out what this is:
    #       http://purl.org/rss/1.0/modules/content/
    item['encoded'] = extract_item(elem, ['encoded'])
    item['id'] = extract_item(elem, ['guid'])
    item['creator'] = extract_item(elem, ['creator'])
    item['subjects'] = extract_items(elem, ['category'])
    item['published'] = extract_item(elem, ['pubDate'])
    item['timestamp'] = extract_item(elem, ['date'])
    item['links'] = extract_items(elem, ['link'])
    item['links'] += extract_items(elem, ['docs'])

    return tidy_dict(item)
def _parse_child(self, child):
    item = {}
    item["title"] = extract_item(child, ["title"])
    item["language"] = extract_item(child, ["language"])
    item["author"] = extract_item(child, ["author"])
    # TODO: go sort out what this is:
    #       http://purl.org/rss/1.0/modules/content/
    item["encoded"] = extract_item(child, ["encoded"])
    item["id"] = extract_item(child, ["guid"])
    item["creator"] = extract_item(child, ["creator"])
    item["subjects"] = extract_items(child, ["category"])
    item["published"] = extract_item(child, ["pubDate"])
    item["timestamp"] = extract_item(child, ["date"])
    item["links"] = extract_items(child, ["link"])
    item["links"] += extract_items(child, ["docs"])

    return tidy_dict(item)
def _parse_responsibleparty(self, elem):
    '''
    parse any CI_ResponsibleParty
    '''
    individual_name = extract_item(
        elem, ['individualName', 'CharacterString'])
    organization_name = extract_item(
        elem, ['organisationName', 'CharacterString'])
    position_name = extract_item(
        elem, ['positionName', 'CharacterString'])

    e = extract_elem(elem, ['contactInfo', 'CI_Contact'])
    contact = self._parse_contact(e)

    return tidy_dict({
        "individual": individual_name,
        "organization": organization_name,
        "position": position_name,
        "contact": contact
    })
def parse(self):
    # get the series
    self.description = {}

    md = extract_elem(self.elem, ['seriesMetadata', 'MD_Metadata'])
    if md is None:
        return

    md_parser = MxParser(md)
    md_parser.parse()
    self.description = md_parser.description
    self.description['children'] = []

    # get the children
    children = extract_elems(
        self.elem, ['composedOf', 'DS_DataSet', 'has', 'MD_Metadata'])
    for child in children:
        child_parser = MxParser(child)
        child_parser.parse()
        if child_parser.description:
            self.description['children'].append(child_parser.description)

    self.description = tidy_dict(self.description)
def _parse_identification_info(self, elem):
    # ignoring the larger get all the identifiers above
    # in favor of, hopefully, getting a better dataset id
    dataset_identifier = extract_item(elem, [
        'citation', 'CI_Citation', 'identifier', 'MD_Identifier',
        'code', 'CharacterString'
    ])

    dataset = {
        "object_id": generate_uuid_urn(),
        "dc:identifier": dataset_identifier,
        "dc:description": extract_item(
            elem, ['abstract', 'CharacterString']),
        "dcterms:title": extract_item(elem, [
            'citation', 'CI_Citation', 'title', 'CharacterString']),
        "relationships": []
    }

    # TODO: i think the rights blob is not in the ontology prototypes
    # the rights information from MD_Constraints or MD_LegalConstraints
    # rights = extract_item(elem, ['resourceConstraints', '*',
    #     'useLimitation', 'CharacterString'])

    # deal with the extent
    extents = self._parse_extent(elem)
    dataset.update(extents)

    keywords = self._parse_keywords(elem)
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    return tidy_dict(dataset), keywords
def identify(self):
    '''
    it is within a protocol if *any* set of filters
    '''
    def _test_option(filters):
        '''where filters is the set of filters as booleans'''
        for i, j in filters.iteritems():
            if self._evaluate({i: self._filter(i, j, [])}, 0):
                return True
        return False

    def _extract_option(filters):
        '''
        where filters is the set of things to return a value

        this assumes that you have concatenated the defaults
        and/or checks set
        '''
        items = []
        for check in filters:
            for c in check[1]:
                item = ''
                if c['type'] == 'simple':
                    # TODO: this is still not a safe assumption re: casing
                    filter_value = c['value'].upper()
                    filter_object = self.source_content \
                        if c['object'] == 'content' else self.source_url
                    filter_object = filter_object.upper()
                    if filter_value in filter_object:
                        item = [c.get('text', '')]
                # just for the xpath handling later
                elif c['type'] == 'xpath':
                    if self.parser.xml is None:
                        print 'Parser FAIL'
                        continue
                    try:
                        values = self.parser.xml.xpath(c['value'])
                        values = values if isinstance(values, list) \
                            else [values]
                        item = [' '.join(v.strip().split())
                                for v in values if v is not None]
                    except Exception as ex:
                        print 'XPATH FAIL: ', ex
                        continue

                if item:
                    items += item
        return items

    def _chain(source_dict, keys):
        try:
            return list(chain.from_iterable(
                [source_dict.get(key, {}).items() for key in keys]
            ))
        except:
            print source_dict
            return []

    matches = []

    for protocol in self.yaml:
        protocol_name = protocol['name']
        # print protocol_name
        for k, v in protocol.iteritems():
            if k in ['name'] or v is None:
                continue

            for option in v:
                if 'filters' not in option or option['filters'] is None:
                    continue
                is_match = _test_option(option['filters'])

                # check the error filters
                errors = option.get('errors', {})
                is_error = _test_option(errors.get('filters', {})) \
                    if errors else False

                # check the language filters
                language_filters = option.get('language', {})
                _filters = _chain(language_filters, ["defaults", "checks"])
                languages = _extract_option(_filters)

                # check the version filters
                version_filters = option.get('versions', {})
                _filters = _chain(version_filters, ["defaults", "checks"])
                versions = _extract_option(_filters)

                # and the dialect if there's a key
                dialect_filters = option.get('dialect', {})
                if dialect_filters:
                    if 'text' in dialect_filters:
                        dialect = dialect_filters.get('text')
                    else:
                        # it's in the response somewhere
                        _filters = _chain(
                            dialect_filters, ["defaults", "checks"])
                        dialect = _extract_option(_filters)
                else:
                    dialect = []

                # dump it out
                if is_match:
                    matches.append({
                        "protocol": protocol_name,
                        k: tidy_dict({
                            "name": option.get('name', ''),
                            "request": option.get('request', ''),
                            "dialect": dialect,
                            "version": versions,
                            "error": is_error,
                            "language": languages
                        })
                    })

    return matches
def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()

    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)

    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()

    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [
        {
            "relate": "bcube:hasMetadataRecord",
            "object_id": catalog_object_id
        }
    ]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [
                    {
                        "relate": "dcterms:references",
                        "object_id": url_id
                    }
                ]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    if key_elem is not None:
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append(
            {
                "relate": "dc:conformsTo",
                "object_id": keyword['object_id']
            }
        )

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append(
        {
            "relate": "foaf:primaryTopic",
            "object_id": dataset_object_id
        }
    )

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']

    self.description = tidy_dict(output)
def _parse_service(self):
    output = {}
    urls = set()

    service = {
        "object_id": generate_uuid_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "rdf:type": 'OpenSearch1.1:Description',
        "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
        "dc:description": ' '.join(
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])),
        "urls": [],
        "webpages": [],
        "relationships": []
    }

    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    service['urls'].append(original_url)
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    # output['source'] = extract_items(
    #     self.parser.xml, ["Attribution"])
    # output['contact'] = extract_items(
    #     self.parser.xml, ["Developer"])
    # output['rights'] = extract_items(
    #     self.parser.xml, ["SyndicationRight"])

    key_id = generate_uuid_urn()
    output['keywords'] = [{
        "object_id": key_id,
        "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
    }]
    service['relationships'].append({
        "relate": "dc:conformsTo",
        "object_id": key_id
    })

    for t in extract_elems(self.parser.xml, ['Url']):
        ep = self._parse_endpoint(t)
        url_sha = generate_sha_urn(ep['url'])
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": ep['url'],
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            service['urls'].append(dist)

            wb_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": wb_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": wb_id
            })

    output['services'] = [service]

    return tidy_dict(output)
def parse(self):
    output = {}
    output['items'] = [
        child for child in self.parse_children(tags=['//*', 'item'])
    ]
    return tidy_dict(output)
def parse(self): output = {} output["items"] = [child for child in self.parse_children(tags=["//*", "item"])] return tidy_dict(output)
def parse(self):
    # for ogc, a catalog record is the getcapabilities rsp
    output = {
        "layers": [],
        "catalog_records": []
    }
    urls = set()

    if not self.reader:
        self.description = {}
        return

    if 'service' in self.identify:
        service_id = generate_uuid_urn()
        service = {
            "object_id": service_id,
            "bcube:dateCreated": self.harvest_details.get(
                'harvest_date', ''),
            "bcube:lastUpdated": self.harvest_details.get(
                'harvest_date', ''),
            "relationships": [],
            "urls": [],
            "rdf:type": self.urn
        }
        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        url_id = generate_uuid_urn()
        service['urls'].append(
            self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
        )
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_id
        })

        # self._get_service_config(service_name, version)
        service_reader = self._parse_service(
            self.reader, self.service_name, self.version)

        # map to triples
        service.update({
            "dc:description": service_reader.get('abstract')
        })

        keywords = service_reader.get('subject', [])
        if keywords:
            output['keywords'] = [{
                "object_id": generate_uuid_urn(),
                "bcube:hasValue": keywords
            }]
            for k in output['keywords']:
                service['relationships'].append({
                    "relate": "dc:conformsTo",
                    "object_id": k['object_id']
                })

        if self.identify['service'].get('request', '') == 'GetCapabilities':
            # this is also awkward. meh. needs must.
            layers = []
            listed_layers = self._parse_getcap_datasets(self.reader)
            for ld in listed_layers:
                layer = {
                    "object_id": generate_uuid_urn(),
                    "bcube:dateCreated":
                        self.harvest_details.get('harvest_date', ''),
                    "bcube:lastUpdated":
                        self.harvest_details.get('harvest_date', ''),
                    "dc:description": ld.get('abstract', ''),
                    "dc:title": ld.get('title', ''),
                    "relationships": []
                }
                service['relationships'].append({
                    "relate": "bcube:contains",
                    "object_id": layer['object_id']
                })

                # add the generated url for the service
                generated_url = self._generate_url(
                    self.url,
                    ld.get('name'),
                    ld.get('bbox'),
                    self.service_name,
                    self.version
                )
                if generated_url:
                    url_sha = generate_sha_urn(generated_url)
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        layer_url = self._generate_harvest_manifest(**{
                            "vcard:hasURL": generated_url,
                            "bcube:hasUrlSource": "Generated",
                            "bcube:hasConfidence": "Not Sure",
                            "object_id": url_id,
                            "dc:identifier": url_sha
                        })
                        service['urls'].append(layer_url)
                        # don't add to the larger set, but do
                        # include the reference within the layer
                        layer['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

                # add each as a dataset with just a url for now
                for mu in ld.get('metadata_urls', []):
                    url_link = generate_uuid_urn()
                    url_sha = generate_sha_urn(mu.get('url'))
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        output['catalog_records'] += [
                            {
                                "object_id": url_link,
                                "urls": [
                                    self._generate_harvest_manifest(**{
                                        "vcard:hasURL": mu.get('url'),
                                        "bcube:hasUrlSource": "Harvested",
                                        "bcube:hasConfidence": "Good",
                                        "object_id": url_id,
                                        "dc:identifier": url_sha
                                    })
                                ],
                                "relationships": [
                                    {
                                        "relate": "dc:describes",
                                        "object_id": layer['object_id']
                                    },
                                    {
                                        "relate": "bcube:originatedFrom",
                                        "object_id": url_id
                                    }
                                ]
                            }
                        ]

                if 'temporal_extent' in ld:
                    temporal = tidy_dict({
                        "esip:startDate":
                            ld['temporal_extent'].get('begin', ''),
                        "esip:endDate":
                            ld['temporal_extent'].get('end', '')
                    })
                    if temporal:
                        layer.update(temporal)

                if 'bbox' in ld:
                    layer.update(ld['bbox'])

                layers.append(layer)

            service['layers'] = layers
            # if layers:
            #     service['layers'] = layers
            #     for layer in layers:
            #         service['relationships'].append({
            #             "relate": "bcube:contains",
            #             "object_id": layer['object_id']
            #         })

        output['services'] = [service]

    self.description = tidy_dict(output)
def parse(self):
    '''
    from the root node, parse:
        identification (title, abstract, point of contact,
            keywords, extent)
            if identificationInfo contains SV_ServiceIdentification,
            add as child
        distribution info
    '''
    # set up the url set
    urls = set()
    urls.add(self.output['catalog_record']['urls'][0]['object_id'])

    for id_elem in extract_elems(
            self.elem,
            ['//*', 'identificationInfo', 'MD_DataIdentification']):
        dataset, keywords = self._parse_identification_info(id_elem)
        dataset['relationships'].append({
            "relate": "bcube:hasMetadataRecord",
            "object_id": self.output['catalog_record']['object_id']
        })
        dataset.update({
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', '')
        })
        self.output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset['object_id']
        })

        # point of contact from the root node and this might be an issue
        # in things like the -1/-3 from ngdc so try for an idinfo blob
        poc_elem = extract_elem(id_elem, [
            'identificationInfo', 'MD_DataIdentification',
            'pointOfContact', 'CI_ResponsibleParty'])
        # if poc_elem is None:
        #     # and if that fails try for the root-level contact
        #     poc_elem = extract_elem(
        #         self.elem,
        #         ['contact', 'CI_ResponsibleParty'])

        # TODO: point of contact is not necessarily the publisher
        if poc_elem is not None:
            poc = self._parse_responsibleparty(poc_elem)
            location = (
                ' '.join([poc['contact'].get('city', ''),
                          poc['contact'].get('country', '')])
            ).strip() if poc.get('contact', {}) else ''
            publisher = tidy_dict({
                "object_id": generate_uuid_urn(),
                "name": poc.get('organization', ''),
                "location": location
            })
            self.output['publishers'].append(publisher)
            dataset['relationships'].append({
                "relate": "dcterms:publisher",
                "object_id": publisher['object_id']
            })

        dataset['urls'] = []
        dist_elems = extract_elems(self.elem, ['distributionInfo'])
        for dist_elem in dist_elems:
            for d in self._parse_distribution(dist_elem):
                if not d:
                    continue
                url_sha = generate_sha_urn(d)
                if url_sha not in urls:
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(**{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": d,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

        self.output['datasets'].append(dataset)
        self.output['keywords'] += keywords

    # TODO: removing this until we have a definition for SERVICE
    # # check for the service elements
    # service_elems = extract_elems(
    #     self.elem, ['identificationInfo', 'SV_ServiceIdentification'])
    # self.description['services'] = []
    # for service_elem in service_elems:
    #     sv = SrvParser(service_elem)
    #     self.description['services'].append(sv.parse())

    # switch the catalog record to a list for conformity. eep.
    self.output['catalog_records'] = [self.output['catalog_record']]
    del self.output['catalog_record']

    self.description = tidy_dict(self.output)
def _get_operations(self, reader, service, version):
    '''
    each operation can have more than one endpoint (get and post, ex)

    formatOptions = output format of the response method that can be
        some controlled vocab value (XMLSCHEMA, etc)

        this is incorrectly handled in wfs 1.1.0 (hardcoded value)
        later versions can have an outputFormat parameter instead
        BUT it comes down to whether that is a proper query parameter
        or simply the delivered response (no choice in the spec)

    parameters = {name: {values: list}}
    '''
    _vocabs = {
        "XMLSCHEMA": "application/xml",
        "GML2": "text/xml; subtype=gml/2.1.2"
    }

    def _check_controlled_vocabs(term):
        if term in _vocabs:
            return _vocabs[term]
        return term

    def _replace_nones(to_check):
        return '' if to_check is None else to_check

    def _append_params(base_url, operation):
        if not base_url[-1] == '?':
            base_url += '?'
        return base_url + 'SERVICE=%s&VERSION=%s&REQUEST=%s' % (
            service, version, operation)

    def _merge_params(op_name, found_params):
        '''
        for some parameter structure:
            {'resultType': {'values': ['results', 'hits']}}
        integrate into the config params with the common elements
        '''
        # TODO: how to handle aliases (if necessary)
        req_methods = self.config.get('methods', [])
        req_params = next(
            iter([d for d in req_methods if d['name'] == op_name.upper()]),
            {}).get('params', [])
        req_params = [] if not req_params else req_params
        defaults = self.config.get('common', []) + req_params
        if not found_params:
            return defaults

        for k, v in found_params.iteritems():
            param = next(
                iter(d for d in defaults if d['name'] == k.lower()), [])
            if not param:
                continue
            found_index = defaults.index(param)
            param['values'] = [
                _check_controlled_vocabs(a) for a in v['values']
            ]
            defaults[found_index] = param

        return defaults

    def _return_parameter(param):
        # return a parameter dict without empty values
        parameter = {}
        for key in ['name', 'type', 'format', 'values']:
            if key in param and param[key]:
                parameter[key] = param[key]
        return parameter

    operations = []
    for o in reader.operations:
        # TODO: handle the differing formatOptions

        # get the parameter values if supported by the service
        try:
            params = o.parameters
        except AttributeError:
            params = {}

        # merge with defaults (where it can be add the whole element
        # OR parts of the element)
        params = _merge_params(o.name, params)

        # get the formatOptions
        try:
            formats = [
                _check_controlled_vocabs(fo) for fo in o.formatOptions
            ]
        except AttributeError:
            formats = []

        endpoints = [
            tidy_dict({
                "name": o.name,
                "protocol": remap_http_method(m.get('type', '')),
                "url": _append_params(m.get('url', ''), o.name),
                "constraints": m.get('constraints', []),
                "mimeType": formats,
                "actionable": 1 if o.name == 'GetCapabilities' else 2,
                "parameters": [_return_parameter(p) for p in params]
            })
            for m in o.methods
        ]
        operations += endpoints

    return operations
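# Illustrative only (hypothetical endpoint URL): assuming _get_operations was
# called with service='WFS' and version='1.1.0', _append_params turns a bare
# base URL into an actionable request:
#   _append_params('http://example.com/wfs', 'GetCapabilities')
#   # -> 'http://example.com/wfs?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetCapabilities'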