def _parse_distribution(self, elem):
    ''' from the distributionInfo element '''
    for dist_elem in extract_elems(elem, ['MD_Distribution']):
        # the transferOptions block can live inside a nested distributor
        # element or at the root of MD_Distribution (not the root of the
        # file), so search anywhere below this element
        transfer_elems = extract_elems(
            dist_elem, ['//*', 'MD_DigitalTransferOptions'])
        for transfer_elem in transfer_elems:
            # NOTE: the richer description (url, name, description, size,
            #       plus the format pulled from a parent distributorFormat/
            #       MD_Format node) is intentionally disabled; only the
            #       link is yielded for now
            link = extract_item(
                transfer_elem,
                ['onLine', 'CI_OnlineResource', 'linkage', 'URL'])
            yield link
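# The parsers in this section lean on a small set of namespace-agnostic lxml
# helpers (extract_elems, extract_elem, extract_items, extract_item,
# generate_localname_xpath) defined elsewhere in the package. The sketch below
# is an assumption about how they behave, not the shipped code: local-name()
# matching so the ISO/THREDDS namespaces can be ignored.
def generate_localname_xpath(tags):
    # '..' passes through, '//*' becomes a relative descendant search,
    # everything else matches by local name
    parts = []
    for t in tags:
        if t == '//*':
            parts.append('descendant::*')
        elif t in ('*', '..'):
            parts.append(t)
        else:
            parts.append('*[local-name()="%s"]' % t)
    return '/'.join(parts)


def extract_elems(elem, tags):
    return elem.xpath(generate_localname_xpath(tags))


def extract_elem(elem, tags):
    return next(iter(extract_elems(elem, tags)), None)


def extract_items(elem, tags):
    return [e.text.strip() for e in extract_elems(elem, tags) if e.text]


def extract_item(elem, tags):
    items = extract_items(elem, tags)
    return items[0] if items else ''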
def _parse_item(self, elem):
    entry = {}
    entry['title'] = extract_item(elem, ['title'])
    entry['id'] = extract_item(elem, ['id'])
    entry['creator'] = extract_item(elem, ['creator'])
    entry['author'] = extract_item(elem, ['author', 'name'])
    entry['date'] = extract_item(elem, ['date'])
    entry['updated'] = extract_item(elem, ['updated'])
    entry['published'] = extract_item(elem, ['published'])
    entry['subjects'] = [
        e.attrib.get('term', '')
        for e in extract_elems(elem, ['category'])
    ]

    entry['contents'] = []
    for content in extract_elems(elem, ['content']):
        text = content.text.strip() if content.text else ''
        content_type = content.attrib.get('type', '')
        entry['contents'].append({'content': text, 'type': content_type})

    entry['links'] = []
    for link in extract_elems(elem, ['link']):
        href = link.attrib.get('href', '')
        rel = link.attrib.get('rel', '')
        entry['links'].append({'href': href, 'rel': rel})

    return tidy_dict(entry)
def parse(self):
    elem = self.parser.xml

    ncml = {'variables': []}
    ncml['identifier'] = elem.attrib.get('location', '')

    for variable in extract_elems(elem, ['variable']):
        v = {}
        v['name'] = variable.attrib.get('name', '')
        v['attributes'] = []
        for att in extract_elems(variable, ['attribute']):
            a = {}
            for key, value in att.attrib.items():
                tag = extract_element_tag(key)
                if tag == 'values':
                    continue
                a[tag] = value.strip()
            if a:
                v['attributes'] += [a]
        v = tidy_dict(v)
        if v:
            ncml['variables'].append(v)

    return tidy_dict(ncml)
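# Two more helpers assumed throughout this section. tidy_dict's behavior is
# only implied by its call sites (empty values disappear from the output) and
# extract_element_tag strips the lxml namespace prefix from a tag or attribute
# name; both sketches are assumptions, not the packaged implementations.
def tidy_dict(d):
    # prune keys whose values are empty ('', [], {}, None)
    return {k: v for k, v in d.items() if v}


def extract_element_tag(tag):
    # '{http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0}dataset'
    # becomes 'dataset'
    return tag.split('}')[-1] if '}' in tag else tag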
def _parse_child(self, child):
    entry = {}
    entry["title"] = extract_item(child, ["title"])
    entry["id"] = extract_item(child, ["id"])
    entry["creator"] = extract_item(child, ["creator"])
    entry["author"] = extract_item(child, ["author", "name"])
    entry["date"] = extract_item(child, ["date"])
    entry["updated"] = extract_item(child, ["updated"])
    entry["published"] = extract_item(child, ["published"])
    entry["subjects"] = [
        e.attrib.get("term", "")
        for e in extract_elems(child, ["category"])
    ]

    entry["contents"] = []
    for content in extract_elems(child, ["content"]):
        text = content.text.strip() if content.text else ""
        content_type = content.attrib.get("type", "")
        entry["contents"].append({"content": text, "type": content_type})

    entry["links"] = []
    for link in extract_elems(child, ["link"]):
        href = link.attrib.get("href", "")
        rel = link.attrib.get("rel", "")
        entry["links"].append({"href": href, "rel": rel})

    return tidy_dict(entry)
def parse_endpoints(self):
    '''
    parse just the service endpoints: the service and catalogRef
    elements at the root level of the catalog. service elements can
    themselves be nested, and catalogRefs carry relative paths (with
    all of the usual tagging issues), so both are flattened here.
    '''
    endpoints = []

    services = extract_elems(self.parser.xml, ['service'])
    for service in services:
        # services can be nested, so collect any child endpoints too
        description, child_endpoints = self._handle_elem(
            service, ['service'], self.url, {})
        endpoints += [description]
        if child_endpoints:
            endpoints += child_endpoints

    catrefs = extract_elems(self.parser.xml, ['catalogRef'])
    for catref in catrefs:
        # TODO: so dap or file base path only? (not the full set,
        #       that makes no sense)
        description, child_endpoints = self._handle_elem(
            catref, ['catalogRef', 'metadata'], self.url, {})
        endpoints += [description] + child_endpoints

    return endpoints
def _handle_elem(self, elem, child_tags, base_url, service_bases):
    description = self._get_items(
        extract_element_tag(elem.tag), elem, base_url, service_bases)
    description['source'] = extract_element_tag(elem.tag)

    endpoints = []
    for child_tag in child_tags:
        for e in extract_elems(elem, [child_tag]):
            e_desc = self._get_items(
                extract_element_tag(e.tag), e, base_url, service_bases)
            e_desc['childOf'] = description.get('ID', '')
            e_desc['source'] = extract_element_tag(child_tag)

            parents = description.get('parentOf', [])
            parents += [ep['ID'] for ep in endpoints if 'childOf' in ep]
            description['parentOf'] = parents

            endpoints.append(e_desc)

    return description, endpoints
def _handle_operations(self):
    elems = extract_elems(
        self.elem, ['containsOperations', 'SV_OperationMetadata'])

    ops = []
    for e in elems:
        op = {}
        op['name'] = extract_item(e, ['operationName', 'CharacterString'])
        op['method'] = extract_attrib(e, ['DCP', 'DCPList', '@codeListValue'])
        op['url'] = extract_item(
            e, ['connectPoint', 'CI_OnlineResource', 'linkage', 'URL'])
        op['parameters'] = [
            self._handle_parameter(pe)
            for pe in extract_elems(e, ['parameters', 'SV_Parameter'])
        ]
        ops.append(op)
    return ops
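# extract_attrib (used above for codeListValue and below for keyword types) is
# also defined elsewhere in the package. A hedged sketch, assuming the trailing
# '@name' entry selects an attribute on the element located by the preceding
# tags, and reusing the extract_elem sketch from earlier in this section.
def extract_attrib(elem, tags):
    path, attr = tags[:-1], tags[-1].lstrip('@')
    e = extract_elem(elem, path) if path else elem
    return e.attrib.get(attr, '') if e is not None else ''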
def _parse_keywords(self, elem):
    ''' for each descriptiveKeywords block in an identification block '''
    keywords = []

    for key_elem in extract_elems(elem, ['descriptiveKeywords']):
        # TODO: split these up (if *-delimited in some way)
        terms = extract_items(
            key_elem, ['MD_Keywords', 'keyword', 'CharacterString'])
        key_type = extract_attrib(
            key_elem,
            ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'])
        thesaurus = extract_item(
            key_elem,
            ['MD_Keywords', 'thesaurusName', 'CI_Citation',
             'title', 'CharacterString'])

        if terms:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )

    # TODO: add the Anchor element handling
    #       ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

    # add a generic set for the iso topic category
    isotopics = extract_items(
        elem, ['topicCategory', 'MD_TopicCategoryCode'])
    if isotopics:
        keywords.append(
            tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": isotopics
            })
        )

    return keywords
def parse_item(self, elem):
    identifier = extract_item(self.elem, ['Entry_ID'])
    title = extract_item(self.elem, ['Entry_Title'])
    keywords = extract_items(self.elem, ['Keyword'])
    keywords += extract_items(self.elem, ['ISO_Topic_Category'])
    abstract = extract_item(self.elem, ['Summary'])
    organization = extract_item(self.elem, ['Originating_Center'])

    # temporal extent
    start_date = extract_item(
        self.elem, ['Temporal_Coverage', 'Start_Date'])
    end_date = extract_item(self.elem, ['Temporal_Coverage', 'End_Date'])
    temporal = [start_date, end_date] if start_date and end_date else []

    # spatial extent
    west = extract_item(
        self.elem, ['Spatial_Coverage', 'Westernmost_Longitude'])
    east = extract_item(
        self.elem, ['Spatial_Coverage', 'Easternmost_Longitude'])
    south = extract_item(
        self.elem, ['Spatial_Coverage', 'Southernmost_Latitude'])
    north = extract_item(
        self.elem, ['Spatial_Coverage', 'Northernmost_Latitude'])
    bbox = [west, south, east, north] if \
        west and east and north and south else []
    # only convert when all four corners were found
    bbox = to_wkt(bbox_to_geom(bbox)) if bbox else ''

    distributions = []
    for related_url in extract_elems(self.elem, ['Related_URL']):
        url = extract_item(related_url, ['URL'])
        content_type = extract_item(
            related_url, ['URL_Content_Type', 'Type'])
        description = extract_item(related_url, ['Description'])
        dist = tidy_dict({
            "url": url,
            "description": description,
            "content_type": content_type
        })
        if dist:
            distributions.append(dist)

    return tidy_dict({
        "id": identifier,
        "title": title,
        "keywords": keywords,
        "abstract": abstract,
        "organization": organization,
        "bbox": bbox,
        "temporal": temporal,
        "distributions": distributions
    })
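# bbox_to_geom and to_wkt are shared by the DIF and FGDC parsers but defined
# elsewhere. A minimal sketch of the assumed behavior, using shapely and the
# [west, south, east, north] corner order seen above; the packaged helpers may
# validate coordinates or handle dateline-crossing boxes differently.
from shapely.geometry import box


def bbox_to_geom(bbox):
    # bbox arrives as [west, south, east, north] string values
    west, south, east, north = [float(c) for c in bbox]
    return box(west, south, east, north)


def to_wkt(geom):
    return geom.wkt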
def _parse_metadata(self):
    endpoints = []
    # metadatas = self.parser.find(metadata_xpath)
    metadatas = extract_elems(self.parser.xml, ['metadata'])
    for metadata in metadatas:
        description, child_endpoints = self._handle_elem(
            metadata, [], self.url, self.service_bases)
        endpoints += [description] + child_endpoints
    return {"endpoints": endpoints}
def _parse_datasets(self):
    # get the level-one children (catalog -> child)
    endpoints = []
    # datasets = self.parser.find(dataset_xpath)
    datasets = extract_elems(self.parser.xml, ['dataset'])
    for dataset in datasets:
        description, child_endpoints = self._handle_elem(
            dataset,
            ['dataset', 'metadata', 'catalogRef'],
            self.url,
            self.service_bases)
        endpoints += [description] + child_endpoints
    return {"endpoints": endpoints}
def parse_children(self, elem=None, tags=None):
    '''
    elem is the parent node for the set and tags is the un-namespaced
    list of explicit child tags to parse; if tags is not given, parse
    the children one level down
    '''
    elem = self.parser.xml if elem is None else elem

    if tags:
        children = extract_elems(elem, tags)
    else:
        children = [child for child in elem.iterchildren()]

    for child in children:
        parsed = self._parse_child(child)
        if parsed:
            yield parsed
def parse(self):
    ''' key = entry for atom and item for rss '''
    key = 'entry' if self.dialect == 'atom' else 'item'

    elems = extract_elems(self.parser.xml, ['//*', key])
    items = [self.item_class(elem).item for elem in elems]

    # TODO: add the root level parsing, ie the difference btwn atom and rss
    title = extract_item(self.parser.xml, ['title'])
    updated = extract_item(self.parser.xml, ['updated'])
    author_name = extract_item(self.parser.xml, ['author', 'name'])

    return {
        "title": title,
        "updated": updated,
        "author": author_name,
        "items": items
    }
def parse(self):
    # get the series
    self.description = {}

    md = extract_elem(self.elem, ['seriesMetadata', 'MD_Metadata'])
    if md is None:
        return

    md_parser = MxParser(md)
    md_parser.parse()
    self.description = md_parser.description
    self.description['children'] = []

    # get the children
    children = extract_elems(
        self.elem, ['composedOf', 'DS_DataSet', 'has', 'MD_Metadata'])
    for child in children:
        child_parser = MxParser(child)
        child_parser.parse()
        if child_parser.description:
            self.description['children'].append(child_parser.description)

    self.description = tidy_dict(self.description)
def _parse_children(self, dialect):
    elems = extract_elems(self.parser.xml, ['ListRecords', 'record'])
    return [self._parse_child(child, dialect) for child in elems]
def _get_items(self, tag, elem, base_url, service_bases):
    ''' return any structure not part of the current element's attributes '''
    def _normalize_key(key):
        '''
        standardize the url (or other) xml tags to the desired json key,
        as source key: endpoint key
        '''
        remaps = {
            "serviceType": "type",
            "href": "url",
            "base": "url",
            "urlPath": "url"
        }
        return remaps.get(key, key)

    def _run_element(elem, service_bases):
        ''' for a given element, return any text() and any attribute value '''
        # run a generated xpath on the given element, skipping the child
        # elements that are handled separately below
        children = elem.xpath(
            './node()[local-name()!="metadata" '
            'and local-name()!="dataset" '
            'and local-name()!="catalogRef"]')

        element = {_normalize_key(extract_element_tag(k)): v
                   for k, v in elem.attrib.items()}
        element = self._manage_id(element)

        for child in children:
            value = child.text
            tag = _normalize_key(extract_element_tag(child.tag))
            if value:
                element[tag] = value
            for k, v in child.attrib.items():
                if v:
                    element[
                        tag + '_' + _normalize_key(extract_element_tag(k))
                    ] = v

        # get the service bases in case
        if [g for g in element.keys() if g.endswith('serviceName')]:
            sbs = [v for k, v in service_bases.items()
                   if k == element.get('serviceName')]
        else:
            sbs = service_bases.values()

        # send a unique list of base relative paths
        sbs = list(set(sbs))

        url_key = next(
            iter([g for g in element.keys() if g.endswith('url')]), '')
        if url_key:
            # for service urls, if catalog.xml isn't appended the path
            # resolves to the html endpoint (not desired), so the link
            # generation appends catalog.xml when the path matches one
            # of the service base paths
            tl = ThreddsLink(elem, self.url, sbs)
            element['url'] = tl.urls
            element['actionable'] = 2

        return element

    children = extract_elems(elem, ['metadata']) + \
        extract_elems(elem, ['dataset']) + \
        extract_elems(elem, ['catalogRef'])

    element = _run_element(elem, service_bases)

    element_children = [_run_element(c, service_bases) for c in children]
    if element_children:
        element['children'] = element_children

    return element
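# The record parsers below mint identifiers with generate_uuid_urn and
# generate_sha_urn, both defined elsewhere in the package. A hedged sketch of
# the assumed behavior; the exact URN prefixes and hash algorithm are guesses.
import hashlib
import uuid


def generate_uuid_urn():
    return 'urn:uuid:%s' % uuid.uuid4()


def generate_sha_urn(text):
    return 'urn:sha:%s' % hashlib.sha256(text.encode('utf-8')).hexdigest()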
def _parse_service(self):
    output = {}
    urls = set()

    service = {
        "object_id": generate_uuid_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "rdf:type": 'OpenSearch1.1:Description',
        "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
        "dc:description": ' '.join(
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])),
        "urls": [],
        "webpages": [],
        "relationships": []
    }

    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    service['urls'].append(original_url)
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    # output['source'] = extract_items(self.parser.xml, ["Attribution"])
    # output['contact'] = extract_items(self.parser.xml, ["Developer"])
    # output['rights'] = extract_items(self.parser.xml, ["SyndicationRight"])

    key_id = generate_uuid_urn()
    output['keywords'] = [{
        "object_id": key_id,
        "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
    }]
    service['relationships'].append({
        "relate": "dc:conformsTo",
        "object_id": key_id
    })

    for t in extract_elems(self.parser.xml, ['Url']):
        ep = self._parse_endpoint(t)
        url_sha = generate_sha_urn(ep['url'])
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": ep['url'],
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            service['urls'].append(dist)

            wb_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": wb_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": wb_id
            })

    output['services'] = [service]
    return tidy_dict(output)
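# _generate_harvest_manifest appears in every record parser here but lives on
# a shared base class elsewhere. From the call sites it appears to echo its
# keyword arguments back as a url-description blob whose object_id the caller
# wires into relationships; this sketch is an assumption, and the real method
# likely adds provenance details for the harvest event.
def _generate_harvest_manifest(self, **kwargs):
    # the caller always supplies vcard:hasURL, bcube:hasUrlSource,
    # bcube:hasConfidence and object_id; just hand them back as a dict
    return dict(kwargs)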
def parse(self):
    ''' run the routing '''
    if not self.identity:
        # we're going to have to sort it out
        self.identity = {}

    metadata = self.identity.get('metadata', {})
    if not metadata:
        return {}

    metadata_type = metadata.get('name', '')
    if not metadata_type:
        return {}

    # TODO: this is unlikely to be correct, given the ds record,
    #       but we're not going there just yet
    # TODO: deal with conformsTo (multiple schemaLocations, etc)
    catalog_record = {
        "object_id": generate_uuid_urn(),
        "rdf:type": self._version_to_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.parser.xml, ['@schemaLocation']).split(),
        "relationships": [],
        "urls": []
    }
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn()
    })
    catalog_record['urls'].append(original_url)
    catalog_record['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    if metadata_type == 'Data Series':
        # run the data series set
        self.reader = DsParser(self.parser.xml, catalog_record)
    elif metadata_type == '19119':
        # run the service identification blocks
        for srv in extract_elems(
                self.parser.xml,
                ['identificationInfo', 'SV_ServiceIdentification']):
            reader = SrvParser(srv, catalog_record)
            reader.parse()
    elif metadata_type == '19115':
        # it's a mi/md so run that
        self.reader = MxParser(
            self.parser.xml, catalog_record, self.harvest_details)
        self.reader.parse()

    # pass it back up the chain a bit
    self.description = self.reader.output
def parse(self):
    '''
    from the root node, parse:
        identification (title, abstract, point of contact, keywords,
        extent); if identificationInfo contains SV_ServiceIdentification,
        add as child
        distribution info
    '''
    # set up the url set
    urls = set()
    urls.add(self.output['catalog_record']['urls'][0]['object_id'])

    for id_elem in extract_elems(
            self.elem,
            ['//*', 'identificationInfo', 'MD_DataIdentification']):
        dataset, keywords = self._parse_identification_info(id_elem)
        dataset['relationships'].append({
            "relate": "bcube:hasMetadataRecord",
            "object_id": self.output['catalog_record']['object_id']
        })
        dataset.update({
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', '')
        })
        self.output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset['object_id']
        })

        # point of contact from the root node; this might be an issue
        # in things like the -1/-3 from ngdc, so try for an idinfo blob
        poc_elem = extract_elem(id_elem, [
            'identificationInfo',
            'MD_DataIdentification',
            'pointOfContact',
            'CI_ResponsibleParty'])
        # if that fails we could fall back to the root-level
        # contact/CI_ResponsibleParty element
        # TODO: point of contact is not necessarily the publisher
        if poc_elem is not None:
            poc = self._parse_responsibleparty(poc_elem)
            location = (
                ' '.join([
                    poc['contact'].get('city', ''),
                    poc['contact'].get('country', '')])
            ).strip() if poc.get('contact', {}) else ''

            publisher = tidy_dict({
                "object_id": generate_uuid_urn(),
                "name": poc.get('organization', ''),
                "location": location
            })
            self.output['publishers'].append(publisher)
            dataset['relationships'].append({
                "relate": "dcterms:publisher",
                "object_id": publisher['object_id']
            })

        dataset['urls'] = []
        dist_elems = extract_elems(self.elem, ['distributionInfo'])
        for dist_elem in dist_elems:
            for d in self._parse_distribution(dist_elem):
                if not d:
                    continue
                url_sha = generate_sha_urn(d)
                if url_sha not in urls:
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(**{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": d,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

        self.output['datasets'].append(dataset)
        self.output['keywords'] += keywords

    # TODO: removing the SV_ServiceIdentification handling (SrvParser)
    #       until we have a definition for SERVICE

    # switch the catalog record to a list for conformity. eep.
    self.output['catalog_records'] = [self.output['catalog_record']]
    del self.output['catalog_record']
    self.description = tidy_dict(self.output)
def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()

    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()

    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    if key_elem is not None:
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc) may not have
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']
    self.description = tidy_dict(output)