def _parse_service(self):
    output = {}

    service = {
        "object_id": generate_uuid_urn(),
        "dcterms:title": ' '.join(
            extract_items(self.parser.xml, ["Identify", "repositoryName"])),
        "rdf:type": "OAI-PMH",
        "relationships": [],
        "urls": []
    }

    url_id = generate_uuid_urn()
    dist = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": url_id,
        "dc:identifier": generate_sha_urn(self.url)
    })
    service['urls'] = [dist]
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": url_id
    })

    # output['version'] = extract_items(
    #     self.parser.xml, ["Identify", "protocolVersion"])
    # output['endpoints'] = [{'url': e} for e
    #                        in extract_items(
    #                            self.parser.xml, ["Identify", "baseURL"])]

    output['services'] = [service]

    return tidy_dict(output)
def _parse_keywords(self, elem):
    '''
    for each descriptiveKeywords block in an identification block
    '''
    keywords = []

    for key_elem in extract_elems(elem, ['descriptiveKeywords']):
        # TODO: split these up (if *-delimited in some way)
        terms = extract_items(
            key_elem,
            ['MD_Keywords', 'keyword', 'CharacterString'])
        key_type = extract_attrib(
            key_elem,
            ['MD_Keywords', 'type', 'MD_KeywordTypeCode', '@codeListValue'])
        thesaurus = extract_item(
            key_elem,
            ['MD_Keywords', 'thesaurusName', 'CI_Citation',
             'title', 'CharacterString'])

        if terms:
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )

    # TODO: add the Anchor element handling
    # ['descriptiveKeywords', 'MD_Keywords', 'keyword', 'Anchor']

    # add a generic set for the iso topic category
    isotopics = extract_items(
        elem, ['topicCategory', 'MD_TopicCategoryCode'])
    if isotopics:
        keywords.append(
            tidy_dict({
                "object_id": generate_uuid_urn(),
                "dc:partOf": 'IsoTopicCategories',
                "bcube:hasValue": isotopics
            })
        )

    return keywords
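# Illustration only (not part of the parser): a sketch of the keyword blob that
# _parse_keywords appends for a single descriptiveKeywords block, assuming that
# tidy_dict simply drops empty values. All values below are made up.
_example_keyword_blob = {
    "object_id": "urn:uuid:00000000-0000-0000-0000-000000000000",  # generate_uuid_urn()
    "dc:partOf": "Example Thesaurus Title",   # thesaurusName/CI_Citation/title
    "bcube:hasType": "theme",                 # MD_KeywordTypeCode @codeListValue
    "bcube:hasValue": ["keyword one", "keyword two"]
}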
def parse(self):
    output = {}
    urls = set()

    if 'service' in self.identify:
        service = {
            "object_id": generate_uuid_urn(),
            "dcterms:title": extract_attrib(self.parser.xml, ['@name']),
            "rdf:type": "UNIDATA:THREDDS {0}".format(
                extract_attrib(self.parser.xml, ['@version'])),
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', ''),
            "relationships": [],
            "urls": []
        }

        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        original_url = self._generate_harvest_manifest(**{
            "bcube:hasUrlSource": "Harvested",
            "bcube:hasConfidence": "Good",
            "vcard:hasURL": self.url,
            "object_id": url_sha
        })
        service['urls'].append(original_url)
        # NOTE: this is not the sha from the url
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_sha
        })

        # deal with the "dataset"
        service_bases = self.parser.xml.xpath(
            '//*[local-name()="service" and @base != ""]')
        self.service_bases = {
            s.attrib.get('name'): s.attrib.get('base')
            for s in service_bases
        }

        # if 'dataset' in self.identify:
        #     # TODO: this is really not right but it is not
        #     #       a proper web service so meh
        #     datasets = self._parse_datasets()

        # if 'metadata' in self.identify:
        #     self.description['metadata'] = self._parse_metadata()

        output['services'] = [service]

    self.description = tidy_dict(output)
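# Illustration only: a minimal, standalone demonstration of the xpath used above
# to collect THREDDS service bases, run against a made-up catalog fragment.
# The namespace and service names are placeholders, not taken from this codebase.
from lxml import etree

_sample_catalog = etree.fromstring(
    '<catalog xmlns="http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0">'
    '<service name="odap" serviceType="OPENDAP" base="/thredds/dodsC/"/>'
    '<service name="all" serviceType="Compound" base=""/>'
    '</catalog>'
)
_bases = {
    s.attrib.get('name'): s.attrib.get('base')
    for s in _sample_catalog.xpath('//*[local-name()="service" and @base != ""]')
}
# the compound service with an empty @base is skipped:
# _bases == {'odap': '/thredds/dodsC/'}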
def _parse_identification_info(self, elem):
    # ignoring the larger get all the identifiers above
    # in favor of, hopefully, getting a better dataset id
    dataset_identifier = extract_item(
        elem,
        ['citation', 'CI_Citation', 'identifier',
         'MD_Identifier', 'code', 'CharacterString'])

    dataset = {
        "object_id": generate_uuid_urn(),
        "dc:identifier": dataset_identifier,
        "dc:description": extract_item(
            elem, ['abstract', 'CharacterString']),
        "dcterms:title": extract_item(
            elem, ['citation', 'CI_Citation', 'title', 'CharacterString']),
        "relationships": []
    }

    # TODO: i think the rights blob is not in the ontology prototypes
    # the rights information from MD_Constraints or MD_LegalConstraints
    # rights = extract_item(elem, ['resourceConstraints', '*',
    #                              'useLimitation', 'CharacterString'])

    # deal with the extent
    extents = self._parse_extent(elem)
    dataset.update(extents)

    keywords = self._parse_keywords(elem)
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    return tidy_dict(dataset), keywords
def parse(self):
    '''
    from the root node, parse:
        identification (title, abstract, point of contact, keywords, extent)
        if identificationInfo contains SV_ServiceIdentification, add as child
        distribution info
    '''
    # set up the url set
    urls = set()
    urls.add(self.output['catalog_record']['urls'][0]['object_id'])

    for id_elem in extract_elems(
            self.elem,
            ['//*', 'identificationInfo', 'MD_DataIdentification']):
        dataset, keywords = self._parse_identification_info(id_elem)
        dataset['relationships'].append({
            "relate": "bcube:hasMetadataRecord",
            "object_id": self.output['catalog_record']['object_id']
        })
        dataset.update({
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', '')
        })
        self.output['catalog_record']['relationships'].append({
            "relate": "foaf:primaryTopic",
            "object_id": dataset['object_id']
        })

        # point of contact from the root node and this might be an issue
        # in things like the -1/-3 from ngdc so try for an idinfo blob
        poc_elem = extract_elem(id_elem, [
            'identificationInfo',
            'MD_DataIdentification',
            'pointOfContact',
            'CI_ResponsibleParty'])
        # if poc_elem is None:
        #     # and if that fails try for the root-level contact
        #     poc_elem = extract_elem(
        #         self.elem,
        #         ['contact', 'CI_ResponsibleParty'])

        # TODO: point of contact is not necessarily the publisher
        if poc_elem is not None:
            poc = self._parse_responsibleparty(poc_elem)
            location = ' '.join([
                poc['contact'].get('city', ''),
                poc['contact'].get('country', '')
            ]).strip() if poc.get('contact', {}) else ''

            publisher = tidy_dict({
                "object_id": generate_uuid_urn(),
                "name": poc.get('organization', ''),
                "location": location
            })
            self.output['publishers'].append(publisher)
            dataset['relationships'].append({
                "relate": "dcterms:publisher",
                # relate to the publisher appended just above
                # (self.output carries a 'publishers' list, not 'publisher')
                "object_id": publisher['object_id']
            })

        dataset['urls'] = []
        dist_elems = extract_elems(self.elem, ['distributionInfo'])
        for dist_elem in dist_elems:
            for d in self._parse_distribution(dist_elem):
                if not d:
                    continue
                url_sha = generate_sha_urn(d)
                if url_sha not in urls:
                    urls.add(url_sha)
                    url_id = generate_uuid_urn()
                    dist = self._generate_harvest_manifest(**{
                        "bcube:hasUrlSource": "Harvested",
                        "bcube:hasConfidence": "Good",
                        "vcard:hasURL": d,
                        "object_id": url_id,
                        "dc:identifier": url_sha
                    })
                    dataset['urls'].append(dist)
                    dataset['relationships'].append({
                        "relate": "dcterms:references",
                        "object_id": url_id
                    })

        self.output['datasets'].append(dataset)
        self.output['keywords'] += keywords

    # TODO: removing this until we have a definition for SERVICE
    # # check for the service elements
    # service_elems = extract_elems(
    #     self.elem, ['identificationInfo', 'SV_ServiceIdentification'])
    # self.description['services'] = []
    # for service_elem in service_elems:
    #     sv = SrvParser(service_elem)
    #     self.description['services'].append(sv.parse())

    # switch the catalog record to a list for conformity. eep.
    self.output['catalog_records'] = [self.output['catalog_record']]
    del self.output['catalog_record']

    self.description = tidy_dict(self.output)
def _parse_service(self):
    output = {}
    urls = set()

    service = {
        "object_id": generate_uuid_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "rdf:type": 'OpenSearch1.1:Description',
        "dcterms:title": extract_item(self.parser.xml, ["ShortName"]),
        "dc:description": ' '.join(
            extract_items(self.parser.xml, ["LongName"]) +
            extract_items(self.parser.xml, ["Description"])),
        "urls": [],
        "webpages": [],
        "relationships": []
    }

    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    service['urls'].append(original_url)
    service['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    # output['source'] = extract_items(self.parser.xml, ["Attribution"])
    # output['contact'] = extract_items(self.parser.xml, ["Developer"])
    # output['rights'] = extract_items(self.parser.xml, ["SyndicationRight"])

    key_id = generate_uuid_urn()
    output['keywords'] = [{
        "object_id": key_id,
        "bcube:hasValue": extract_items(self.parser.xml, ["Tags"])
    }]
    service['relationships'].append({
        "relate": "dc:conformsTo",
        "object_id": key_id
    })

    for t in extract_elems(self.parser.xml, ['Url']):
        ep = self._parse_endpoint(t)
        url_sha = generate_sha_urn(ep['url'])
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Generated",
                "bcube:hasConfidence": "Not Sure",
                "vcard:hasURL": ep['url'],
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            service['urls'].append(dist)

            wb_id = generate_uuid_urn()
            service['webpages'].append({
                "object_id": wb_id,
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })
            service['relationships'].append({
                "relate": "dcterms:references",
                "object_id": wb_id
            })

    output['services'] = [service]

    return tidy_dict(output)
def parse(self):
    '''
    run the routing
    '''
    if not self.identity:
        # we're going to have to sort it out
        self.identity = {}

    metadata = self.identity.get('metadata', {})
    if not metadata:
        return {}

    metadata_type = metadata.get('name', '')
    if not metadata_type:
        return {}

    # TODO: this is unlikely to be correct, given the ds record
    #       but we're not going there just yet
    # TODO: deal with conformsTo (multiple schemaLocations, etc)
    catalog_record = {
        "object_id": generate_uuid_urn(),
        "rdf:type": self._version_to_urn(),
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.parser.xml, ['@schemaLocation']).split(),
        "relationships": [],
        "urls": []
    }
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn()
    })
    catalog_record['urls'].append(original_url)
    catalog_record['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    if metadata_type == 'Data Series':
        # run the set
        self.reader = DsParser(self.parser.xml, catalog_record)
    elif metadata_type == '19119':
        # run that
        for srv in extract_elems(
                self.parser.xml,
                ['identificationInfo', 'SV_ServiceIdentification']):
            # keep the reader on self so its output can be
            # passed back up the chain below
            self.reader = SrvParser(srv, catalog_record)
            self.reader.parse()
    elif metadata_type == '19115':
        # it's a mi/md so run that
        self.reader = MxParser(
            self.parser.xml, catalog_record, self.harvest_details)
        self.reader.parse()

    # pass it back up the chain a bit
    self.description = self.reader.output
def parse(self):
    # for ogc, a catalog record is the getcapabilities rsp
    output = {
        "layers": [],
        "catalog_records": []
    }
    urls = set()

    if not self.reader:
        self.description = {}
        return

    if 'service' in self.identify:
        service_id = generate_uuid_urn()
        service = {
            "object_id": service_id,
            "bcube:dateCreated":
                self.harvest_details.get('harvest_date', ''),
            "bcube:lastUpdated":
                self.harvest_details.get('harvest_date', ''),
            "relationships": [],
            "urls": [],
            "rdf:type": self.urn
        }

        url_sha = generate_sha_urn(self.url)
        urls.add(url_sha)
        url_id = generate_uuid_urn()
        service['urls'].append(
            self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": self.url,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
        )
        service['relationships'].append({
            "relate": "bcube:originatedFrom",
            "object_id": url_id
        })

        # self._get_service_config(service_name, version)
        service_reader = self._parse_service(
            self.reader, self.service_name, self.version)

        # map to triples
        service.update({
            "dc:description": service_reader.get('abstract')
        })

        keywords = service_reader.get('subject', [])
        if keywords:
            output['keywords'] = [{
                "object_id": generate_uuid_urn(),
                "bcube:hasValue": keywords
            }]
            for k in output['keywords']:
                service['relationships'].append({
                    "relate": "dc:conformsTo",
                    "object_id": k['object_id']
                })

        if self.identify['service'].get('request', '') == 'GetCapabilities':
            # this is also awkward. meh. needs must.
            layers = []
            listed_layers = self._parse_getcap_datasets(self.reader)
            for ld in listed_layers:
                layer = {
                    "object_id": generate_uuid_urn(),
                    "bcube:dateCreated":
                        self.harvest_details.get('harvest_date', ''),
                    "bcube:lastUpdated":
                        self.harvest_details.get('harvest_date', ''),
                    "dc:description": ld.get('abstract', ''),
                    "dc:title": ld.get('title', ''),
                    "relationships": []
                }
                service['relationships'].append({
                    "relate": "bcube:contains",
                    "object_id": layer['object_id']
                })

                # add the generated url for the service
                generated_url = self._generate_url(
                    self.url,
                    ld.get('name'),
                    ld.get('bbox'),
                    self.service_name,
                    self.version)
                if generated_url:
                    url_sha = generate_sha_urn(generated_url)
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        layer_url = self._generate_harvest_manifest(**{
                            "vcard:hasURL": generated_url,
                            "bcube:hasUrlSource": "Generated",
                            "bcube:hasConfidence": "Not Sure",
                            "object_id": url_id,
                            "dc:identifier": url_sha
                        })
                        service['urls'].append(layer_url)
                        # don't add to the larger set, but do
                        # include the reference within the layer
                        layer['relationships'].append({
                            "relate": "dcterms:references",
                            "object_id": url_id
                        })

                # add each as a dataset with just a url for now
                for mu in ld.get('metadata_urls', []):
                    url_link = generate_uuid_urn()
                    url_sha = generate_sha_urn(mu.get('url'))
                    if url_sha not in urls:
                        urls.add(url_sha)
                        url_id = generate_uuid_urn()
                        output['catalog_records'] += [{
                            "object_id": url_link,
                            "urls": [
                                self._generate_harvest_manifest(**{
                                    "vcard:hasURL": mu.get('url'),
                                    "bcube:hasUrlSource": "Harvested",
                                    "bcube:hasConfidence": "Good",
                                    "object_id": url_id,
                                    "dc:identifier": url_sha
                                })
                            ],
                            "relationships": [
                                {
                                    "relate": "dc:describes",
                                    "object_id": layer['object_id']
                                },
                                {
                                    "relate": "bcube:originatedFrom",
                                    "object_id": url_id
                                }
                            ]
                        }]

                if 'temporal_extent' in ld:
                    temporal = tidy_dict({
                        "esip:startDate":
                            ld['temporal_extent'].get('begin', ''),
                        "esip:endDate":
                            ld['temporal_extent'].get('end', '')
                    })
                    if temporal:
                        layer.update(temporal)

                if 'bbox' in ld:
                    layer.update(ld['bbox'])

                layers.append(layer)

            service['layers'] = layers
            # if layers:
            #     service['layers'] = layers
            #     for layer in layers:
            #         service['relationships'].append({
            #             "relate": "bcube:contains",
            #             "object_id": layer['object_id']
            #         })

        output['services'] = [service]

    self.description = tidy_dict(output)
def parse_item(self):
    output = {}
    urls = set()

    catalog_object_id = generate_uuid_urn()
    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # this is not necessary as a sha just for set inclusion
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()
    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    for child in key_elem.iterchildren():
        key_type = extract_element_tag(child.tag)
        key_tag = 'strat' if key_type == 'stratum' else key_type
        key_tag = 'temp' if key_tag == 'temporal' else key_tag
        thesaurus = extract_item(child, ['%skt' % key_tag])

        # TODO: split these up
        terms = extract_items(child, ['%skey' % key_tag])

        if terms:
            # if there's a parsing error (bad cdata, etc) may not have
            # TODO: add something for a set without a thesaurus name
            keywords.append(
                tidy_dict({
                    "object_id": generate_uuid_urn(),
                    "dc:partOf": thesaurus,
                    "bcube:hasType": key_type,
                    "bcube:hasValue": terms
                })
            )
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']
    self.description = tidy_dict(output)