def _handle_elem(self, elem, child_tags, base_url, service_bases):
    description = self._get_items(
        extract_element_tag(elem.tag), elem, base_url, service_bases)
    description['source'] = extract_element_tag(elem.tag)

    endpoints = []
    for child_tag in child_tags:
        for child_elem in extract_elems(elem, [child_tag]):
            child_desc = self._get_items(
                extract_element_tag(child_elem.tag),
                child_elem,
                base_url,
                service_bases)
            child_desc['childOf'] = description.get('ID', '')
            child_desc['source'] = extract_element_tag(child_tag)
            endpoints.append(child_desc)

            # link the parent back to this child (once per child, so the
            # parentOf list stays free of duplicates)
            if 'ID' in child_desc:
                description.setdefault('parentOf', []).append(
                    child_desc['ID'])

    return description, endpoints
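# The extract_element_tag helper is defined elsewhere in the package; as a
# rough, hedged sketch of what the code above relies on, it only needs to
# strip the lxml/Clark-notation namespace from a tag name. The body below is
# an assumption for illustration, not the package's actual implementation.
def extract_element_tag(tag):
    # '{http://www.w3.org/1999/xlink}href' -> 'href';
    # tags without a namespace pass through unchanged
    return tag.split('}')[-1]

print(extract_element_tag('{http://www.w3.org/1999/xlink}href'))  # href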
def _run_element(self, elem, service_bases):
    '''
    for a given element, return any text() and any attribute values
    '''
    # run a generated xpath on the given element, skipping the
    # metadata/dataset/catalogRef children handled elsewhere
    children = elem.xpath(
        './node()[local-name()!="metadata" '
        'and local-name()!="dataset" '
        'and local-name()!="catalogRef"]')
    element = {_normalize_key(extract_element_tag(k)): v
               for k, v in elem.attrib.iteritems()}
    element = self._manage_id(element)

    for child in children:
        value = child.text
        # xp = generate_qualified_xpath(child, True)
        tag = _normalize_key(extract_element_tag(child.tag))
        if value:
            element[tag] = value
        for k, v in child.attrib.iteritems():
            if v:
                element[tag + '_' + _normalize_key(extract_element_tag(k))] = v

    # get the service bases, limited to the named service if one is set
    if any(g.endswith('serviceName') for g in element):
        sbs = [v for k, v in service_bases.iteritems()
               if k == element.get('serviceName')]
    else:
        sbs = service_bases.values()
    # send a unique list of base relative paths
    sbs = list(set(sbs))

    url_key = next((g for g in element if g.endswith('url')), '')
    if url_key:
        # for service urls, if catalog.xml isn't appended it will resolve to
        # the html endpoint (not desired). so if the path equals the/a path in
        # the service bases, append catalog.xml to the path
        # elem_url = element[url_key]
        # if elem_url in sbs or not sbs:
        #     elem_url += ('' if elem_url.endswith('/') else '/') + 'catalog.xml'
        #
        # element['url'] = intersect_url(base_url, elem_url, sbs)
        #
        # element['url'] = base_url

        # let's generate the link
        tl = ThreddsLink(elem, self.url, sbs)
        element['url'] = tl.urls
        element['actionable'] = 2

    return element
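# Self-contained illustration (not from the source) of the service-base
# selection above: when the element names a THREDDS service, only that
# service's relative base is kept; otherwise every known base is a
# candidate. All sample values here are made up.
element = {'serviceName': 'odap', 'urlPath': 'data/grid.nc'}
service_bases = {'odap': '/thredds/dodsC/', 'http': '/thredds/fileServer/'}

if any(k.endswith('serviceName') for k in element):
    sbs = [v for k, v in service_bases.items()
           if k == element.get('serviceName')]
else:
    sbs = list(service_bases.values())
print(sorted(set(sbs)))  # ['/thredds/dodsC/']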
def parse(self):
    elem = self.parser.xml
    ncml = {'variables': []}
    ncml['identifier'] = elem.attrib.get('location', '')

    for variable in extract_elems(elem, ['variable']):
        v = {}
        v['name'] = variable.attrib.get('name', '')
        v['attributes'] = []
        for att in extract_elems(variable, ['attribute']):
            a = {}
            for key, value in att.attrib.iteritems():
                tag = extract_element_tag(key)
                if tag == 'values':
                    continue
                a[tag] = value.strip()
            if a:
                v['attributes'] += [a]
        v = tidy_dict(v)
        if v:
            ncml['variables'].append(v)

    return tidy_dict(ncml)
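# Illustrative, self-contained rework of the NcML walk above using plain
# lxml calls instead of the package helpers (extract_elems, tidy_dict);
# the sample NcML document and variable names are invented.
from lxml import etree

ncml_doc = etree.fromstring(
    '<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2"'
    ' location="sst.nc">'
    '<variable name="sst">'
    '<attribute name="units" value="degC"/>'
    '</variable>'
    '</netcdf>')

variables = []
for var in ncml_doc.xpath('//*[local-name()="variable"]'):
    atts = [dict(att.attrib)
            for att in var.xpath('*[local-name()="attribute"]')]
    variables.append({'name': var.get('name', ''), 'attributes': atts})

print({'identifier': ncml_doc.get('location', ''), 'variables': variables})
# {'identifier': 'sst.nc', 'variables': [{'name': 'sst',
#  'attributes': [{'name': 'units', 'value': 'degC'}]}]}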
def _generate(self):
    tag = extract_element_tag(self.elem.tag)

    # find the href or urlPath
    if tag == 'dataset':
        href = self.elem.attrib.get('urlPath', None)
        if href is None:
            href = next(
                iter(self.elem.xpath(
                    '*[local-name()="access"]/@*[local-name()="urlPath"]')),
                None)
        # if there's a urlPath (anywhere), it *should* be the
        # terminal file path
        hrefs = [
            self._generate_url('/'.join([service, href]),
                               self._get_ogc_params(service))
            for service in self.services
        ]
    elif tag == 'catalogRef':
        href = self.elem.attrib.get(
            '{http://www.w3.org/1999/xlink}href', '')
        hrefs = [self._generate_url(href)]
    else:
        # not a linkable element
        hrefs = []

    self.urls = hrefs
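# Hedged sketch of what _generate_url might do (an assumption, not the
# package's actual helper, which is a bound method and presumably resolves
# against self.url): join a service-relative path onto the catalog URL and
# append any OGC query parameters. The example URL and parameters are
# invented.
try:
    from urllib.parse import urljoin, urlencode  # Python 3
except ImportError:
    from urlparse import urljoin                 # Python 2
    from urllib import urlencode

def _generate_url(base_url, path, ogc_params=None):
    url = urljoin(base_url, path)
    if ogc_params:
        url += '?' + urlencode(ogc_params)
    return url

print(_generate_url('http://example.com/thredds/catalog.xml',
                    '/thredds/wms/gridded/sst.nc',
                    {'service': 'WMS', 'request': 'GetCapabilities'}))
# http://example.com/thredds/wms/gridded/sst.nc?service=WMS&request=GetCapabilities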
def parse_item(self):
    output = {}

    urls = set()

    catalog_object_id = generate_uuid_urn()

    output['catalog_record'] = {
        "object_id": catalog_object_id,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        # "dc:conformsTo": extract_attrib(
        #     self.elem, ['@noNamespaceSchemaLocation']).split(),
        "rdf:type": "FGDC:CSDGM",
        "relationships": [],
        "urls": []
    }
    output['urls'] = []

    # add the harvest info
    # (this doesn't need to be a sha; it's just for set inclusion)
    url_sha = generate_sha_urn(self.url)
    urls.add(url_sha)
    original_url = self._generate_harvest_manifest(**{
        "bcube:hasUrlSource": "Harvested",
        "bcube:hasConfidence": "Good",
        "vcard:hasURL": self.url,
        "object_id": generate_uuid_urn(),
        "dc:identifier": url_sha
    })
    output['catalog_record']['urls'].append(original_url)
    # NOTE: this is not the sha from the url
    output['catalog_record']['relationships'].append({
        "relate": "bcube:originatedFrom",
        "object_id": original_url['object_id']
    })

    datsetid = extract_item(self.elem, ['idinfo', 'datsetid'])
    dataset_object_id = generate_uuid_urn()

    dataset = {
        "object_id": dataset_object_id,
        "dcterms:identifier": datsetid,
        "bcube:dateCreated": self.harvest_details.get('harvest_date', ''),
        "bcube:lastUpdated": self.harvest_details.get('harvest_date', ''),
        "dc:description": extract_item(
            self.elem, ['idinfo', 'descript', 'abstract']),
        "dcterms:title": extract_item(
            self.elem, ['idinfo', 'citation', 'citeinfo', 'title']),
        "urls": [],
        "relationships": []
    }

    bbox_elem = extract_elem(self.elem, ['idinfo', 'spdom', 'bounding'])
    if bbox_elem is not None:
        # that's not even valid
        west = extract_item(bbox_elem, ['westbc'])
        east = extract_item(bbox_elem, ['eastbc'])
        north = extract_item(bbox_elem, ['northbc'])
        south = extract_item(bbox_elem, ['southbc'])
        bbox = [west, south, east, north]
        bbox = bbox_to_geom(bbox)
        bbox = to_wkt(bbox)
        dataset.update({
            "dc:spatial": bbox,
            "esip:westBound": west,
            "esip:eastBound": east,
            "esip:northBound": north,
            "esip:southBound": south
        })

    time_elem = extract_elem(self.elem, ['idinfo', 'timeperd', 'timeinfo'])
    if time_elem is not None:
        caldate = extract_item(time_elem, ['sngdate', 'caldate'])
        if caldate:
            # TODO: we should see if it's at least a valid date
            dataset['esip:startDate'] = self._convert_date(caldate)

        rngdate = extract_elem(time_elem, ['rngdates'])
        if rngdate is not None:
            dataset['esip:startDate'] = self._convert_date(
                extract_item(rngdate, ['begdate']))
            dataset['esip:endDate'] = self._convert_date(
                extract_item(rngdate, ['enddate']))
        # TODO: add the min/max of the list of dates

    dataset['relationships'] = [{
        "relate": "bcube:hasMetadataRecord",
        "object_id": catalog_object_id
    }]

    publisher = {
        "object_id": generate_uuid_urn(),
        "name": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'publish']),
        "location": extract_item(
            self.elem,
            ['idinfo', 'citation', 'citeinfo', 'pubinfo', 'pubplace'])
    }
    output['publisher'] = publisher
    dataset['relationships'].append({
        "relate": "dcterms:publisher",
        "object_id": publisher['object_id']
    })

    distrib_elems = extract_elems(
        self.elem, ['distinfo', 'stdorder', 'digform'])
    for distrib_elem in distrib_elems:
        link = extract_item(
            distrib_elem,
            ['digtopt', 'onlinopt', 'computer', 'networka', 'networkr'])
        # format = extract_item(distrib_elem, ['digtinfo', 'formname'])
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            # this is a distribution link so
            # we are assuming it is to data
            dataset['relationships'].append({
                "relate": "dcterms:references",
                "object_id": url_id
            })

    webpages = []
    onlink_elems = extract_elems(
        self.elem, ['idinfo', 'citation', 'citeinfo', 'onlink'])
    for onlink_elem in onlink_elems:
        link = onlink_elem.text.strip() if onlink_elem.text else ''
        if not link:
            continue
        url_sha = generate_sha_urn(link)
        if url_sha not in urls:
            urls.add(url_sha)
            url_id = generate_uuid_urn()
            dist = self._generate_harvest_manifest(**{
                "bcube:hasUrlSource": "Harvested",
                "bcube:hasConfidence": "Good",
                "vcard:hasURL": link,
                "object_id": url_id,
                "dc:identifier": url_sha
            })
            dataset['urls'].append(dist)
            webpages.append({
                "object_id": generate_uuid_urn(),
                "relationships": [{
                    "relate": "dcterms:references",
                    "object_id": url_id
                }]
            })

    output['catalog_record']['webpages'] = webpages
    for webpage in webpages:
        dataset['relationships'].append({
            "relate": "dcterms:references",
            "object_id": webpage['object_id']
        })

    # retain the keyword sets with type, thesaurus name and split
    # the terms as best we can
    keywords = []
    key_elem = extract_elem(self.elem, ['idinfo', 'keywords'])
    if key_elem is not None:
        for child in key_elem.iterchildren():
            key_type = extract_element_tag(child.tag)
            key_tag = 'strat' if key_type == 'stratum' else key_type
            key_tag = 'temp' if key_tag == 'temporal' else key_tag
            thesaurus = extract_item(child, ['%skt' % key_tag])

            # TODO: split these up
            terms = extract_items(child, ['%skey' % key_tag])

            if terms:
                # if there's a parsing error (bad cdata, etc.) we may not
                # have any terms
                # TODO: add something for a set without a thesaurus name
                keywords.append(
                    tidy_dict({
                        "object_id": generate_uuid_urn(),
                        "dc:partOf": thesaurus,
                        "bcube:hasType": key_type,
                        "bcube:hasValue": terms
                    })
                )
    output['keywords'] = keywords
    for keyword in keywords:
        dataset['relationships'].append({
            "relate": "dc:conformsTo",
            "object_id": keyword['object_id']
        })

    output['datasets'] = [dataset]

    # add the metadata relate
    output['catalog_record']['relationships'].append({
        "relate": "foaf:primaryTopic",
        "object_id": dataset_object_id
    })

    output['catalog_records'] = [output['catalog_record']]
    del output['catalog_record']

    self.description = tidy_dict(output)