def __init__(self, dataset_url):
    """Parse the THREDDS dataset catalog at *dataset_url*.

    Populates ``id``, ``name``, ``metadata``, ``catalog_url`` and
    ``services`` — a list of {'name', 'service', 'url'} dicts, one per
    access method advertised by the catalog.

    :param str dataset_url: URL of a single-dataset THREDDS catalog
    """
    # Defaults first, so the attributes exist even if parsing fails below.
    self.services = []
    self.id = None
    self.name = None
    self.metadata = None
    self.catalog_url = None
    # Get an etree object
    r = requests.get(dataset_url)
    try:
        # BUG FIX: lxml raises ValueError on a unicode string that carries an
        # XML encoding declaration; parse the UTF-8 encoded bytes instead of
        # str(r.text), and guard against invalid XML rather than crashing.
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        logger.error("Error processing %s, invalid XML", dataset_url)
        return
    dataset = tree.find("{%s}dataset" % INV_NS)
    self.id = dataset.get("ID")
    self.name = dataset.get("name")
    self.metadata = dataset.find("{%s}metadata" % INV_NS)
    self.catalog_url = dataset_url.split("?")[0]
    # The serviceName may live on the dataset itself or inside its metadata.
    service_tag = dataset.find("{%s}serviceName" % INV_NS)
    if service_tag is None:
        service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
    service_name = service_tag.text
    for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
        if service.get("serviceType") == "Compound":
            # A Compound service wraps the individual access services.
            for s in service.findall("{%s}service" % INV_NS):
                url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                if s.get("suffix") is not None:
                    url += s.get("suffix")
                self.services.append({'name': s.get('name'),
                                      'service': s.get('serviceType'),
                                      'url': url})
        else:
            url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
            self.services.append({'name': service.get('name'),
                                  'service': service.get('serviceType'),
                                  'url': url})
def __init__(self, dataset_url):
    """Parse the THREDDS dataset catalog at *dataset_url*.

    Populates ``id``, ``name``, ``metadata``, ``catalog_url``,
    ``data_size`` (converted to megabytes when the catalog reports one)
    and ``services`` — a list of {'name', 'service', 'url'} dicts, one
    per access method advertised by the catalog.

    :param str dataset_url: URL of a single-dataset THREDDS catalog
    """
    self.services = []
    self.id = None
    self.name = None
    self.metadata = None
    self.catalog_url = None
    self.data_size = None
    # Get an etree object
    r = requests.get(dataset_url)
    try:
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        # BUG FIX: message said "procesing"; also use lazy %-args.
        logger.error("Error processing %s, invalid XML", dataset_url)
    else:
        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        self.metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]

        # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
        data_size = dataset.find("{%s}dataSize" % INV_NS)
        if data_size is not None:
            self.data_size = float(data_size.text)
            data_units = data_size.get('units')
            # Convert to MB
            if data_units == "bytes":
                self.data_size *= 1e-6
            elif data_units == "Kbytes":
                self.data_size *= 0.001
            elif data_units == "Gbytes":
                self.data_size /= 0.001  # i.e. multiply by 1000
            elif data_units == "Tbytes":
                self.data_size /= 1e-6   # i.e. multiply by 1e6

        # Services
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            # NOTE(review): assumes a <metadata> element exists whenever the
            # dataset lacks its own <serviceName> — confirm against the TDS.
            service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
        service_name = service_tag.text
        for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
            if service.get("serviceType") == "Compound":
                # A Compound service wraps the individual access services.
                for s in service.findall("{%s}service" % INV_NS):
                    url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                    if s.get("suffix") is not None:
                        url += s.get("suffix")
                    # ISO like services need additional parameters
                    if s.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                    self.services.append({'name': s.get('name'),
                                          'service': s.get('serviceType'),
                                          'url': url})
            else:
                url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                # ISO like services need additional parameters
                if service.get('name') in ["iso", "ncml", "uddc"]:
                    url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                self.services.append({'name': service.get('name'),
                                      'service': service.get('serviceType'),
                                      'url': url})
def __init__(self, dataset_url):
    """Parse the THREDDS dataset catalog at *dataset_url*.

    Populates ``id``, ``name``, ``metadata``, ``catalog_url`` and
    ``services`` — a list of {'name', 'service', 'url'} dicts, one per
    access method advertised by the catalog.

    :param str dataset_url: URL of a single-dataset THREDDS catalog
    """
    self.services = []
    self.id = None
    self.name = None
    self.metadata = None
    self.catalog_url = None
    # Get an etree object
    r = requests.get(dataset_url)
    try:
        # BUG FIX: lxml raises ValueError (not XMLSyntaxError) on a unicode
        # string with an XML encoding declaration, so str(r.text) let that
        # error escape this handler; parse the UTF-8 encoded bytes instead.
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        # BUG FIX: message said "procesing".
        logger.error("Error processing %s, invalid XML", dataset_url)
    else:
        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        self.metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]
        # The serviceName may live on the dataset itself or inside its metadata.
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
        service_name = service_tag.text
        for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
            if service.get("serviceType") == "Compound":
                # A Compound service wraps the individual access services.
                for s in service.findall("{%s}service" % INV_NS):
                    url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                    if s.get("suffix") is not None:
                        url += s.get("suffix")
                    # ISO like services need additional parameters
                    if s.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, urllib.quote_plus(self.catalog_url))
                    self.services.append({'name': s.get('name'),
                                          'service': s.get('serviceType'),
                                          'url': url})
            else:
                url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                # BUG FIX: this branch previously tested s.get('name'), but `s`
                # is only bound inside the Compound loop above, so this raised
                # NameError (or consulted a stale element). Use `service`.
                if service.get('name') in ["iso", "ncml", "uddc"]:
                    url += "?dataset=%s&catalog=%s" % (self.id, urllib.quote_plus(self.catalog_url))
                self.services.append({'name': service.get('name'),
                                      'service': service.get('serviceType'),
                                      'url': url})
def _run(self, url):
    """Recursively crawl the THREDDS catalog at *url*.

    Generator: yields "<catalog_url>?dataset=<id>" strings for every leaf
    dataset (an element with a urlPath) that passes the ``skip`` and
    ``select`` filters. Already-visited catalogs are crawled only once.

    :param str url: catalog URL (an .html URL is rewritten to .xml)
    """
    if url in self.visited:
        logger.debug("Skipping %s (already crawled)" % url)
        return
    self.visited.append(url)
    logger.info("Crawling: %s" % url)

    # Always request the XML form of the catalog.
    u = urlparse.urlsplit(url)
    _, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse.urlsplit(url.replace(".html", ".xml"))
    url = u.geturl()

    # Get an etree object
    try:
        r = requests.get(url)
        # BUG FIX: lxml raises ValueError on unicode input carrying an XML
        # encoding declaration, so str(r.text) made perfectly valid catalogs
        # land in the except branch and be skipped; parse the bytes instead.
        tree = etree.XML(r.text.encode('utf-8'))
    except Exception:
        # Narrowed from BaseException so KeyboardInterrupt/SystemExit still
        # propagate; network or parse failures are logged and skipped.
        logger.error("Skipping %s (error parsing getting XML)" % url)
        return

    # Crawl the catalogRefs:
    for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
        # Check skips
        title = ref.get("{%s}title" % XLINK_NS)
        if any(x.match(title) for x in self.skip):
            logger.info("Skipping catalogRef based on 'skips'. Title: %s" % title)
            continue
        for ds in self._run(url=construct_url(url, ref.get("{%s}href" % XLINK_NS))):
            yield ds

    # Get the leaf datasets
    for leaf in tree.findall('.//{%s}dataset[@urlPath]' % INV_NS):
        # Subset by the skips
        name = leaf.get("name")
        if any(x.match(name) for x in self.skip):
            logger.info("Skipping dataset based on 'skips'. Name: %s" % name)
            continue
        # Subset by the Selects defined
        gid = leaf.get('ID')
        if self.select is not None:
            if gid is not None and any(x.match(gid) for x in self.select):
                logger.debug("Processing %s" % gid)
                yield "%s?dataset=%s" % (url, gid)
            else:
                logger.info("Ignoring dataset based on 'selects'. ID: %s" % gid)
                continue
        else:
            logger.debug("Processing %s" % gid)
            yield "%s?dataset=%s" % (url, gid)
def _compile_references(self, url, tree):
    '''
    Returns a list of catalog reference URLs for the current catalog

    :param str url: URL for the current catalog
    :param lxml.etree.Element tree: Current XML Tree
    '''
    refs = []
    for node in tree.findall('.//{%s}catalogRef' % INV_NS):
        title = node.get("{%s}title" % XLINK_NS)
        # Titles matching any configured skip pattern are ignored.
        skipped = any(pattern.match(title) for pattern in self.skip)
        if skipped:
            logger.info("Skipping catalogRef based on 'skips'. Title: %s" % title)
        else:
            refs.append(construct_url(url, node.get("{%s}href" % XLINK_NS)))
    return refs
def _compile_references(self, url, tree):
    '''
    Returns a list of catalog reference URLs for the current catalog

    :param str url: URL for the current catalog
    :param lxml.etree.Element tree: Current XML Tree
    '''
    found = []
    catalog_refs = tree.findall('.//{%s}catalogRef' % INV_NS)
    for catalog_ref in catalog_refs:
        title = catalog_ref.get("{%s}title" % XLINK_NS)
        if not any(expr.match(title) for expr in self.skip):
            # Not filtered out: resolve the href against the current catalog.
            href = catalog_ref.get("{%s}href" % XLINK_NS)
            found.append(construct_url(url, href))
            continue
        logger.info(
            "Skipping catalogRef based on 'skips'. Title: %s" % title)
    return found
def __init__(self, dataset_url, auth=None):
    """Fetch and parse the THREDDS dataset catalog at *dataset_url*.

    Populates ``id``, ``name``, ``catalog_url``, ``data_size`` (converted
    to megabytes when reported), ``metadata`` (the serialized <metadata>
    element, so the object stays picklable) and ``services`` — a list of
    {'name', 'service', 'url'} dicts, one per access method.

    :param str dataset_url: URL of a single-dataset THREDDS catalog
    :param auth: optional requests auth object forwarded to requests.get
    """
    self.services = []
    self.id = None
    self.name = None
    self.catalog_url = None
    self.data_size = None

    # Fetch the catalog XML.
    # NOTE(review): verify=False disables TLS certificate validation —
    # confirm this is intentional for the servers being crawled.
    r = requests.get(dataset_url, auth=auth, verify=False)
    try:
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        logger.error("Error procesing %s, invalid XML" % dataset_url)
        return

    try:
        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]

        # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
        size_node = dataset.find("{%s}dataSize" % INV_NS)
        if size_node is not None:
            self.data_size = float(size_node.text)
            # Convert to MB (operations kept exactly as before so the
            # floating-point results are unchanged).
            to_megabytes = {
                "bytes":  lambda v: v * 1e-6,
                "Kbytes": lambda v: v * 0.001,
                "Gbytes": lambda v: v / 0.001,
                "Tbytes": lambda v: v / 1e-6,
            }
            convert = to_megabytes.get(size_node.get('units'))
            if convert is not None:
                self.data_size = convert(self.data_size)

        # Services: prefer the dataset's <serviceName>, then the one in
        # its <metadata> block.
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None and metadata is not None:
            service_tag = metadata.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            # Use services found in the file. FMRC aggs do this.
            services = tree.findall(".//{%s}service[@serviceType='Compound']" % INV_NS)
        else:
            # Use specific named services
            services = tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_tag.text))

        def record(svc):
            # Build and store the access URL for a single service element.
            url = construct_url(dataset_url, svc.get('base')) + dataset.get("urlPath")
            url += svc.get("suffix") or ""
            # ISO like services need additional parameters
            if svc.get('name') in ["iso", "ncml", "uddc"]:
                url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
            self.services.append({'name': svc.get('name'),
                                  'service': svc.get('serviceType'),
                                  'url': url})

        for service in services:
            if service.get("serviceType") == "Compound":
                for nested in service.findall("{%s}service" % INV_NS):
                    record(nested)
            else:
                record(service)

        # Element objects are not pickable to save as a string
        try:
            self.metadata = etree.tostring(metadata)
        except TypeError:
            self.metadata = None
    except BaseException as e:
        logger.exception('Could not process {}. {}.'.format(dataset_url, e))