def __init__(self, dataset_url):
    self.services    = []
    self.id          = None
    self.name        = None
    self.metadata    = None
    self.catalog_url = None
    self.data_size   = None

    # Get an etree object
    r = requests.get(dataset_url)
    try:
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        logger.error("Error processing %s, invalid XML" % dataset_url)
    else:
        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        self.metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]

        # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
        data_size = dataset.find("{%s}dataSize" % INV_NS)
        if data_size is not None:
            self.data_size = float(data_size.text)
            data_units = data_size.get('units')
            # Convert to MB
            if data_units == "bytes":
                self.data_size *= 1e-6
            elif data_units == "Kbytes":
                self.data_size *= 0.001
            elif data_units == "Gbytes":
                self.data_size *= 1000.0
            elif data_units == "Tbytes":
                self.data_size *= 1e6

        # Services: the serviceName may live on the dataset itself or in its metadata block
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
        service_name = service_tag.text

        for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
            if service.get("serviceType") == "Compound":
                for s in service.findall("{%s}service" % INV_NS):
                    url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                    if s.get("suffix") is not None:
                        url += s.get("suffix")
                    # ISO-like services need additional parameters
                    if s.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                    self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
            else:
                url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                # ISO-like services need additional parameters
                if service.get('name') in ["iso", "ncml", "uddc"]:
                    url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})
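
# A hedged, standalone restatement of the dataSize-to-MB conversion above,
# handy for sanity-checking the branches. The helper name _to_megabytes and
# the explicit "Mbytes" pass-through are assumptions, not part of the
# original class; the factors mirror the if/elif chain in __init__.
def _to_megabytes(value, units):
    factors = {"bytes": 1e-6, "Kbytes": 1e-3, "Mbytes": 1.0,
               "Gbytes": 1e3, "Tbytes": 1e6}
    # Unknown units fall through unchanged, matching the original's
    # implicit behavior of leaving the value as-is.
    return value * factors.get(units, 1.0)

assert abs(_to_megabytes(2048.0, "Kbytes") - 2.048) < 1e-9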
def find_urls(base, search_str):
    # NOTE: startdate and enddate are expected to be datetime objects
    # defined at module scope.
    INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
    url = os.path.join(base, 'catalog.xml')
    print "Crawling: %s" % url
    skips = Crawl.SKIPS + [".*Courier*", ".*Express*", ".*Normal*",
                           ".*Priority*", ".*.cfg$"]
    u = urlparse.urlsplit(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse.urlsplit(url.replace(".html", ".xml"))
    url = u.geturl()
    urls = []

    # Get an etree object
    try:
        r = requests.get(url)
        tree = etree.XML(r.text.encode('utf-8'))

        # Crawl the catalogRefs:
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
            try:
                # Get the mission directory name and extract the start and ending dates
                mission_dir_name = ref.attrib['{http://www.w3.org/1999/xlink}title']
                dts = mission_dir_name.split('_')
                dir_start = datetime.datetime.strptime(dts[0], '%Y%m%d')
                dir_end = datetime.datetime.strptime(dts[1], '%Y%m%d')

                # If within a valid range, grab the valid urls
                if dir_start >= startdate and dir_end <= enddate:
                    print 'Found mission directory ' + dts[0]
                    print 'Searching if within range %s and %s %s %s' % (
                        startdate, enddate, dir_start, dir_end)
                    catalog = ref.attrib['{http://www.w3.org/1999/xlink}href']
                    c = Crawl(os.path.join(base, catalog), select=[search_str], skip=skips)
                    d = [s.get("url") for ds in c.datasets
                         for s in ds.services
                         if s.get("service").lower() == "opendap"]
                    urls.extend(d)
            except Exception as ex:
                print "Error reading mission directory name %s" % ex
    except BaseException:
        print "Skipping %s (error parsing the XML)" % url

    if not urls:
        raise FileNotFound('No urls matching "{}" found in {}'.format(
            search_str, os.path.join(base, 'catalog.html')))

    return urls
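
# A hedged example of calling find_urls(). The base URL below is a made-up
# placeholder, and the module-level startdate/enddate globals the function
# reads are assumptions about the surrounding script.
startdate = datetime.datetime(2015, 1, 1)
enddate = datetime.datetime(2015, 12, 31)
for u in find_urls('http://dods.example.org/thredds/catalog/missions',
                   search_str='.*shore.nc4$'):
    print u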
def _run(self, url):
    if url in self.visited:
        logger.debug("Skipping %s (already crawled)" % url)
        return
    self.visited.append(url)
    logger.info("Crawling: %s" % url)

    u = urlparse.urlsplit(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse.urlsplit(url.replace(".html", ".xml"))
    url = u.geturl()

    # Get an etree object; encode to bytes so lxml handles non-ASCII responses
    try:
        r = requests.get(url)
        tree = etree.XML(r.text.encode('utf-8'))
    except BaseException:
        logger.error("Skipping %s (error parsing the XML)" % url)
        return

    # Crawl the catalogRefs:
    for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
        # Check skips
        title = ref.get("{%s}title" % XLINK_NS)
        if not any([x.match(title) for x in self.skip]):
            for ds in self._run(url=construct_url(url, ref.get("{%s}href" % XLINK_NS))):
                yield ds
        else:
            logger.info("Skipping catalogRef based on 'skips'. Title: %s" % title)
            continue

    # Get the leaf datasets
    for leaf in tree.findall('.//{%s}dataset[@urlPath]' % INV_NS):
        # Subset by the skips
        name = leaf.get("name")
        if any([x.match(name) for x in self.skip]):
            logger.info("Skipping dataset based on 'skips'. Name: %s" % name)
            continue

        # Subset by the selects defined
        gid = leaf.get('ID')
        if self.select is not None:
            if gid is not None and any([x.match(gid) for x in self.select]):
                logger.debug("Processing %s" % gid)
                yield "%s?dataset=%s" % (url, gid)
            else:
                logger.info("Ignoring dataset based on 'selects'. ID: %s" % gid)
                continue
        else:
            logger.debug("Processing %s" % gid)
            yield "%s?dataset=%s" % (url, gid)
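
# How _run() is driven in practice, judging from the Crawl(...) call in
# find_urls() above: the constructor (not shown here) walks the catalog
# recursively and exposes the results as .datasets. The catalog URL below
# is a placeholder.
c = Crawl('http://tds.example.org/thredds/catalog.xml',
          select=['.*\.nc$'], skip=Crawl.SKIPS)
for d in c.datasets:
    print d.id, [s.get('url') for s in d.services]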
def __init__(self, dataset_url, estimate_size=False):
    self.services      = []
    self.id            = None
    self.name          = None
    self.metadata      = None
    self.catalog_url   = None
    self.data_size     = None
    # Stored but not consumed anywhere in the code shown here
    self.estimate_size = estimate_size

    # Get an etree object; encode to bytes so lxml handles non-ASCII responses
    r = requests.get(dataset_url)
    try:
        tree = etree.XML(r.text.encode('utf-8'))
    except etree.XMLSyntaxError:
        logger.error("Error processing %s, invalid XML" % dataset_url)
    else:
        dataset = tree.find("{%s}dataset" % INV_NS)
        self.id = dataset.get("ID")
        self.name = dataset.get("name")
        self.metadata = dataset.find("{%s}metadata" % INV_NS)
        self.catalog_url = dataset_url.split("?")[0]

        # Data Size - http://www.unidata.ucar.edu/software/thredds/current/tds/catalog/InvCatalogSpec.html#dataSize
        data_size = dataset.find("{%s}dataSize" % INV_NS)
        if data_size is not None:
            self.data_size = float(data_size.text)
            data_units = data_size.get('units')
            # Convert to MB
            if data_units == "bytes":
                self.data_size *= 1e-6
            elif data_units == "Kbytes":
                self.data_size *= 0.001
            elif data_units == "Gbytes":
                self.data_size *= 1000.0
            elif data_units == "Tbytes":
                self.data_size *= 1e6

        # Services
        service_tag = dataset.find("{%s}serviceName" % INV_NS)
        if service_tag is None:
            service_tag = self.metadata.find("{%s}serviceName" % INV_NS)
        service_name = service_tag.text

        for service in tree.findall(".//{%s}service[@name='%s']" % (INV_NS, service_name)):
            if service.get("serviceType") == "Compound":
                for s in service.findall("{%s}service" % INV_NS):
                    # The URL construction was elided in the original; this
                    # mirrors the first __init__ above.
                    url = construct_url(dataset_url, s.get('base')) + dataset.get("urlPath")
                    if s.get("suffix") is not None:
                        url += s.get("suffix")
                    # ISO-like services need additional parameters
                    if s.get('name') in ["iso", "ncml", "uddc"]:
                        url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                    self.services.append({'name': s.get('name'), 'service': s.get('serviceType'), 'url': url})
            else:
                # Likewise elided in the original; mirrors the first __init__.
                url = construct_url(dataset_url, service.get('base')) + dataset.get("urlPath") + service.get("suffix", "")
                if service.get('name') in ["iso", "ncml", "uddc"]:
                    url += "?dataset=%s&catalog=%s" % (self.id, quote_plus(self.catalog_url))
                self.services.append({'name': service.get('name'), 'service': service.get('serviceType'), 'url': url})
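
# A minimal usage sketch for the constructor above, assuming it belongs to
# a class named LeafDataset (the class statement is not shown); the dataset
# URL is a made-up placeholder.
ds = LeafDataset('http://tds.example.org/thredds/catalog.xml?dataset=my/dataset/id')
print ds.name, ds.data_size  # data_size is in MB, or None if not advertised
opendap = [s for s in ds.services if s['service'].lower() == 'opendap']
if opendap:
    print opendap[0]['url']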