def crawl(url):
    c = Crawl(url, select=['.*meta.nc'], skip=None, debug=None)
    locsite = []
    for jr in c.datasets:
        locsite.append(str(jr.id))
    rdro = [
        'http://tds0.ifremer.fr/thredds/catalog/' +
        '/'.join(jj.split('/')[:-1]) + '/catalog.html'
        for jj in locsite
    ]
    print(rdro)
    #print ('the new data is', rdro[0:1])
    gd = 0
    sitenamerd = []
    #for kk in rdro:
    for kk in rdro[0:4]:  # Only the first few Argo floats are crawled to reduce run time; change to rdro to crawl them all.
        urd = rdro[gd]
        crr = Crawl(urd,
                    select=None,
                    skip=['.*meta.nc', '.*Rtraj.nc', '.*tech.nc'],
                    debug=None)
        gd += 1
        for pp in crr.datasets:
            #print (pp.id)
            sitenamerd.append(
                ['http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)])
    return sitenamerd
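# A minimal usage sketch for crawl() above. The catalog path is a placeholder
# (any Ifremer THREDDS catalog listing Argo float directories with *meta.nc
# files should work); it is not taken from the snippet itself:
#
#   sitenamerd = crawl('http://tds0.ifremer.fr/thredds/catalog/<argo-dac-path>/catalog.xml')
#   print(len(sitenamerd), 'OPeNDAP dataset URLs collected')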
def test_modified_time(self):
    # after with timezone
    af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc)
    c = Crawl(
        "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml",
        after=af)
    assert len(c.datasets) == 3

    # after without timezone
    af = datetime(2015, 12, 30, 0, 0)
    c = Crawl(
        "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml",
        after=af)
    assert len(c.datasets) == 3

    # before
    bf = datetime(2016, 1, 8, 0, 0)
    c = Crawl(
        "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml",
        before=bf)
    assert len(c.datasets) == 3

    # both
    af = datetime(2016, 1, 20, 0, 0)
    bf = datetime(2016, 2, 1, 0, 0)
    c = Crawl(
        "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml",
        before=bf, after=af)
    assert len(c.datasets) == 11
def url_trawler(self, url, expr):
    if url.endswith(".xml"):
        c = Crawl(url, select=[expr])
    elif url.endswith("/"):
        # try appending catalog.xml, as the user may have just provided a directory
        c = Crawl(url + "catalog.xml", select=[expr])
    else:
        # try appending /catalog.xml, as the user may have just provided a directory
        c = Crawl(url + "/catalog.xml", select=[expr])
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "opendap"
    ]
    return urls
def test_root_finder(self):
    urls = [
        ('http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.dataset.106_224.thredds.xml',
         'http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.thredds.xml',
         False),
        ('http://www.esrl.noaa.gov/psd/thredds/catalog/Datasets/noaa.oisst.v2.derived/catalog.xml',
         'http://www.esrl.noaa.gov/psd/thredds/catalog.xml',
         True),
        ('https://rsg.pml.ac.uk/thredds/catalog/cnr/3b42-3h/1998/01/01/catalog.xml',
         'https://rsg.pml.ac.uk/thredds/catalog.xml',
         True)
    ]
    for url, expected, output in urls:
        crawler = Crawl(url)
        found_url = crawler._find_root_url()
        assert (found_url == expected) == output
def loadGliders(loader, stride=1):
    '''
    Crawl the IOOS Glider TDS for OPeNDAP links of Time aggregated files and load into STOQS
    '''
    c = Crawl("http://tds.gliders.ioos.us/thredds/catalog.xml",
              select=[".*_Time$"])
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "opendap"
    ]
    colors = list(loader.colors.values())

    for url in urls:
        aName = url.split('/')[-1].split('.')[0]
        pName = aName.replace('_Time', '')
        if pName.find('-') != -1:
            logger.warn(
                "Replacing '-' characters in platform name %s with '_'s",
                pName)
            pName = pName.replace('-', '_')

        logger.info("Executing runGliderLoader with url = %s", url)
        try:
            runGliderLoader(url, loader.campaignName, il.campaignDescription,
                            aName, pName, colors.pop(), 'glider',
                            'Glider Mission', loader.parms, loader.dbAlias,
                            stride, loader.startDatetime, loader.endDatetime,
                            il.grdTerrain)
        except Exception as e:
            logger.error('%s. Skipping this dataset.', e)
def test_regex_selects(self):
    c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
    assert len(c.datasets) == 9

    # Get all DAP links:
    services = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "opendap"
    ]
    assert len(services) == 9
def main(
        url='http://opendap-devel.ooi.rutgers.edu:8090/thredds/catalog/first-in-class/catalog.xml',
        stmt='.*ncml'):
    C = Crawl(url, select=[stmt])
    tds = 'http://opendap-devel.ooi.rutgers.edu:8090/thredds/dodsC/'
    reg_ex = re.compile('|'.join(['config', 'meta', 'engine', 'diag']))

    data = []
    for dataset in C.datasets:
        if reg_ex.search(dataset.id) is not None:
            continue
        file = tds + dataset.id
        with xr.open_dataset(file) as ds:
            ds_disk = ds.swap_dims({'obs': 'time'})  # change dimensions from 'obs' to 'time'
            # ds_variables = ds.data_vars.keys()  # List of dataset variables
            refdes = ds.subsite + '-' + ds.node + '-' + ds.sensor
            stream = ds_disk.stream  # Stream name associated with the data
            delivery = ds.collection_method
            start = ds.time_coverage_start
            end = ds.time_coverage_end
            data.append((refdes, stream, delivery, 'Yes', 'Yes',
                         start + ' to ' + end))

    df = pd.DataFrame(data, columns=[
        'RefDes', 'Stream', 'Delivery Method', 'Data Downloaded',
        'Data range - MIO', 'Time Range'
    ])
    df.to_csv('/Users/michaesm/Documents/Summary.csv')
def test_single_dataset(self):
    c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml",
              select=["MODIS-Agg"])
    assert len(c.datasets) == 1
    assert c.datasets[0].id == "MODIS-Agg"
    assert len(c.datasets[0].services) == 2
    service_names = sorted(map(lambda x: x.get('service'),
                               c.datasets[0].services))
    assert service_names == ["ISO", "OPENDAP"]
def thredds_find_glob(base_catalog: str, skips: List[str],
                      select: List[str], workers: int = 8) -> List[str]:
    """Glob YAMLs from a base THREDDS catalog recursively

    Arguments:
        base_catalog {str} -- Base of the catalog to crawl from
        skips {list} -- Paths to skip in addition to NCI-specific defaults
        select {list} -- Paths to select (useful YAMLs)
        workers {int} -- Number of workers to use for THREDDS crawling

    Returns:
        list -- List of THREDDS-hosted dataset YAML URLs to index
    """
    # Combine the defaults with the caller's skips. Note that list.extend()
    # returns None, so avoid `user_skips = user_skips.extend(skips)`.
    user_skips = Crawl.SKIPS + skips

    results = Crawl(base_catalog + "/catalog.xml",
                    select=select,
                    skip=user_skips,
                    workers=workers).datasets

    urls = [
        service["url"] for dataset in results
        for service in dataset.services
        if service["service"].lower() == "httpserver"
    ]
    return urls
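# A hedged usage sketch for thredds_find_glob; the catalog URL and patterns
# are borrowed from the thredds-to-tar CLI example further down in this
# collection and may need adjusting for other servers:
#
#   yaml_urls = thredds_find_glob(
#       "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29",
#       skips=[".*NBAR.*", ".*SUPPLEMENTARY.*"],
#       select=[".*ARD-METADATA.yaml"],
#       workers=8)
#   print(len(yaml_urls), "YAML files found")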
def get_metadata(thredds_servers, save_dir, skips=Crawl.SKIPS, select=None,
                 debug=True, logger_name=None):
    logger = logging.getLogger(logger_name)
    tsi = thredds_servers.items()
    local_metadata_paths = []
    for subfolder, thredds_url in tsi:
        logger.info("Crawling {0} ({1})".format(subfolder, thredds_url))
        crawler = Crawl(thredds_url, skip=skips, select=select, debug=debug)

        filefolder = os.path.join(save_dir, subfolder)
        if not os.path.exists(filefolder):
            os.makedirs(filefolder)

        isos = [(d.id, s.get("url")) for d in crawler.datasets
                for s in d.services if s.get("service").lower() == "iso"]
        for iso in isos:
            filename = '{0}{1}'.format(iso[0].replace('/', '_'), '.iso.xml')
            filepath = os.path.join(filefolder, filename)
            try:
                urlretrieve(iso[1], filepath)
            except BaseException:
                logger.exception("Error!")
            else:
                local_metadata_paths.append(filepath)
    return local_metadata_paths
def find_urls(base, select, startdate, enddate):
    url = os.path.join(base, 'catalog.xml')
    print("Crawling: {}".format(url))
    skips = Crawl.SKIPS + [
        ".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"
    ]
    u = urlparse(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse(url.replace(".html", ".xml"))
    url = u.geturl()
    urls = []

    try:
        c = Crawl(url, select=[".*dlist"])

        # Crawl the catalogRefs:
        for dataset in c.datasets:
            try:
                # get the mission directory name and extract the start and ending dates
                dlist = os.path.basename(dataset.id)
                mission_dir_name = dlist.split('.')[0]
                dts = mission_dir_name.split('_')
                dir_start = datetime.strptime(dts[0], '%Y%m%d')
                dir_end = datetime.strptime(dts[1], '%Y%m%d')

                # if within a valid range, grab the valid urls
                if dir_start >= startdate and dir_end <= enddate:
                    catalog = '{}_{}/catalog.xml'.format(
                        dir_start.strftime('%Y%m%d'),
                        dir_end.strftime('%Y%m%d'))
                    c = Crawl(os.path.join(base, catalog), select=[select],
                              skip=skips)
                    d = [
                        s.get("url") for d in c.datasets for s in d.services
                        if s.get("service").lower() == "opendap"
                    ]
                    for url in d:
                        urls.append(url)
            except Exception as ex:
                print("Error reading mission directory name {}".format(ex))
    except BaseException:
        print("Skipping {} (error parsing the XML)".format(url))

    return urls
def drifters(drifter_id, projection, resolution, extent):
    buoy_id = []
    lat = []
    lon = []
    status = []

    if drifter_id in ['all', 'active', 'inactive', 'not responding']:
        c = Crawl(app.config['DRIFTER_CATALOG_URL'], select=[".*.nc$"])
        drifters = [d.name[:-3] for d in c.datasets]
    else:
        drifters = drifter_id.split(",")

    for d in drifters:
        with Dataset(app.config["DRIFTER_URL"] % d, 'r') as ds:
            if drifter_id == 'active' and ds.status != 'normal':
                continue
            elif drifter_id == 'inactive' and ds.status != 'inactive':
                continue
            elif drifter_id == 'not responding' and \
                    ds.status != 'not responding':
                continue
            buoy_id.append(ds.buoyid)
            lat.append(ds['latitude'][:])
            lon.append(ds['longitude'][:])
            status.append(ds.status)

    proj = pyproj.Proj(init=projection)
    view = _get_view(extent)

    res = []
    for i, bid in enumerate(buoy_id):
        x, y = proj(lon[i], lat[i])
        ls = LineString(zip(y, x))
        if view.envelope.intersects(ls):
            path = np.array(ls.simplify(resolution * 1.5).coords)
            path = np.array(
                proj(path[:, 1], path[:, 0], inverse=True)).transpose()
            res.append({
                'type': "Feature",
                'geometry': {
                    'type': "LineString",
                    'coordinates': path.astype(float).tolist()
                },
                'properties': {
                    'name': bid,
                    'status': status[i],
                    'type': "drifter",
                    'resolution': resolution,
                }
            })

    result = {
        'type': "FeatureCollection",
        'features': res,
    }

    return result
def get(self):
    catalog_url = self.get_argument('catalog_url')
    self.set_header('Content-Type', 'application/json')
    c = ThreddsConfig(config=self.config)
    crawl = Crawl(catalog_url, workers=c.workers)
    datasets = sorted([flatten_dataset(d) for d in crawl.datasets],
                      key=lambda d: d['id'])
    self.finish(json.dumps(datasets))
def test_iso_links(self):
    c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html")
    isos = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "iso"
    ]
    assert "?dataset=" in isos[0]
    assert "&catalog=" in isos[0]
def find_urls(base, search_str):
    INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
    url = os.path.join(base, 'catalog.xml')
    print "Crawling: %s" % url
    skips = Crawl.SKIPS + [
        ".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"
    ]
    u = urlparse.urlsplit(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse.urlsplit(url.replace(".html", ".xml"))
    url = u.geturl()
    urls = []

    # Get an etree object
    try:
        r = requests.get(url)
        tree = etree.XML(r.text.encode('utf-8'))

        # Crawl the catalogRefs:
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):
            try:
                # get the mission directory name and extract the start and ending dates
                mission_dir_name = ref.attrib[
                    '{http://www.w3.org/1999/xlink}title']
                dts = mission_dir_name.split('_')
                dir_start = datetime.datetime.strptime(dts[0], '%Y%m%d')
                dir_end = datetime.datetime.strptime(dts[1], '%Y%m%d')

                # if within a valid range, grab the valid urls
                if dir_start >= startdate and dir_end <= enddate:
                    print 'Found mission directory ' + dts[0]
                    print 'Searching if within range %s and %s %s %s' % (
                        startdate, enddate, dir_start, dir_end)
                    catalog = ref.attrib['{http://www.w3.org/1999/xlink}href']
                    c = Crawl(os.path.join(base, catalog),
                              select=[search_str], skip=skips)
                    d = [
                        s.get("url") for d in c.datasets for s in d.services
                        if s.get("service").lower() == "opendap"
                    ]
                    for url in d:
                        urls.append(url)
            except Exception as ex:
                print "Error reading mission directory name %s" % ex
    except BaseException:
        print "Skipping %s (error parsing the XML)" % url

    if not urls:
        raise FileNotFound('No urls matching "{}" found in {}'.format(
            search_str, os.path.join(base, 'catalog.html')))

    return urls
def parse_opendap(crawl_path):
    if verbose > 0:
        print('crawl', crawl_path)

    skips = Crawl.SKIPS
    #skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
    #skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
    #skips = Crawl.SKIPS + [".*regridded"]

    #crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
    #crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

    c = Crawl(crawl_path, select=['.*'], skip=skips)
    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])

    # print(c.datasets)

    # service can be httpService or dapService
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"
    ]  # httpserver or opendap
    if verbose > 1:
        for url in urls:
            print(url)

    for d in c.datasets:
        if verbose > 2:
            print('datasets', d)
        for s in d.services:
            if verbose > 2:
                print('services', s)
            if s.get("service").lower() == 'opendap':
                url = s.get("url")
                if verbose > 1:
                    print('url', s.get("url"))
                #get_nc_dataset(s.get("url"), 'TEMP')
                nc = None
                try:
                    nc = Dataset(url, mode="r")
                    postgres_insert(nc, url=url)
                except Exception as e:
                    print(url, e)
                if nc:
                    nc.close()
def web_crawler_mooring(beginDT, endDT, location='shelf', method='telemetered'):
    USERNAME = '******'
    TOKEN = 'TEMP-TOKEN-A3STSZK6P6ULST'

    # Sensor Inventory
    SENSOR_BASE_URL = 'https://ooinet.oceanobservatories.org/api/m2m/12576/sensor/inv/'

    # Instrument Information
    if location == 'shelf':
        site = 'CE02SHSM'
    elif location == 'offshore':
        site = 'CE04OSSM'
    node = 'SBD11'
    instrument = '06-METBKA000'
    if method == 'telemetered':
        stream = 'metbk_a_dcl_instrument'
    elif method == 'recovered_host':
        stream = 'metbk_a_dcl_instrument_recovered'

    data_request_url = '/'.join(
        (SENSOR_BASE_URL, site, node, instrument, method, stream))
    params = {
        'beginDT': beginDT,
        'endDT': endDT,
        'format': 'application/csv',
        'include_provenance': 'false',
        'include_annotations': 'false',
    }
    r = requests.get(data_request_url, params=params, auth=(USERNAME, TOKEN))
    dataraw = r.json()
    print(method)
    print(dataraw)

    # This is the part that checks to ensure the link is ready to go.
    check_complete = dataraw['allURLs'][1] + '/status.txt'
    for i in range(10000):
        r = requests.get(check_complete)
        if r.status_code == requests.codes.ok:
            print('request completed')
            break
        else:
            time.sleep(.5)

    # This part then finds and downloads the requested csv file.
    url = dataraw['allURLs'][0]
    c = Crawl(url, select=['.*\.csv$'], debug=False)
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"
    ]
    urlsrev = [url for url in reversed(urls)]
    return urlsrev
def crawl(url):
    c = Crawl(url, select=['.*meta.nc'], skip=None, debug=None)
    locsite = []
    for jr in c.datasets:
        locsite.append(str(jr.id))
    rdro = [
        'http://tds0.ifremer.fr/thredds/catalog/' +
        '/'.join(jj.split('/')[:-1]) + '/catalog.html'
        for jj in locsite
    ]
    # print (rdro)
    #print ('the new data is', rdro[0:1])
    gd = 0
    sitenamerd = []
    added = 0
    #for kk in rdro[0:4]:
    for kk in rdro:
        urd = rdro[gd]
        crr = Crawl(urd,
                    select=None,
                    skip=['.*meta.nc', '.*Rtraj.nc', '.*tech.nc'],
                    debug=None)
        gd += 1
        for pp in crr.datasets:
            #print (pp.id)
            sitenamerd.append(
                ['http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)])
            uricorrect = 'http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)
            ds0, cr0 = try_add_argo_float(uricorrect)
            if cr0:
                print('Added %s, no. %d,%d' % (url, added, len(crr.datasets)))
                added += 1
    print('Added', added)
    # import ipdb
    # ipdb.set_trace()
    # return sitenamerd
    return added
def main(url, out):
    now = dt.datetime.now().strftime('%Y.%m.%dT%H.%M.00')
    C = Crawl(url, select=[".*ncml"])
    tds = 'https://opendap.oceanobservatories.org/thredds/dodsC/'
    cf.create_dir(out)
    fopen = open(out + '/' + now + '-nc-links.txt', 'w')

    for dataset in C.datasets:
        fopen.write(tds + dataset.id + '\n')
    fopen.close()
def list_class4_files(class4_catalog_url):
    # Taken from Ocean Navigator source file "misc.py".
    c = Crawl(class4_catalog_url, select=[".*_GIOPS_.*.nc$"])

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({'name': date.strftime("%Y-%m-%d"), 'id': value})

    return result
def list_class4_models(class4_id):
    select = ["(.*/)?%s.*_profile.nc$" % class4_id[:16]]
    c = Crawl(current_app.config["CLASS4_CATALOG_URL"], select=select)

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        model = value.split("_")[2]
        if model != "GIOPS":
            result.append({'value': value.split("_")[2], 'id': value})

    return result
def test_coawst_parse(self):
    selects = ['.*\.ncd']
    skips = Crawl.SKIPS + ['.*MATLAB.*']
    c = Crawl(
        'http://gamone.whoi.edu/thredds/catalog/coawst_4/use/fmrc/catalog.xml',
        select=selects,
        skip=skips)
    assert len(c.datasets) > 0

    isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services
            if s.get("service").lower() == "iso"]
    assert len(isos) > 0
def list_class4_files_slowly():
    # This function has poor performance; only use as a fallback.
    c = Crawl(current_app.config["CLASS4_CATALOG_URL"],
              select=[".*_GIOPS_.*.nc$"],
              workers=16)

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({'name': date.strftime("%Y-%m-%d"), 'id': value})

    return result
def list_class4_files():
    c = Crawl(app.config["CLASS4_CATALOG_URL"], select=[".*_GIOPS_.*.nc$"])

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({
            'name': date.strftime("%Y-%m-%d"),
            'id': value
        })

    return result
def get_thredds_waf(url, destination_path, suffix=None):
    '''
    Scrapes the available ISO files at the specified THREDDS instance. The
    URL must point to the catalog.xml
    '''
    c = Crawl(url)
    datasets = c.datasets
    suffix = suffix or ''
    for dataset in datasets:
        services = {row['name']: row for row in dataset.services}
        iso_url = services['iso']['url']
        if iso_url:
            get_iso_doc(iso_url, destination_path,
                        dataset.id + suffix + '.xml')
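# A minimal, hypothetical call to get_thredds_waf; the catalog URL and output
# directory are placeholders, and get_iso_doc is assumed to be defined in the
# same module as the function above:
#
#   get_thredds_waf("http://thredds.example.org/thredds/catalog.xml",
#                   "/tmp/iso_waf", suffix="_v1")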
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

    \b
    Download files in directory that match `*yaml` and store them as a tar
     > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/" -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*' -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz
    """
    user_skips = Crawl.SKIPS
    for skip in skips:
        user_skips = user_skips + [skip]

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets
    print("Found {0} metadata files".format(str(len(results))))

    # construct (guess) the fileServer url based on
    # https://www.unidata.ucar.edu/software/thredds/v4.6/tds/reference/Services.html#HTTP
    parsed_uri = urlparse(thredds_catalogue)
    split_path = parsed_uri.path.split('/')
    fileserver_path = parsed_uri.scheme + '://' + parsed_uri.netloc + '/'.join(
        split_path[:(split_path.index('thredds') + 1)] + ['fileServer', ''])
    parsed_uri = urlparse(fileserver_path)

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download, parsed_uri=parsed_uri), results)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
def crawl(url, **options):
    validate_uri(url)

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, skip=skips, debug=True)
    added = 0
    for ds in c.datasets:
        url = [s.get('url') for s in ds.services
               if s.get('service').lower() == 'opendap'][0]
        metno_obs_stat, cr = MetObsStation.objects.get_or_create(url)
        if cr:
            added += 1
            print('Added %s, no. %d/%d' % (url, added, len(c.datasets)))
    return added
def __init__(self, catalog_url, out_dir, log_file=None, select=None,
             skip=None, clean=True):
    self.logger = logging.getLogger('thredds_crawler')
    self.logger.setLevel(logging.DEBUG)
    self.logger.handlers = []
    self.__add_stream_logger()
    if log_file is not None:
        self.__add_file_logger(log_file)

    if skip is None:
        skip = Crawl.SKIPS
    else:
        skip.extend(Crawl.SKIPS)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    found_isos = []
    catalog = Crawl(catalog_url, select=select, skip=skip)
    isos = [(d.id, s.get("url")) for d in catalog.datasets
            for s in d.services if s.get("service").lower() == "iso"]
    for iso in isos:
        try:
            filename = iso[0].replace("/", "_") + ".iso.xml"
            found_isos.append(filename)
            filepath = os.path.join(out_dir, filename)
            self.logger.info("Downloading/Saving %s" % filepath)
            r = requests.get(iso[1], stream=True)
            if r.ok:
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content():
                        if chunk:
                            f.write(chunk)
            else:
                self.logger.info("Got a non-200 status code (%s) from %s" %
                                 (r.status_code, iso[1]))
        except KeyboardInterrupt:
            self.logger.info("Caught interrupt, exiting")
            sys.exit(0)
        except BaseException:
            self.logger.exception("Error!")

    if clean:
        self.__clean_not_found_files(out_dir, found_isos)
def test_unidata_parse(self):
    selects = [".*Best.*"]
    skips = Crawl.SKIPS + [
        ".*grib2", ".*grib1", ".*GrbF.*", ".*ncx2", "Radar Data",
        "Station Data", "Point Feature Collections", "Satellite Data",
        "Unidata NEXRAD Composites \(GINI\)", "Unidata case studies",
        ".*Reflectivity-[0-9]{8}"
    ]
    c = Crawl('http://thredds.ucar.edu/thredds/catalog.xml',
              select=selects,
              skip=skips)
    assert len(c.datasets) > 0

    isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services
            if s.get("service").lower() == "iso"]
    assert len(isos) > 0
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

    \b
    Download files in directory that match `*yaml` and store them as a tar
     > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/" -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*' -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz
    """
    user_skips = Crawl.SKIPS
    for skip in skips:
        user_skips = user_skips + [skip]

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets
    print("Found {0} metadata files".format(str(len(results))))

    urls = [
        service['url'] for dataset in results
        for service in dataset.services
        if service['service'].lower() == 'httpserver'
    ]

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download), urls)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
if args.post:
    token = os.environ['SLACKTOKEN']
    slack = Slacker(token)

# Assume that the database has already been created with description and
# terrain information, so use minimal arguments in constructor
cl = CANONLoader(args.database, args.campaign)
cl.dbAlias = args.database
cl.campaignName = args.campaign

# Get directory list from sites
s = args.inUrl.rsplit('/', 1)
files = s[1]
url = s[0]
logger.info("Crawling %s for %s files" % (url, files))
c = Crawl(os.path.join(url, 'catalog.xml'), select=[files], debug=False)

for d in c.datasets:
    logger.debug('Found %s' % d.id)

urls = [s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "opendap"]

pw = lrauvNc4ToNetcdf.InterpolatorWriter()

# If parameter names contain any group forward slash '/' delimiters,
# replace them with underscores. This is because pydap automatically renames
# slashes as underscores and needs to reference the parameter correctly in
# the DAPloader.
parm_list = []
plot_group = []
parm_process = []
coord = {}