from bs4 import BeautifulSoup
from lxml import etree
from lxml.html.soupparser import fromstring
from thredds_crawler.crawl import Crawl
from django.conf import settings
# Assumed imports for the names used below (TDSCatalog from siphon,
# Dataset/num2date from netCDF4); the originals were not shown.
from siphon.catalog import TDSCatalog
from netCDF4 import Dataset, num2date
import os

#url = "http://*****:*****@'), "catalogRefs/CatalogTELEDEM.html")
url = "http://localhost:8080/thredds/CatalogTELEDM.html"
cat = TDSCatalog('http://localhost:8080/thredds/CatalogTELEDM.html')
#context = ssl._create_unverified_context()
#catT = TDSCatalog(url)
cat = Crawl(url)
datasets = [i.id for i in cat.datasets]

root = "/home/mers/Bureau/teledm/donnees/"

# Build a nested dict mirroring the catalog's directory structure
catalog = {}
for item in datasets:
    p = catalog
    for x in item.split('/'):
        p = p.setdefault(x, {})

ds = {}
for d in datasets:
    dp = d.split('/')
    nc = Dataset(root + d, 'r')
    dt = nc.variables['time']
    dates = num2date(dt[:], dt.units)
    dset = {}
#path='ANMN/NRS/NRSKAI'
path = 'ABOS/SOTS'
#path='ABOS/SOTS/2016'
if len(sys.argv) > 1:
    path = sys.argv[1]

#skips = Crawl.SKIPS + [".*FV00"]
skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME",
                       ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
#skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
#skips = Crawl.SKIPS + [".*regridded"]

crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
#crawl_path='http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

c = Crawl(crawl_path, select=['.*SAZ.*2020'], skip=skips)
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])
# print(c.datasets)

# service can be httpService or dapService
urls = [s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"]  # httpserver or opendap

for url in urls:
    print(url)
def loadStationData(self, stride=1):
    '''Crawl the OceanSITES Mooring data TDS for OPeNDAP links and load into STOQS
    '''
    urls = []
    strides = {}
    for dataSet in self.dataSets:
        c = Crawl(dataSet[0], select=dataSet[1], debug=self.args.verbose)
        dsUrls = [s.get("url") for d in c.datasets for s in d.services
                  if s.get("service").lower() == "opendap"]
        for dsu in dsUrls:
            strides[dsu] = dataSet[2]
        urls += dsUrls

    # First pass through urls matching OceanSITES pattern to collect platform names to get colors
    # Use OceanSITES naming convention for platform "OS_<platformName>_xxx_R|D_<type>.nc"
    pNames = set()
    platformTypeNames = set()
    for url in urls:
        platformTypeNames.add(url.split('/')[-2])
        if url.find('MOVE1_') != -1:
            # Special hack for MOVE PlatformCode
            newUrl = url.replace('MOVE1_', 'MOVE1-')
            pNames.add(newUrl.split('/')[-1].split('.')[0].split('_')[1])
        else:
            pNames.add(url.split('/')[-1].split('.')[0].split('_')[1])

    # Assign colors by platformTypeName
    pColors = {}
    for ptName, color in zip(sorted(platformTypeNames), self.getColor(len(platformTypeNames))):
        pColors[ptName] = color

    # Now loop again, this time loading the data
    for url in urls:
        logger.info("Executing runMooringLoader with url = %s", url)
        if self.args.optimal_stride and strides[url]:
            stride = strides[url]
        elif self.args.test:
            stride = strides[url] * 2

        fixedUrl = url
        if url.find('OS_IMOS-EAC_EAC') != -1:
            # Special fix to get platform name
            fixedUrl = url.replace('OS_IMOS-EAC_EAC', 'OS_IMOS-EAC-EAC')

        if stride > 1:
            aName = fixedUrl.split('/')[-1].split('.')[0] + '(stride=%d)' % stride
        else:
            aName = fixedUrl.split('/')[-1].split('.')[0]

        pName = aName.split('_')[1]
        ptName = url.split('/')[-2]
        logger.debug("Instantiating Mooring_Loader for url = %s", url)
        try:
            ml = Mooring_Loader(
                url=url,
                campaignName=self.campaignName,
                campaignDescription=self.campaignDescription,
                dbAlias=self.dbAlias,
                activityName=aName,
                activitytypeName='Mooring Deployment',
                platformName=pName,
                platformColor=pColors[ptName],
                platformTypeName=ptName,
                stride=stride,
                startDatetime=self.startDatetime,
                dataStartDatetime=None,
                endDatetime=self.endDatetime)
        except UnicodeDecodeError as e:
            logger.warn(str(e))
            logger.warn(f'Cannot read data from {url}')
            continue

        # Special fixes for non-standard metadata and for files that don't contain the standard TEMP and PSAL parameters
        if url.find('MBARI-') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('OS_PAPA_2009PA003_D_CTD_10min') != -1:
            ml.include_names = ['TEMP']
        elif url.find('OS_PAPA_2009PA003_D_PSAL_1hr') != -1:
            ml.include_names = ['PSAL']
        elif url.find('OS_SOTS_SAZ-15-2012_D_microcat-4422m') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            # DEPTH_CN_PR_PS_TE coordinate missing standard_name attribute
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH_CN_PR_PS_TE'}
            # Only global attribute is 'cdm_data_type: Time-series'; monkey-patch the method
            Mooring_Loader.getFeatureType = lambda self: 'timeseries'
        elif url.find('D_MICROCAT-PART') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('D_RDI-WORKHORSE-ADCP-') != -1:
            ml.include_names = ['UCUR', 'VCUR', 'WCUR']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'HEIGHT_ABOVE_SENSOR'}
            # Metadata in file states 'timeseries', but it's really something different; monkey-patch the getFeatureType() method
            Mooring_Loader.getFeatureType = lambda self: 'trajectoryprofile'
        elif url.find('TVSM_dy.nc') != -1:
            ##ml.include_names = ['UCUR', 'VCUR', 'TEMP', 'PSAL', 'CSPD', 'CDIR']
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ('UCUR', 'VCUR', 'CSPD', 'CDIR'):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPCUR'}
            for v in ('TEMP', ):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
            for v in ('PSAL', ):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPPSAL'}
            # These PIRATA daily files are timeSeriesProfile, which has no featureType attribute
            Mooring_Loader.getFeatureType = lambda self: 'timeseriesprofile'
        elif url.find('CCE') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('NOG') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            Mooring_Loader.getFeatureType = lambda self: 'timeseries'
        elif url.find('Stratus') != -1:
            # Variable attribute coordinates: TIME, DEPTH, LATITUDE, LONGITUDE; it should not contain commas
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        else:
            ml.include_names = ['TEMP', 'PSAL']

        try:
            (nMP, path, parmCountHash) = ml.process_data()
            logger.debug("Loaded Activity with name = %s", aName)
        except NoValidData as e:
            logger.warning(e)
if args.post:
    token = os.environ['SLACKTOKEN']
    slack = Slacker(token)

# Assume that the database has already been created with description and terrain
# information, so use minimal arguments in the constructor
lm = CANONLoader(args.database, args.campaign)
lm.dbAlias = args.database
lm.campaignName = args.campaign

# Get directory list from sites
s = args.inUrl.rsplit('/', 1)
files = s[1]
url = s[0]
logger.info("Crawling %s for %s files", url, files)
c = Crawl(os.path.join(url, 'catalog.xml'), select=[files], debug=False)

for d in c.datasets:
    logger.debug('Found %s', d.id)

urls = [s2.get("url") for d in c.datasets for s2 in d.services
        if s2.get("service").lower() == "opendap"]

pw = lrauvNc4ToNetcdf.InterpolatorWriter()

# If parameter names contain any group forward slash '/' delimiters,
# replace them with underscores. This is because pydap automatically renames
# slashes as underscores, and the parameter must be referenced correctly in the DAPloader.
parm_list = []
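# Illustrative sketch of the slash-to-underscore renaming described in the comment
# above. The parameter names here are hypothetical (the real list comes from the
# script's arguments); only the replace() pattern follows what the comment describes.
example_parm_names = ['science/temperature', 'science/salinity', 'depth']
parm_list = [p.replace('/', '_') for p in example_parm_names]
# parm_list is now ['science_temperature', 'science_salinity', 'depth']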
def download(folder, projects, filesubset, since):
    # Use thredds_crawler to find DAP endpoints of the CF-1.6 data.
    skips = Crawl.SKIPS
    if projects:
        # Build a new list rather than extending Crawl.SKIPS in place
        skips = skips + [r'^(?!{}).*^(?!.*\.(cdf|nc)).*$'.format('|'.join(projects))]

    catalog = 'http://geoport.whoi.edu/thredds/catalog/usgs/data2/emontgomery/stellwagen/CF-1.6/catalog.html'

    try:
        datasets = Crawl(catalog, select=[r'.*\.(cdf|nc)'], skip=skips, after=since).datasets
        logger.info("Found {0} TOTAL datasets!".format(len(datasets)))
    except KeyboardInterrupt:
        logger.info("Breaking out of crawling loop.")
        datasets = []

    try:
        os.makedirs(folder)
    except OSError:
        pass

    # Save datasets to download directory
    saved_files = []
    for num, d in enumerate(datasets):

        if filesubset and d.name.lower() not in filesubset:
            continue

        try:
            http_url = next(s["url"] for s in d.services if s["service"].lower() == "httpserver")
            project_name = http_url.split("/")[-2]
        except StopIteration:
            logger.error("No HTTPServer endpoint found, skipping")
            continue

        # Make download folder
        save_file = os.path.join(folder, project_name, d.name)
        if not os.path.isdir(os.path.dirname(save_file)):
            os.makedirs(os.path.dirname(save_file))

        logger.info("Downloading {0}".format(http_url))
        try:
            with open(save_file, "wb") as f:
                r = requests.get(http_url, stream=True)
                if not r.ok:
                    logger.error("Could not download '{!s}' from '{!s}', skipping".format(d.name, http_url))
                    break
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except KeyboardInterrupt:
            logger.info("Breaking out of download loop.")
            raise
        except BaseException:
            logger.error("Could not download... error with HTTP endpoint. Skipping.")
            continue

        # Try to open the file; if it fails, the write failed.
        nc = None
        try:
            nc = netCDF4.Dataset(save_file)
        except BaseException:
            os.remove(save_file)
        else:
            logger.info("{!s} saved ({!s}/{!s})".format(d.name, num + 1, len(datasets)))
            saved_files.append(save_file)
        finally:
            if nc is not None:
                nc_close(nc)

    return saved_files
def main(url, save_dir):
    if type(url) is str:
        if url.endswith('.html'):
            url = url.replace('.html', '.xml')
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*\.nc$"], debug=False)
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
            splitter = url.split('/')[-2].split('-')
        elif url.endswith('.xml'):
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*\.nc$"], debug=False)
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
            splitter = url.split('/')[-2].split('-')
        elif url.endswith('.nc') or url.endswith('.ncml'):
            datasets = [url]
            splitter = url.split('/')[-2].split('-')
        else:
            print('Unrecognized input. Input must be a string of the file location(s) or list of file(s)')
    else:
        print('Dataset must be in a string.')

    data = OrderedDict(deployments=OrderedDict())

    for dataset in datasets:
        filename = os.path.basename(dataset)
        if 'ENG000000' not in filename:  # script will not analyze glider ENG data files
            logging.info('Processing {}'.format(str(dataset)))
            try:
                print('Opening file: {}'.format(dataset))
                with xr.open_dataset(dataset, mask_and_scale=False) as ds:
                    ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
                    deployment = np.unique(ds['deployment'].data)[0]
                    qc_data = request_qc_json(ref_des)  # grab data from the qc database
                    ref_des_dict = get_parameter_list(qc_data)
                    deploy_info = get_deployment_information(qc_data, deployment)
                    if deploy_info is None:
                        print('info from deployment ' + str(deployment) + ' does not match data')
                        continue

                    data_start = ds.time_coverage_start + 'Z'
                    data_end = ds.time_coverage_end + 'Z'

                    # Deployment Variables
                    deploy_start = str(deploy_info['start_date'] + 'Z')
                    if deploy_info['stop_date']:
                        deploy_stop = str(deploy_info['stop_date'] + 'Z')
                    else:
                        deploy_stop = str(deploy_info['stop_date'])
                    deploy_lon = deploy_info['longitude']
                    deploy_lat = deploy_info['latitude']

                    # Add reference designator to dictionary
                    try:
                        data['ref_des']
                    except KeyError:
                        data['ref_des'] = ref_des

                    deployment = 'D0000{}'.format(deployment)
                    deployments = data['deployments'].keys()

                    # Add deployment to dictionary and initialize stream sub dictionary
                    if not deployment in deployments:
                        data['deployments'][deployment] = OrderedDict(
                            start=deploy_start,
                            end=deploy_stop,
                            lon=deploy_lon,
                            lat=deploy_lat,
                            streams=OrderedDict(),
                            data_times=dict(start=[], end=[]))

                    # Add data start and stop times to a data_times array. When the files are
                    # all processed, it checks data vs deployment times
                    if ds.stream == splitter[-1]:
                        data['deployments'][deployment]['data_times']['start'].append(data_start)
                        data['deployments'][deployment]['data_times']['end'].append(data_end)

                    streams = data['deployments'][deployment]['streams'].keys()

                    # Add stream to subdictionary inside deployment
                    if not ds.stream in streams:
                        data['deployments'][deployment]['streams'][ds.stream] = OrderedDict(files=OrderedDict())

                    qc_df = parse_qc(ds)
                    qc_vars = [x for x in qc_df.keys() if not 'test' in x]
                    qc_df = qc_df.reset_index()
                    variables = ds.data_vars.keys()
                    variables = eliminate_common_variables(variables)
                    variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them

                    # Gap test. Get a list of gaps
                    gap_list = test_gaps(qc_df)

                    # Deployment Distance
                    data_lat = np.unique(ds['lat'])[0]
                    data_lon = np.unique(ds['lon'])[0]
                    dist_calc = distance((deploy_lat, deploy_lon), (data_lat, data_lon))

                    # Unique times
                    time = ds['time']
                    len_time = time.__len__()
                    len_time_unique = np.unique(time).__len__()
                    if len_time == len_time_unique:
                        time_test = True
                    else:
                        time_test = False

                    db_list = ref_des_dict[ds.stream]
                    [_, unmatch1] = compare_lists(db_list, variables)
                    [_, unmatch2] = compare_lists(variables, db_list)

                    filenames = data['deployments'][deployment]['streams'][ds.stream]['files']

                    if not filename in filenames:
                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename] = OrderedDict(
                            data_start=data_start,
                            data_end=data_end,
                            time_gaps=gap_list,
                            lon=data_lon,
                            lat=data_lat,
                            distance_from_deploy_km=dist_calc,
                            unique_times=str(time_test),
                            variables=OrderedDict(),
                            vars_not_in_file=unmatch1,
                            vars_not_in_db=unmatch2)
                    else:
                        print(filename + ' already in dictionary. Skipping')

                    for v in variables:
                        # print v
                        # Availability test
                        if v in db_list:
                            available = True
                        else:
                            available = False

                        if ds[v].dtype.kind == 'S' \
                                or ds[v].dtype == np.dtype('datetime64[ns]') \
                                or 'time' in v:
                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available))
                            continue
                        else:
                            var_data = ds[v].data

                        # NaN test. Make sure the parameter is not all NaNs
                        nan_test = np.all(np.isnan(var_data))

                        if not nan_test or available is False:
                            # Global range test
                            [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
                            try:
                                ind = reject_outliers(var_data, 3)
                                min = float(np.nanmin(var_data[ind]))
                                max = float(np.nanmax(var_data[ind]))
                            except (TypeError, ValueError):
                                min = None
                                max = None

                            # Fill Value test
                            try:
                                fill_value = float(ds[v]._FillValue)
                                fill_test = np.any(var_data == ds[v]._FillValue)
                            except AttributeError:
                                fill_value = None
                                fill_test = None

                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available),
                                    all_nans=str(nan_test),
                                    data_min=min,
                                    data_max=max,
                                    global_min=g_min,
                                    global_max=g_max,
                                    fill_test=str(fill_test),
                                    fill_value=fill_value)

                            if v in qc_vars:
                                temp_list = []
                                tests = ['global_range_test', 'dataqc_stuckvaluetest', 'dataqc_spiketest']
                                for test in tests:
                                    var = '{}_{}'.format(v, test)
                                    group_var = 'group_{}'.format(var)
                                    try:
                                        qc_df[group_var] = qc_df[var].diff().cumsum().fillna(0)
                                    except KeyError as e:
                                        # logging.warn('Error: P')
                                        temp_list.append(['Did not run'])
                                        continue
                                    tdf = qc_df.groupby([group_var, var])['time'].agg(['first', 'last'])
                                    tdf = tdf.reset_index().drop([group_var], axis=1)
                                    tdf = tdf.loc[tdf[var] == False].drop(var, axis=1)
                                    tdf['first'] = tdf['first'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
                                    tdf['last'] = tdf['last'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
                                    if tdf.empty:
                                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v][test] = []
                                    else:
                                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v][test] = map(list, tdf.values)
                            else:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['global_range_test'] = None
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['dataqc_stuckvaluetest'] = None
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['dataqc_spiketest'] = None
                        else:
                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available),
                                    all_nans=str(nan_test))
            except Exception as e:
                logging.warn('Error: Processing failed due to {}.'.format(str(e)))
                raise
        else:
            pass

    deployments = data['deployments'].keys()
    for d in deployments:
        data['deployments'][d]['data_times']['start'].sort(key=natural_keys)
        data['deployments'][d]['data_times']['end'].sort(key=natural_keys)
        data['deployments'][d]['data_times']['start'] = data['deployments'][d]['data_times']['start'][0]
        data['deployments'][d]['data_times']['end'] = data['deployments'][d]['data_times']['end'][-1]

    #make_dir(save_dir)
    json_dir = os.path.join(save_dir, 'json_output')
    make_dir(json_dir)

    save_file = os.path.join(json_dir, '{}-{}-{}-{}__{}-{}__requested_{}.json'.format(
        splitter[1], splitter[2], splitter[3], splitter[4], splitter[5], splitter[6], splitter[0]))
    with open(save_file, 'w') as outfile:
        json.dump(data, outfile)

    return save_file
def main(url):
    tds_url = 'http://opendap.oceanobservatories.org/thredds/dodsC'
    c = Crawl(url, select=[".*ncml"])
    data = []

    for n in c.datasets:
        ncml_url = os.path.join(tds_url, n.id)
        ds = xr.open_dataset(ncml_url, mask_and_scale=False)
        deployment = np.unique(ds['deployment'].data)[0]
        variables = ds.data_vars.keys()
        variables = eliminate_common_variables(variables)
        variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them
        ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
        ref_des_dict = get_parameter_list(ref_des)

        # Gap test. Get a list of gaps
        gap_list = test_gaps(ds['time'].data)

        for v in variables:
            var_data = ds[v].data
            print(v)

            # Availability test
            if v in ref_des_dict[ds.stream]:
                available = True
            else:
                available = False

            # Global range test
            [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
            try:
                min = np.nanmin(var_data)
                max = np.nanmax(var_data)
            except TypeError:
                min = 'n/a'
                max = 'n/a'

            if g_min is not None:
                if min > g_min:
                    if max < g_max:
                        gr_result = True
                    else:
                        gr_result = False
                else:
                    gr_result = False
            else:
                gr_result = 'None'

            # Fill Value test
            fill_test = np.all(var_data == ds[v]._FillValue)

            try:
                # NaN test. Make sure the parameter is not all NaNs
                nan_test = np.all(np.isnan(var_data))
            except TypeError:
                nan_test = 'None'

            data.append((ref_des, ds.stream, deployment, v, available, gr_result,
                         [g_min, min], [g_max, max], fill_test, ds[v]._FillValue,
                         nan_test, gap_list))

    df = pd.DataFrame(data, columns=['ref_des', 'stream', 'deployment', 'variable',
                                     'availability', 'global_range_test',
                                     'min[global, data]', 'max[global, data]',
                                     'fill_test', 'fill_value', 'not_nan', 'gaps'])
    df.to_csv('/Users/michaesm/Documents/test.csv', index=False)
if len(sys.argv) > 1:
    path = sys.argv[1]

#skips = Crawl.SKIPS + [".*FV00"]
skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME",
                       ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
#skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
#skips = Crawl.SKIPS + [".*regridded"]

crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
#crawl_path='http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

c = Crawl(crawl_path, select=['.*FV01'], skip=skips)
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])
# print(c.datasets)

# service can be httpService or dapService
urls = [s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"]  # httpserver or opendap

for url in urls:
    print(url)
def main(url, save_dir):
    if type(url) is str:
        if url.endswith('.html'):
            url = url.replace('.html', '.xml')
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*ncml"])
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
        elif url.endswith('.xml'):
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*ncml"])
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
        elif url.endswith('.nc') or url.endswith('.ncml'):
            datasets = [url]
        elif os.path.exists(url):
            datasets = glob.glob(url + '/*.nc')
        else:
            print('Unrecognized input. Input must be a string of the file location(s) or list of file(s)')
    elif type(url) is list:
        datasets = url

    data = []

    for dataset in datasets:
        logging.info('Processing {}'.format(str(dataset)))
        try:
            print('Opening file: {}'.format(dataset))
            with xr.open_dataset(dataset, mask_and_scale=False) as ds:
                qc_df = parse_qc(ds)
                qc_vars = [x for x in qc_df.keys() if not 'test' in x]
                qc_df = qc_df.reset_index()
                deployment = np.unique(ds['deployment'].data)[0]
                variables = ds.data_vars.keys()
                variables = eliminate_common_variables(variables)
                variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them
                ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
                qc_data = request_qc_json(ref_des)  # grab data from the qc database
                ref_des_dict = get_parameter_list(qc_data)
                deploy_info = get_deployment_information(qc_data, deployment)

                # Gap test. Get a list of gaps
                gap_list = test_gaps(qc_df)

                # Deployment Variables
                deploy_start = str(deploy_info['start_date'])
                deploy_stop = str(deploy_info['stop_date'])
                deploy_lon = deploy_info['longitude']
                deploy_lat = deploy_info['latitude']

                # Deployment Time
                data_start = ds.time_coverage_start
                data_stop = ds.time_coverage_end
                start_test = [str(deploy_start), str(data_start)]
                stop_test = [str(deploy_stop), str(data_stop)]

                # Deployment Distance
                data_lat = np.unique(ds['lat'])[0]
                data_lon = np.unique(ds['lon'])[0]
                dist_calc = distance((deploy_lat, deploy_lon), (data_lat, data_lon))
                if dist_calc < .5:  # if distance is less than .5 km
                    dist = True
                else:
                    dist = False
                dist_test = '{} [{} km]'.format(dist, dist_calc)

                # Unique times
                time = ds['time']
                len_time = time.__len__()
                len_time_unique = np.unique(time).__len__()
                if len_time == len_time_unique:
                    time_test = True
                else:
                    time_test = False

                for v in variables:
                    print(v)

                    # Availability test
                    if v in ref_des_dict[ds.stream]:
                        available = True
                    else:
                        available = False

                    if ds[v].dtype == np.dtype('S64') or ds[v].dtype == np.dtype('datetime64[ns]') or 'time' in v:
                        # this will skip most engineering/system variables because they are strings
                        # ['ref_des', 'stream', 'deployment', 'start', 'stop', 'distance_from_deploy_<=.5km',
                        #  'time_unique', 'variable', 'availability', 'all_nans', 'global_range_test', 'min', 'max',
                        #  'fill_test', 'fill_value', 'gaps', 'global_range', 'stuck_value', 'spike_test']
                        data.append((ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                     time_test, v, available, None, None, None, None, None, None, None,
                                     None, None, None))
                        continue
                    else:
                        var_data = ds[v].data

                    # NaN test. Make sure the parameter is not all NaNs
                    nan_test = np.all(np.isnan(var_data))

                    if not nan_test or available is False:
                        # Global range test
                        [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
                        try:
                            ind = reject_outliers(var_data, 3)
                            min = np.nanmin(var_data[ind])
                            max = np.nanmax(var_data[ind])
                        except TypeError:
                            min = None
                            max = None

                        if g_min is not None:
                            if min >= g_min:
                                if max <= g_max:
                                    gr_result = True
                                else:
                                    gr_result = False
                            else:
                                gr_result = False
                        else:
                            gr_result = None

                        # Fill Value test
                        try:
                            fill_value = ds[v]._FillValue
                            fill_test = np.any(var_data == ds[v]._FillValue)
                        except AttributeError:
                            fill_value = 'n/a'
                            fill_test = 'n/a'

                        data_tuple = (ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                      time_test, v, available, nan_test, gr_result, [g_min, min],
                                      [g_max, max], fill_test, fill_value, gap_list)

                        if v in qc_vars:
                            temp_list = []
                            tests = ['global_range_test', 'dataqc_stuckvaluetest', 'dataqc_spiketest']
                            for test in tests:
                                var = '{}_{}'.format(v, test)
                                group_var = 'group_{}'.format(var)
                                try:
                                    qc_df[group_var] = qc_df[var].diff().cumsum().fillna(0)
                                except KeyError as e:
                                    logging.warn('Error: P')
                                    temp_list.append('DNR')
                                    continue
                                tdf = qc_df.groupby([group_var, var])['time'].agg(['first', 'last'])
                                tdf = tdf.reset_index().drop([group_var], axis=1)
                                tdf = tdf.loc[tdf[var] == False].drop(var, axis=1)
                                tdf['first'] = tdf['first'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
                                tdf['last'] = tdf['last'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
                                if tdf.empty:
                                    temp_list.append([])
                                else:
                                    temp_list.append(map(list, tdf.values))
                            temp_tuple = data_tuple + tuple(temp_list)
                            data.append(temp_tuple)
                        else:
                            temp_tuple = data_tuple + ('n/a', 'n/a', 'n/a')
                            data.append(temp_tuple)
                    else:
                        data.append((ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                     time_test, v, available, nan_test, 'n/a', 'n/a', 'n/a', 'n/a', 'n/a',
                                     gap_list, 'n/a', 'n/a', 'n/a'))
        except Exception as e:
            logging.warn('Error: Processing failed due to {}.'.format(str(e)))
            raise

    df = pd.DataFrame(data, columns=['ref_des', 'stream', 'deployment', 'start', 'stop',
                                     'distance_from_deploy_<=.5km', 'time_unique', 'variable',
                                     'availability', 'all_nans', 'global_range_test',
                                     'min[global,data]', 'max[global,data]', 'fill_test',
                                     'fill_value', 'gaps', 'global_range', 'stuck_value',
                                     'spike_test'])
    df.to_csv(os.path.join(save_dir, '{}-{}-{}-{}-process_on_{}.csv'.format(
        ds.subsite, ds.node, ds.sensor, ds.stream,
        dt.now().strftime('%Y-%m-%dT%H%M00'))), index=False)
def download(folder, project_metadata, filesubset, since):
    # Use thredds_crawler to find DAP endpoints of the RAW data.
    total_datasets = []
    skips = Crawl.SKIPS + ['.*OTHER.*', '.*ancillary.*', '.*OLD_VERSIONS.*']

    try:
        for k, v in project_metadata.items():
            # http://regexr.com/3conn
            datasets = Crawl(
                v['catalog_xml'],
                select=[r'([0-9]+\..*|.*(-(a|A)|(ls|[0-9]s|isus|[aAB]s|sgs|aqds|vs|pcs|d|tide)-cal){1}(?!lp)(?!1(h|H))\.*.*)'],
                skip=skips,
                after=since).datasets
            logger.info("Found {0} datasets in {1}!".format(len(datasets), k))
            total_datasets += datasets
        logger.info("Found {0} TOTAL datasets!".format(len(total_datasets)))
    except KeyboardInterrupt:
        logger.info("Breaking out of crawling loop.")
        total_datasets = []

    try:
        os.makedirs(folder)
    except OSError:
        pass

    # Save datasets to download directory
    saved_files = []
    for num, d in enumerate(total_datasets):

        if filesubset and d.name.lower() not in filesubset:
            continue

        try:
            http_url = next(s["url"] for s in d.services if s["service"].lower() == "httpserver")
            project_name = http_url.split("/")[-2]
        except StopIteration:
            logger.error("No HTTPServer endpoint found, skipping")
            continue

        # Make download folder
        save_file = os.path.join(folder, project_name, d.name)
        if not os.path.isdir(os.path.dirname(save_file)):
            os.makedirs(os.path.dirname(save_file))

        logger.info("Downloading {0}".format(http_url))
        try:
            with open(save_file, "wb") as f:
                r = requests.get(http_url, stream=True)
                if not r.ok:
                    logger.error("Could not download '{!s}' from '{!s}', skipping".format(d.name, http_url))
                    break
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except KeyboardInterrupt:
            logger.info("Breaking out of download loop.")
            raise
        except BaseException:
            logger.error("Could not download... error with HTTP endpoint. Skipping.")
            continue

        # Try to open the file; if it fails, the write failed.
        try:
            with EnhancedDataset(save_file, 'a') as nc:
                name, _ = os.path.splitext(d.name)
                nc.id = "{0}/{1}".format(project_name, name)
        except BaseException:
            os.remove(save_file)
            raise
        else:
            logger.info("{!s} saved ({!s}/{!s})".format(d.name, num + 1, len(total_datasets)))
            saved_files.append(save_file)

    return saved_files
def download_data(data, array_name, refdes, method, stream):
    '''
    Download data from the THREDDS server after the data request is successful.

    data:    value returned by the request_data() function
    refdes:  Reference Designator
    method:  'telemetered' or other
    stream:  stream name from OOI
    beginDT: beginning time
    endDT:   ending time

    For details see:
    http://ooi.visualocean.net/instruments/view/GP02HYPM-WFP02-04-CTDPFL000
    '''
    # Get the data URL for the NetCDF file dataset from the THREDDS server.
    url = data['allURLs'][0]  # This is the THREDDS server address.
    print('THREDDS server address: ' + url)

    url = url.replace('.html', '.xml')
    tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
    c = Crawl(url, select=[".*\.nc$"], debug=False)
    datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
    #print(datasets)

    ds = xr.open_mfdataset(datasets)
    #ds = ds.swap_dims({'obs': 'time'})

    # Useful variables. Use L1 data and QC flags.
    select_var = ['time', 'lon', 'lat',
                  'ctdpf_ckl_seawater_temperature', 'ctdpf_ckl_seawater_conductivity',
                  'ctdpf_ckl_seawater_pressure', 'practical_salinity', 'density',
                  'density_qc_executed', 'density_qc_results',
                  'practical_salinity_qc_executed', 'practical_salinity_qc_results',
                  'ctdpf_ckl_seawater_pressure_qc_executed', 'ctdpf_ckl_seawater_pressure_qc_results',
                  'ctdpf_ckl_seawater_temperature_qc_executed', 'ctdpf_ckl_seawater_temperature_qc_results',
                  'ctdpf_ckl_seawater_conductivity_qc_executed', 'ctdpf_ckl_seawater_conductivity_qc_results']

    df = ds[select_var].to_dataframe()
    df.drop(columns=['pressure'], inplace=True)

    # Rename columns to make things easier.
    df.columns = ['time', 'lon', 'lat',
                  'sea_water_temperature', 'sea_water_conductivity', 'sea_water_pressure',
                  'sea_water_salinity', 'sea_water_density',
                  'density_qc_executed', 'density_qc_results',
                  'salinity_qc_executed', 'salinity_qc_results',
                  'pressure_qc_executed', 'pressure_qc_results',
                  'temperature_qc_executed', 'temperature_qc_results',
                  'conductivity_qc_executed', 'conductivity_qc_results']

    # Set normal or suspicious flags for each variable.
    # If any executed qc test did not pass, the value is considered suspicious.
    df['pressure_flag'] = df['pressure_qc_executed'] == df['pressure_qc_results']
    df['temperature_flag'] = df['temperature_qc_executed'] == df['temperature_qc_results']
    df['conductivity_flag'] = df['conductivity_qc_executed'] == df['conductivity_qc_results']
    df['density_flag'] = df['density_qc_executed'] == df['density_qc_results']
    df['salinity_flag'] = df['salinity_qc_executed'] == df['salinity_qc_results']

    df.to_csv(array_name + '_' + refdes + '_' + method + '_' + stream + '.csv')
    print('Data saved as ' + array_name + '_' + refdes + '_' + method + '_' + stream + '.csv')

    return df
def harvest_thredds(src_url, dest_url):
    c = Crawl(src_url)
    return c
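# Usage sketch for harvest_thredds. The catalog URL below is illustrative (reused
# from the AODN examples above), dest_url is unused by the function as written, and
# the OPeNDAP filtering mirrors the pattern in the other snippets in this collection.
c = harvest_thredds('http://thredds.aodn.org.au/thredds/catalog/IMOS/ABOS/SOTS/catalog.xml', None)
opendap_urls = [s.get("url") for d in c.datasets for s in d.services
                if s.get("service").lower() == "opendap"]
for u in opendap_urls:
    print(u)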
def find_urls(self, base, select, startdate, enddate):
    cat_url = os.path.join(base, 'catalog.xml')
    u = urlparse(cat_url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse(cat_url.replace(".html", ".xml"))
    cat_url = u.geturl()
    urls = []

    if self.args.realtime:
        self.logger.info(f"Attempting to crawl {cat_url} for realtime shore.nc4 files")
        skips = Crawl.SKIPS + [".*Courier*", ".*Express*", ".*Normal*", ".*Priority*",
                               ".*.cfg$", ".*.js$", ".*.kml$", ".*.log$"]
        crawl_debug = False
        if self.args.verbose > 2:
            crawl_debug = True
        rt_cat = Crawl(cat_url, select=[".*shore.nc4"], skip=skips, debug=crawl_debug)
        for url in [s.get("url") for d in rt_cat.datasets for s in d.services
                    if s.get("service").lower() == "opendap"]:
            dir_start = datetime.strptime(url.split('/')[-2], '%Y%m%dT%H%M%S')
            if startdate <= dir_start and dir_start <= enddate:
                self.logger.debug(f"Adding url {url}")
                urls.append(url)
    else:
        self.logger.debug(f"Attempting to Crawl {cat_url} looking for .dlist files")
        skips = Crawl.SKIPS + [".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"]
        dlist_cat = Crawl(cat_url, select=[".*dlist"], skip=skips)
        self.logger.info(
            f"Crawling {cat_url} for {select} files to make "
            f"{self.args.resampleFreq}_{self.args.appendString}.nc files")

        for dataset in dlist_cat.datasets:
            # Get the mission directory name and extract the start and ending dates
            dlist = os.path.basename(dataset.id)
            mission_dir_name = dlist.split('.')[0]
            dts = mission_dir_name.split('_')
            dir_start = datetime.strptime(dts[0], '%Y%m%d')
            dir_end = datetime.strptime(dts[1], '%Y%m%d')

            # If within a valid range, grab the valid urls
            self.logger.debug(f"Checking if .dlist {dlist} is within {startdate} and {enddate}")
            if (startdate <= dir_start and dir_start <= enddate) or (startdate <= dir_end and dir_end <= enddate):
                catalog = '{}_{}/catalog.xml'.format(dir_start.strftime('%Y%m%d'),
                                                     dir_end.strftime('%Y%m%d'))
                self.logger.debug(f"Crawling {os.path.join(base, catalog)}")
                log_cat = Crawl(os.path.join(base, catalog), select=[select], skip=skips)
                self.logger.debug(f"Getting opendap urls from datasets {log_cat.datasets}")
                for url in [s.get("url") for d in log_cat.datasets for s in d.services
                            if s.get("service").lower() == "opendap"]:
                    self.logger.debug(f"Adding url {url}")
                    urls.append(url)

    if not urls:
        self.logger.info("No URLs found.")

    return urls