def append_variable_data(ds, variable_dict, common_stream_name, exclude_times):
    """Append one dataset's variable data to the shared variable dictionary.

    Every variable name returned by ``cf.return_raw_vars`` is considered. A
    variable is used when its ``long_name`` attribute is contained in one of
    the keys of ``variable_dict[common_stream_name]['vars']`` (the first
    matching key wins) and its ``units`` equal that entry's ``'db_units'``.
    For such a variable the fill value and units are recorded (if new) and the
    time, pressure, value and deployment arrays are appended to the entry's
    ``'t'``, ``'pressure'``, ``'values'`` and ``'deployments'`` arrays.

    Variables with more than one dimension are stored column by column:
    ``entry['values']`` becomes a dict mapping the column index to a 1-D array.

    Any ``AttributeError`` raised while handling a variable (for example a
    missing ``long_name``, ``units`` or ``_FillValue`` attribute) skips that
    variable.

    Args:
        ds: Dataset exposing ``data_vars``, ``coords`` and ``ds[name]`` access
            with ``.values``/``.dims`` (presumably an ``xarray.Dataset`` --
            confirm against the caller).
        variable_dict: Nested dictionary, updated in place.
        common_stream_name: Key selecting the stream entry in
            ``variable_dict``.
        exclude_times: Sized collection of items, each handed to
            ``exclude_time_ranges`` to drop data. When it is non-empty and
            nothing is left after exclusion, nothing is appended.

    Returns:
        tuple: ``(variable_dict, pressure_unit, pressure_name)`` where the two
        lists hold the distinct pressure units and names reported by
        ``cf.add_pressure_to_dictionary_of_sci_vars``.
    """
    pressure_unit, pressure_name = [], []
    ds_vars = cf.return_raw_vars(list(ds.data_vars.keys()) + list(ds.coords))
    vars_dict = variable_dict[common_stream_name]['vars']
    series_keys = ('t', 'pressure', 'values', 'deployments')
    print('\nPARAMETERS: ')
    for var in ds_vars:
        try:
            long_name = ds[var].long_name
            matches = [key for key in vars_dict if long_name in key]
            if not matches:
                continue
            long_name = matches[0]
            entry = vars_dict[long_name]
            if ds[var].units != entry['db_units']:
                continue

            print('______', long_name)
            if ds[var]._FillValue not in entry['fv']:
                entry['fv'].append(ds[var]._FillValue)
            if ds[var].units not in entry['units']:
                entry['units'].append(ds[var].units)

            tD = ds['time'].values
            varD = ds[var].values
            deployD = ds['deployment'].values

            # find the pressure to use from the data file
            _, pD, p_unit, p_name, _ = cf.add_pressure_to_dictionary_of_sci_vars(ds)
            if p_unit not in pressure_unit:
                pressure_unit.append(p_unit)
            if p_name not in pressure_name:
                pressure_name.append(p_name)

            if len(ds[var].dims) == 1:
                if len(exclude_times) > 0:
                    for et in exclude_times:
                        tD, pD, varD, deployD = exclude_time_ranges(
                            tD, pD, varD, deployD, et)
                    if len(tD) == 0:
                        # everything was excluded: nothing to append
                        continue
                for key, new_data in zip(series_keys, (tD, pD, varD, deployD)):
                    entry[key] = np.append(entry[key], new_data)
            else:
                # appending 2D datasets: one array per column index
                if not isinstance(entry['values'], dict):
                    # pop first so 'values' moves to the end of the entry,
                    # exactly as the previous pop()/update() pair did
                    entry.pop('values')
                    entry['values'] = {}
                columns = entry['values']
                varD = varD.T

                # for presf_wave_burst data, telemetered and recovered_host
                # pressure data have a matrix of 20, while recovered_inst data
                # have a matrix of 1024. for DCL data, whatever is above 20
                # will be an array of nans as placeholders (so the indices
                # match between DCL and recovered_inst)
                if common_stream_name == 'presf_abc_wave_burst':
                    lendims = 1024
                else:
                    lendims = len(varD)

                for i in range(lendims):
                    # exclusion below shrinks these, so reset them per column
                    tD = ds['time'].values
                    deployD = ds['deployment'].values
                    pDi = pD
                    columns.setdefault(i, np.array([]))
                    try:
                        varDi = varD[i]
                    except IndexError:
                        # column missing from this file: NaN placeholder
                        varDi = np.full(np.shape(tD), np.nan)

                    if len(exclude_times) > 0:
                        for et in exclude_times:
                            tD, pDi, varDi, deployD = exclude_time_ranges(
                                tD, pDi, varDi, deployD, et)
                        if len(tD) == 0:
                            continue

                    if i == 0:
                        # time, pressure and deployment are shared by all
                        # columns, so they are appended only once
                        entry['t'] = np.append(entry['t'], tD)
                        entry['pressure'] = np.append(entry['pressure'], pDi)
                        entry['deployments'] = np.append(
                            entry['deployments'], deployD)
                    columns[i] = np.append(columns[i], varDi)
        except AttributeError:
            continue
    return variable_dict, pressure_unit, pressure_name
def append_evaluated_data(sDir, deployment, ds, variable_dict, common_stream_name, zdbar):
    """Screen one dataset's variables and append what survives to the dictionary.

    A variable is processed when its ``long_name`` is contained in a key of
    ``variable_dict[common_stream_name]['vars']`` (first matching key) and its
    units equal that entry's ``'db_units'``. Its data then pass through, in
    order, ``reject_erroneous_data``, ``reject_timestamps_data_portal``,
    ``reject_timestamps_from_stat_analysis`` and
    ``reject_data_in_depth_range``; a later step only runs while data remain.
    The number of points removed by each step is printed, and whatever is left
    is appended to the entry's ``'t'``, ``'pressure'``, ``'values'`` and
    ``'deployments'`` arrays.

    An ``AttributeError`` raised while handling a variable skips it.

    Args:
        sDir: Base directory used to build the ``time_to_exclude`` path.
        deployment: Passed through to ``reject_timestamps_from_stat_analysis``.
        ds: Dataset providing ``subsite``, ``node``, ``sensor``, ``data_vars``,
            ``coords`` and item access (presumably an ``xarray.Dataset`` --
            confirm against the caller).
        variable_dict: Nested dictionary, updated in place.
        common_stream_name: Key selecting the stream entry.
        zdbar: Passed to ``reject_data_in_depth_range`` and echoed in the
            printed summary.

    Returns:
        tuple: ``(variable_dict, pressure_unit, pressure_name, total_len)``;
        ``total_len`` is the summed pre-screening length of all processed
        variables.
    """
    pressure_unit, pressure_name = [], []
    r = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
    candidate_vars = cf.return_raw_vars(list(ds.data_vars.keys()) + list(ds.coords))
    vars_dict = variable_dict[common_stream_name]['vars']
    total_len = 0

    for var in candidate_vars:
        try:
            long_name = ds[var].long_name
            matches = [key for key in list(vars_dict.keys()) if long_name in key]
            if len(matches) == 0:
                continue
            long_name = matches[0]
            entry = vars_dict[long_name]

            units = ds[var].units
            if units != entry['db_units']:
                continue
            print('\n' + var)

            fill_value = ds[var]._FillValue
            if fill_value not in entry['fv']:
                entry['fv'].append(fill_value)
            if units not in entry['units']:
                entry['units'].append(units)

            tD = ds['time'].values
            varD = ds[var].values
            deployD = ds['deployment'].values

            # find the pressure to use from the data file
            pressure_info = cf.add_pressure_to_dictionary_of_sci_vars(ds)
            pD, p_unit, p_name = pressure_info[1], pressure_info[2], pressure_info[3]
            if p_unit not in pressure_unit:
                pressure_unit.append(p_unit)
            if p_name not in pressure_name:
                pressure_name.append(p_name)

            n_start = len(tD)

            # step 1: reject erroneous data
            tD, pD, varD, deployD = reject_erroneous_data(
                r, var, tD, pD, varD, deployD, ds[var]._FillValue)
            n_clean = len(tD)
            print('{} erroneous data'.format(n_start - n_clean))

            if n_clean == 0:
                print('erroneous data - rejected all')
            else:
                # step 2: reject time range from data portal file export
                tD, pD, varD, deployD = reject_timestamps_data_portal(
                    ds.subsite, r, tD, pD, varD, deployD)
                n_portal = len(tD)
                print('{} suspect - data portal'.format(n_clean - n_portal))

                if n_portal == 0:
                    print('suspect data - rejected all, see data portal')
                else:
                    # step 3: reject timestamps from stat analysis
                    Dpath = '{}/{}/{}/{}/{}'.format(
                        sDir, ds.subsite[0:2], ds.subsite, r, 'time_to_exclude')
                    tD, pD, varD, deployD = reject_timestamps_from_stat_analysis(
                        Dpath, deployment, var, tD, pD, varD, deployD)
                    n_stat = len(tD)
                    print('{} suspect - stat analysis'.format(n_portal - n_stat))

                    # step 4: reject timestamps in a depth range
                    tD, pD, varD, deployD = reject_data_in_depth_range(
                        tD, pD, varD, deployD, zdbar)
                    n_depth = len(tD)
                    print('{} suspect - water depth > {} dbar'.format(
                        n_stat - n_depth, zdbar))

            # store whatever survived the screening
            survivors = (('t', tD), ('pressure', pD),
                         ('values', varD), ('deployments', deployD))
            for key, data in survivors:
                entry[key] = np.append(entry[key], data)

            total_len += n_start
        except AttributeError:
            continue

    return variable_dict, pressure_unit, pressure_name, total_len
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Create a temperature-salinity (T-S) plot for each selected dataset.

    The reference designators are parsed from the second-to-last path
    component of every URL. For each of them the NetCDF dataset locations are
    gathered with ``cf.get_nc_urls`` and, when ``preferred_only == 'yes'``,
    reduced to the streams/deployments listed by
    ``cf.get_preferred_stream_info``. Each dataset is opened, optionally
    sliced in time, cleaned (NaNs, zeros, fill values, global ranges,
    outliers) and plotted over density contours computed with
    ``gsw.density.rho`` at the median pressure.

    Args:
        sDir: Base directory; plots go to
            ``<sDir>/<array>/<subsite>/<refdes>/ts_plots``.
        url_list: Iterable of URL strings whose second-to-last ``/``-separated
            component is ``-``-separated with at least five parts.
        start_time: Start of the time slice, or ``None``.
        end_time: End of the time slice, or ``None``. The slice is applied
            only when both bounds are given.
        preferred_only: ``'yes'`` to plot only the preferred streams; any
            other value plots every dataset.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        if preferred_only == 'yes':
            fdatasets = []
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell is not a string (presumably NaN when a
                        # deployment has fewer preferred streams): skip it
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        for fd in fdatasets:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes, 'ts_plots')
                cf.create_dir(save_dir)

                tme = ds['time'].values
                t0 = pd.to_datetime(tme.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tme.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                filename = '-'.join(('_'.join(fname.split('_')[:-1]), 'ts', t0[:10]))

                ds_vars = list(ds.data_vars.keys())
                raw_vars = cf.return_raw_vars(ds_vars)

                xvar = return_var(ds, raw_vars, 'salinity', 'Practical Salinity')
                sal = ds[xvar].values
                sal_fv = ds[xvar]._FillValue
                n_raw = len(sal)  # used to report how many values were removed

                yvar = return_var(ds, raw_vars, 'temp', 'Seawater Temperature')
                temp = ds[yvar].values
                temp_fv = ds[yvar]._FillValue

                press = pf.pressure_var(ds, list(ds.coords.keys()))
                if press is None:
                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                p = ds[press].values

                # get rid of nans, 0.0s, fill values
                sind1 = (~np.isnan(sal)) & (sal != 0.0) & (sal != sal_fv)
                sal, temp, tme, p = sal[sind1], temp[sind1], tme[sind1], p[sind1]
                tind1 = (~np.isnan(temp)) & (temp != 0.0) & (temp != temp_fv)
                sal, temp, tme, p = sal[tind1], temp[tind1], tme[tind1], p[tind1]

                # reject values outside global ranges (skipped when a bound
                # is not available)
                global_min, global_max = cf.get_global_ranges(r, xvar)
                if global_min is not None and global_max is not None:
                    sgr_ind = cf.reject_global_ranges(sal, global_min, global_max)
                    sal, temp, tme, p = sal[sgr_ind], temp[sgr_ind], tme[sgr_ind], p[sgr_ind]

                global_min, global_max = cf.get_global_ranges(r, yvar)
                if global_min is not None and global_max is not None:
                    tgr_ind = cf.reject_global_ranges(temp, global_min, global_max)
                    sal, temp, tme, p = sal[tgr_ind], temp[tgr_ind], tme[tgr_ind], p[tgr_ind]

                # get rid of outliers
                soind = cf.reject_outliers(sal, 5)
                sal, temp, tme, p = sal[soind], temp[soind], tme[soind], p[soind]
                toind = cf.reject_outliers(temp, 5)
                sal, temp, tme, p = sal[toind], temp[toind], tme[toind], p[toind]

                if len(sal) == 0:
                    # nothing left to plot
                    continue

                colors = cm.rainbow(np.linspace(0, 1, len(tme)))

                # Figure out boundaries (mins and maxes)
                s_lo, s_hi = sal.min(), sal.max()
                t_lo, t_hi = temp.min(), temp.max()
                t_range = t_hi - t_lo

                s_pad = 0.0005 if s_hi - s_lo < 0.2 else 0.001
                smin = s_lo - (s_pad * s_lo)
                smax = s_hi + (s_pad * s_hi)

                if t_range <= 1:
                    t_pad = 0.01
                elif t_range < 1.5:
                    t_pad = 0.05
                else:
                    t_pad = 0.1
                # abs() keeps the padding pointing outwards for sub-zero
                # temperatures; for positive values the result is unchanged
                tmin = t_lo - (t_pad * abs(t_lo))
                tmax = t_hi + (t_pad * abs(t_hi))

                # Calculate how many gridcells are needed in the x and y
                # directions and create temp and sal vectors of appropriate
                # dimensions
                xdim = int(round((smax - smin) / 0.1 + 1, 0))
                if xdim == 1:
                    xdim = 2
                si = np.linspace(0, xdim - 1, xdim) * 0.1 + smin

                if 1.1 <= t_range < 1.7:
                    t_step = 0.75
                elif t_range < 1.1:
                    t_step = 0.1
                else:
                    t_step = 1
                ydim = int(round((tmax - tmin) / t_step + 1, 0))
                ti = np.linspace(0, ydim - 1, ydim) * t_step + tmin

                # Fill the grid with densities calculated at the median
                # pressure (computed once instead of once per grid cell)
                p_median = np.median(p)
                mdens = np.zeros((ydim, xdim))
                for j in range(ydim):
                    for i in range(xdim):
                        mdens[j, i] = gsw.density.rho(si[i], ti[j], p_median)

                fig, ax = pf.plot_ts(si, ti, mdens, sal, temp, colors)

                ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\ncolors = time (cooler: earlier)'), fontsize=9)
                leg_text = ('Removed {} values (SD=5)'.format(n_raw - len(sal)),)
                ax.legend(leg_text, loc='best', fontsize=6)
                pf.save_fig(save_dir, filename)
def main(sDir, ncdir, start_time, end_time):
    """Create timeseries plots for every NetCDF file found under ``ncdir``.

    Files ending in ``.nc`` whose name does not contain ``_blank`` are
    collected with ``os.walk``. Each one is opened, optionally sliced in time,
    and every selected one-dimensional variable is plotted twice with
    ``pf.plot_timeseries``: once with all data and once with ``stdev=5``
    (saved with the ``_rmoutliers`` suffix).

    Args:
        sDir: Base directory; plots go to
            ``<sDir>/<array>/<subsite>/<refdes>/timeseries_plots/<deployment>``.
        ncdir: Directory searched for ``.nc`` files. Its second-to-last
            ``/``-separated component is printed as the reference designator,
            so the path is expected to end with ``/``.
        start_time: Start of the time slice, or ``None``.
        end_time: End of the time slice, or ``None``. The slice is applied
            only when both bounds are given.
    """
    r = ncdir.split('/')[-2]
    print('\n{}'.format(r))

    # Keep the full path of each file so files in sub-directories of ncdir
    # are opened from where they actually live.
    datasets = []
    for root, dirs, files in os.walk(ncdir):
        for f in files:
            if f.endswith('.nc') and '_blank' not in f:
                datasets.append(os.path.join(root, f))

    for fd in datasets:
        # the context manager closes the file on the early `continue` and on
        # errors as well
        with xr.open_dataset(fd, mask_and_scale=False) as ds:
            ds = ds.swap_dims({'obs': 'time'})
            # get pressure variable from coordinates
            ds_vars = list(ds.data_vars.keys()) + [x for x in ds.coords.keys() if 'pressure' in x]

            if start_time is not None and end_time is not None:
                ds = ds.sel(time=slice(start_time, end_time))
                if len(ds['time'].values) == 0:
                    print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                    continue

            fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

            # 'VEL3D' must be tested against refdes; the former string literal
            # 'VEL3D in refdes' was always true, so this branch was taken for
            # every instrument
            if 'NUTNR' in refdes or 'VEL3D' in refdes:
                plot_vars = cf.return_science_vars(stream)
            else:
                plot_vars = cf.return_raw_vars(ds_vars)

            print('\nPlotting {} {}'.format(r, deployment))
            array = subsite[0:2]
            filename = '_'.join(fname.split('_')[:-1])
            save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
            cf.create_dir(save_dir)

            tm = ds['time'].values
            t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
            t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
            title = ' '.join((deployment, refdes, method))
            full_title = title + '\n' + t0 + ' - ' + t1

            for var in plot_vars:
                print(var)
                if var in ('id', 'record_type', 'unique_id'):
                    continue

                y = ds[var]
                fv = getattr(y, '_FillValue', np.nan)
                if len(y.dims) != 1:
                    continue

                y[y == fv] = np.nan  # turn fill values to nans
                if np.isnan(y.values).all():
                    print('Array of all NaNs and/or fill values - skipping plot.')
                    continue

                # reject fill values
                ind = y.values != fv
                t = tm[ind]
                y = y[ind]

                # Plot all data, then the data with outliers removed
                sfile = '-'.join((filename, y.name, t0[:10]))
                for stdev, suffix in ((None, ''), (5, '_rmoutliers')):
                    fig, ax = pf.plot_timeseries(t, y, y.name, stdev=stdev)
                    ax.set_title(full_title, fontsize=9)
                    pf.save_fig(save_dir, sfile + suffix)
def main(sDir, url_list, start_time, end_time, preferred_only):
    """Create timeseries plots for the datasets referenced by ``url_list``.

    The reference designators are parsed from the second-to-last path
    component of every URL. For each of them the NetCDF dataset locations are
    gathered with ``cf.get_nc_urls``, optionally reduced to the preferred
    streams/deployments, de-duplicated, and filtered with
    ``cf.filter_collocated_instruments``. Every selected one-dimensional
    variable of each dataset is plotted twice with ``pf.plot_timeseries``:
    once with all data and once with ``stdev=5`` (saved with the
    ``_rmoutliers`` suffix).

    Args:
        sDir: Base directory; plots go to
            ``<sDir>/<array>/<subsite>/<refdes>/timeseries_plots/<deployment>``.
        url_list: Iterable of URL strings whose second-to-last ``/``-separated
            component is ``-``-separated with at least five parts.
        start_time: Start of the time slice, or ``None``.
        end_time: End of the time slice, or ``None``. The slice is applied
            only when both bounds are given.
        preferred_only: ``'yes'`` to plot only the preferred streams; any
            other value plots every dataset.
    """
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                datasets.append(cf.get_nc_urls([u]))
        datasets = list(itertools.chain(*datasets))

        if preferred_only == 'yes':
            fdatasets = []
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    try:
                        rms = '-'.join((r, row[ii]))
                    except TypeError:
                        # the cell is not a string, so there is no stream to
                        # match in this slot
                        continue
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        fdatasets = np.unique(fdatasets).tolist()
        main_sensor = r.split('-')[-1]
        fdatasets = cf.filter_collocated_instruments(main_sensor, fdatasets)

        for fd in fdatasets:
            if '_blank' in fd:
                continue

            # the context manager closes the file on the early `continue` and
            # on errors as well
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})
                # get pressure variable from coordinates
                ds_vars = list(ds.data_vars.keys()) + [x for x in ds.coords.keys() if 'pressure' in x]

                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                if 'NUTNR' in refdes:
                    plot_vars = cf.return_science_vars(stream)
                else:
                    plot_vars = cf.return_raw_vars(ds_vars)

                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                filename = '_'.join(fname.split('_')[:-1])
                save_dir = os.path.join(sDir, array, subsite, refdes, 'timeseries_plots', deployment)
                cf.create_dir(save_dir)

                tm = ds['time'].values
                t0 = pd.to_datetime(tm.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tm.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                full_title = title + '\n' + t0 + ' - ' + t1

                for var in plot_vars:
                    print(var)
                    if var == 'id':
                        continue

                    y = ds[var]
                    fv = getattr(y, '_FillValue', np.nan)
                    if len(y.dims) != 1:
                        continue

                    # Check if the array is all NaNs
                    if np.isnan(y.values).all():
                        print('Array of all NaNs - skipping plot.')
                        continue
                    # Check if the array is all fill values
                    if len(y[y != fv]) == 0:
                        print('Array of all fill values - skipping plot.')
                        continue

                    # reject fill values
                    ind = y.values != fv
                    t = tm[ind]
                    y = y[ind]

                    # Plot all data, then the data with outliers removed
                    sfile = '-'.join((filename, y.name, t0[:10]))
                    for stdev, suffix in ((None, ''), (5, '_rmoutliers')):
                        fig, ax = pf.plot_timeseries(t, y, y.name, stdev=stdev)
                        ax.set_title(full_title, fontsize=9)
                        pf.save_fig(save_dir, sfile + suffix)
def main(sDir, f, start_time, end_time):
    """Create profile plots for every URL listed in a CSV file.

    The CSV at ``<sDir>/<f>`` is read and its ``outputUrl`` column gives the
    URLs to process. For each URL the dataset locations come from
    ``cf.get_nc_urls`` filtered by ``cf.filter_collocated_instruments``. Each
    variable returned by ``cf.return_raw_vars`` (except the pressure variable)
    is plotted against pressure with ``pf.plot_profiles``, once with all data
    and once with ``stdev=5`` (saved with the ``rmoutliers`` suffix).

    Args:
        sDir: Directory holding the CSV file; plots go to
            ``<sDir>/<subsite>/<refdes>/profile_plots/<deployment>``.
        f: Name of the CSV file inside ``sDir``.
        start_time: Start of the time slice, or ``None``.
        end_time: End of the time slice, or ``None``. The slice is applied
            only when both bounds are given.
    """
    url_table = pd.read_csv(os.path.join(sDir, f))
    url_list = url_table['outputUrl'].tolist()

    for url_number, u in enumerate(url_list, start=1):
        print('\nUrl {} of {}: {}'.format(url_number, len(url_list), u))
        main_sensor = u.split('/')[-2].split('-')[4]
        all_datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, all_datasets)

        for ds_number, d in enumerate(datasets_sel, start=1):
            print('\nDataset {} of {}: {}'.format(ds_number, len(datasets_sel), d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})

                slice_requested = start_time is not None and end_time is not None
                if slice_requested:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)
                var_names = ds.data_vars.keys()

                # for glider CTDs, pressure is a coordinate
                is_glider_ctd = 'MOAS' in subsite and 'CTD' in main_sensor
                pressure = ('sci_water_pressure_dbar' if is_glider_ctd
                            else pf.pressure_var(ds, var_names))

                # remove pressure from sci_vars
                raw_vars = [name for name in cf.return_raw_vars(var_names)
                            if name not in [pressure]]

                save_dir = os.path.join(sDir, subsite, refdes, 'profile_plots', deployment)
                cf.create_dir(save_dir)

                t = ds['time'].values
                t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                full_title = title + '\n' + t0 + ' - ' + t1
                colors = cm.rainbow(np.linspace(0, 1, len(t)))
                file_stem = fname[:-46]

                y = ds[pressure]

                print('Plotting variables...')
                for var in raw_vars:
                    print(var)
                    x = ds[var]
                    xlabel = var + " (" + x.units + ")"
                    ylabel = pressure + " (" + y.units + ")"

                    # first all data, then the data with outliers removed
                    variants = (
                        (None, (file_stem, x.name)),
                        (5, (file_stem, x.name, 'rmoutliers')),
                    )
                    for stdev, name_parts in variants:
                        fig, ax = pf.plot_profiles(x, y, colors, ylabel, xlabel, stdev=stdev)
                        ax.set_title(full_title, fontsize=9)
                        sfile = '_'.join(name_parts)
                        pf.save_fig(save_dir, sfile)