# Shared imports for the analysis and plotting scripts below. The functions.*
# module paths are assumed from the data-review-tools project layout; return_var,
# get_deployment_information, eliminate_common_variables and compare_lists are
# helper functions defined elsewhere in these scripts.
import os
import json
import itertools
import datetime as dt
from datetime import timedelta
from collections import OrderedDict

import gsw
import netCDF4 as nc
import numpy as np
import pandas as pd
import xarray as xr
from matplotlib import cm, pyplot

import functions.common as cf
import functions.plotting as pf
import functions.combine_datasets as cd
import functions.group_by_timerange as gt


def main(sDir, url_list, start_time, end_time, preferred_only):
    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    for r in rd_list:
        print('\n{}'.format(r))
        datasets = []
        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))

        fdatasets = []
        if preferred_only == 'yes':
            # get the preferred stream information
            ps_df, n_streams = cf.get_preferred_stream_info(r)
            for index, row in ps_df.iterrows():
                for ii in range(n_streams):
                    rms = '-'.join((r, row[ii]))
                    for dd in datasets:
                        spl = dd.split('/')[-2].split('-')
                        catalog_rms = '-'.join((spl[1], spl[2], spl[3], spl[4], spl[5], spl[6]))
                        fdeploy = dd.split('/')[-1].split('_')[0]
                        if rms == catalog_rms and fdeploy == row['deployment']:
                            fdatasets.append(dd)
        else:
            fdatasets = datasets

        for fd in fdatasets:
            with xr.open_dataset(fd, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)
                print('\nPlotting {} {}'.format(r, deployment))
                array = subsite[0:2]
                save_dir = os.path.join(sDir, array, subsite, refdes, 'ts_plots')
                cf.create_dir(save_dir)

                tme = ds['time'].values
                t0 = pd.to_datetime(tme.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(tme.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                filename = '-'.join(('_'.join(fname.split('_')[:-1]), 'ts', t0[:10]))

                ds_vars = list(ds.data_vars.keys())
                raw_vars = cf.return_raw_vars(ds_vars)

                xvar = return_var(ds, raw_vars, 'salinity', 'Practical Salinity')
                sal = ds[xvar].values
                sal_fv = ds[xvar]._FillValue

                yvar = return_var(ds, raw_vars, 'temp', 'Seawater Temperature')
                temp = ds[yvar].values
                temp_fv = ds[yvar]._FillValue

                press = pf.pressure_var(ds, list(ds.coords.keys()))
                if press is None:
                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                p = ds[press].values

                # get rid of NaNs, 0.0s, fill values
                sind1 = (~np.isnan(sal)) & (sal != 0.0) & (sal != sal_fv)
                sal = sal[sind1]
                temp = temp[sind1]
                tme = tme[sind1]
                p = p[sind1]
                tind1 = (~np.isnan(temp)) & (temp != 0.0) & (temp != temp_fv)
                sal = sal[tind1]
                temp = temp[tind1]
                tme = tme[tind1]
                p = p[tind1]

                # reject values outside global ranges (skip if no ranges are defined)
                global_min, global_max = cf.get_global_ranges(r, xvar)
                if global_min is not None and global_max is not None:
                    sgr_ind = cf.reject_global_ranges(sal, global_min, global_max)
                    sal = sal[sgr_ind]
                    temp = temp[sgr_ind]
                    tme = tme[sgr_ind]
                    p = p[sgr_ind]

                global_min, global_max = cf.get_global_ranges(r, yvar)
                if global_min is not None and global_max is not None:
                    tgr_ind = cf.reject_global_ranges(temp, global_min, global_max)
                    sal = sal[tgr_ind]
                    temp = temp[tgr_ind]
                    tme = tme[tgr_ind]
                    p = p[tgr_ind]

                # get rid of outliers
                soind = cf.reject_outliers(sal, 5)
                sal = sal[soind]
                temp = temp[soind]
                tme = tme[soind]
                p = p[soind]
                toind = cf.reject_outliers(temp, 5)
                sal = sal[toind]
                temp = temp[toind]
                tme = tme[toind]
                p = p[toind]

                if len(sal) > 0:  # if there are any data to plot
                    colors = cm.rainbow(np.linspace(0, 1, len(tme)))

                    # Figure out boundaries (mins and maxes)
                    if sal.max() - sal.min() < 0.2:
                        smin = sal.min() - (0.0005 * sal.min())
                        smax = sal.max() + (0.0005 * sal.max())
                    else:
                        smin = sal.min() - (0.001 * sal.min())
                        smax = sal.max() + (0.001 * sal.max())
                    if temp.max() - temp.min() <= 1:
                        tmin = temp.min() - (0.01 * temp.min())
                        tmax = temp.max() + (0.01 * temp.max())
                    elif 1 < temp.max() - temp.min() < 1.5:
                        tmin = temp.min() - (0.05 * temp.min())
                        tmax = temp.max() + (0.05 * temp.max())
                    else:
                        tmin = temp.min() - (0.1 * temp.min())
                        tmax = temp.max() + (0.1 * temp.max())

                    # Calculate how many gridcells are needed in the x and y directions
                    # and create temp and sal vectors of appropriate dimensions
                    xdim = int(round((smax - smin) / 0.1 + 1, 0))
                    if xdim == 1:
                        xdim = 2
                    si = np.linspace(0, xdim - 1, xdim) * 0.1 + smin

                    if 1.1 <= temp.max() - temp.min() < 1.7:  # if the diff between min and max temp is small
                        ydim = int(round((tmax - tmin) / 0.75 + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) * 0.75 + tmin
                    elif temp.max() - temp.min() < 1.1:
                        ydim = int(round((tmax - tmin) / 0.1 + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) * 0.1 + tmin
                    else:
                        ydim = int(round((tmax - tmin) + 1, 0))
                        ti = np.linspace(0, ydim - 1, ydim) + tmin

                    # Create empty grid of zeros
                    mdens = np.zeros((ydim, xdim))

                    # Loop to fill in grid with densities, using the median pressure value
                    for j in range(0, ydim):
                        for i in range(0, xdim):
                            mdens[j, i] = gsw.density.rho(si[i], ti[j], np.median(p))

                    fig, ax = pf.plot_ts(si, ti, mdens, sal, temp, colors)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1 + '\ncolors = time (cooler: earlier)'), fontsize=9)
                    leg_text = ('Removed {} values (SD=5)'.format(len(ds[xvar].values) - len(sal)),)
                    ax.legend(leg_text, loc='best', fontsize=6)
                    pf.save_fig(save_dir, filename)
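# The isopycnal-grid technique above, as a self-contained sketch: build a small
# salinity/temperature mesh and evaluate gsw.density.rho at one fixed pressure.
# The axis ranges and the default 50-dbar pressure are illustrative assumptions,
# not values taken from any OOI dataset.
def demo_density_grid(p_median=50.0):
    si = np.linspace(31.0, 35.0, 41)   # practical salinity axis
    ti = np.linspace(4.0, 18.0, 29)    # temperature axis
    mdens = np.zeros((len(ti), len(si)))
    for j in range(len(ti)):
        for i in range(len(si)):
            # same call pattern as the plotting loop above
            mdens[j, i] = gsw.density.rho(si[i], ti[j], p_median)
    return si, ti, mdens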
def add_pressure_to_dictionary_of_sci_vars(ds):
    y_unit = []
    y_name = []
    y_fillvalue = []
    if 'MOAS' in ds.subsite:
        if 'CTD' in ds.sensor:  # for glider CTDs, pressure is a coordinate
            pressure = 'sci_water_pressure_dbar'
            y = ds[pressure].values
            if ds[pressure].units not in y_unit:
                y_unit.append(ds[pressure].units)
            if ds[pressure].long_name not in y_name:
                y_name.append(ds[pressure].long_name)
        else:
            pressure = 'int_ctd_pressure'
            y = ds[pressure].values
            try:
                if ds[pressure].units not in y_unit:
                    y_unit.append(ds[pressure].units)
            except AttributeError:
                y_unit.append('no_units')
            try:
                if ds[pressure].long_name not in y_name:
                    y_name.append(ds[pressure].long_name)
            except AttributeError:
                y_name.append('pressure')
    else:
        try:
            pressure = pf.pressure_var(ds, ds.data_vars.keys())
            y = ds[pressure].values
            if len(ds[pressure].dims) > 1:
                print('Pressure has >1 dimension')
                y_empty = np.empty((1, len(ds['time'].values)))
                y_empty[:] = np.nan
                y = y_empty.ravel()
        except KeyError:
            print('no pressure variable in file - replacing by a NaN array')
            y_empty = np.empty((1, len(ds['time'].values)))
            y_empty[:] = np.nan
            y = y_empty.ravel()

        if sum(np.isnan(y)) == len(y) or len(y[y != 0]) == 0 or len(y[y != ds[pressure]._FillValue]) == 0:
            print('Pressure array of all NaNs or zeros or fill values - trying to use pressure coordinate')
            pressure = [pressure for pressure in ds.coords.keys() if 'pressure' in ds.coords[pressure].name]
            if len(pressure) == 1:
                pressure = pressure[0]
                y = ds.coords[pressure].values
            else:
                print('Missing pressure coordinate: ', pressure)
                y_empty = np.empty((1, len(ds['time'].values)))
                y_empty[:] = np.nan
                y = y_empty.ravel()

        try:
            ds[pressure].units
            if ds[pressure].units not in y_unit:
                y_unit.append(ds[pressure].units)
        except AttributeError:
            print('pressure attributes missing units')
            if 'pressure unit missing' not in y_unit:
                y_unit.append('pressure unit missing')

        try:
            ds[pressure].long_name
            if ds[pressure].long_name not in y_name:
                y_name.append(ds[pressure].long_name)
        except AttributeError:
            print('pressure attributes missing long_name')
            if 'pressure long name missing' not in y_name:
                y_name.append('pressure long name missing')

        try:
            ds[pressure]._FillValue
            if ds[pressure]._FillValue not in y_fillvalue:
                y_fillvalue.append(ds[pressure]._FillValue)
        except AttributeError:
            print('pressure attributes missing _FillValue')
            if 'pressure Fill Value missing' not in y_fillvalue:
                y_fillvalue.append('pressure Fill Value missing')

    return pressure, y, y_unit, y_name, y_fillvalue
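# Minimal sketch of the fallback pattern used above: when no usable pressure
# record exists, substitute a NaN array shaped like the time coordinate so
# downstream code still receives one value per timestamp. The helper name is
# hypothetical.
def nan_pressure_like(ds):
    y_empty = np.empty((1, len(ds['time'].values)))
    y_empty[:] = np.nan
    return y_empty.ravel()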
def main(sDir, url_list):
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv')

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r) & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = ['deployment%04d' % int(x) for x in review_deployments]

                for rev_dep in review_deployments_int:
                    rdatasets = [s for s in udatasets if rev_dep in s]
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split('_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        if len(datasets) == 1:
                            try:
                                ds = xr.open_dataset(datasets[0], mask_and_scale=False)
                                ds = ds.swap_dims({'obs': 'time'})
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
                            except OSError:
                                print('OSError - skipping file {}'.format(datasets[0]))
                                continue
                        elif len(datasets) > 1:
                            ds = xr.open_mfdataset(datasets, mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
                            fname = fname.split('_20')[0]
                            notes.append('multiple deployment .nc files')
                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'
                        else:
                            continue

                        print('\nAnalyzing file: {}'.format(fname))

                        # Get info from the data review database
                        dr_data = cf.refdes_datareview_json(refdes)
                        stream_vars = cf.return_stream_vars(data_stream)
                        sci_vars = cf.return_science_vars(data_stream)
                        node = refdes.split('-')[1]
                        if 'cspp' in data_stream or 'WFP' in node:
                            sci_vars.append('int_ctd_pressure')

                        deploy_info = get_deployment_information(dr_data, int(deployment[-4:]))

                        # Grab deployment variables
                        deploy_start = str(deploy_info['start_date'])
                        deploy_stop = str(deploy_info['stop_date'])
                        deploy_lon = deploy_info['longitude']
                        deploy_lat = deploy_info['latitude']
                        deploy_depth = deploy_info['deployment_depth']

                        # Calculate days deployed
                        if deploy_stop != 'None':
                            r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
                            if deploy_stop.split('T')[1] == '00:00:00':
                                r_deploy_stop = pd.to_datetime(deploy_stop)
                            else:
                                r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(
                                    hour=0, minute=0, second=0)
                            n_days_deployed = (r_deploy_stop - r_deploy_start).days
                        else:
                            n_days_deployed = None

                        # Add reference designator to dictionary
                        try:
                            data['refdes']
                        except KeyError:
                            data['refdes'] = refdes

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(ds['time'].values)).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(
                                deploy_start=deploy_start, deploy_stop=deploy_stop,
                                n_days_deployed=n_days_deployed, lon=deploy_lon, lat=deploy_lat,
                                deploy_depth=deploy_depth, method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment]['method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][method] = OrderedDict(stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][method]['stream'].keys()
                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream] = OrderedDict(
                                file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(ds['time'].values, columns=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = float(row['time']['count']) / float(n_diff_calc)
                            if percent > 0.1:
                                rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})

                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k
                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

                        # Check that the timestamps in the file are unique
                        time = ds['time']
                        len_time = len(time)
                        len_time_unique = len(np.unique(time))
                        if len_time == len_time_unique:
                            time_test = 'pass'
                        else:
                            time_test = 'fail'

                        # Check that the timestamps in the file are in ascending order
                        if time_ascending != 'not_tested':
                            # convert time to number
                            time_in = [dt.datetime.utcfromtimestamp(np.datetime64(x).astype('O') / 1e9)
                                       for x in ds['time'].values]
                            time_data = nc.date2num(time_in, 'seconds since 1900-01-01')

                            # Create a list of True or False by iterating through the array of time and checking
                            # if every timestamp is increasing
                            result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)]

                            # Record the outcome, with the list of indices where time is not increasing
                            if result.count(True) == len(time) - 1:
                                time_ascending = 'pass'
                            else:
                                ind_fail = {k: time_in[k] for k, v in enumerate(result) if v is False}
                                time_ascending = 'fail: {}'.format(ind_fail)

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(np.unique(time.values.astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(ds.coords.keys())
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [x for x in ds_variables if 'qc' not in x]
                        [_, unmatch1] = compare_lists(stream_vars, ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables, stream_vars)

                        # Check deployment pressure from asset management against pressure variable in file
                        press = pf.pressure_var(ds, list(ds.coords.keys()))
                        if press is None:
                            press = pf.pressure_var(ds, list(ds.data_vars.keys()))

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            pressure = ds[press]
                            num_dims = len(pressure.dims)
                            if len(pressure) > 1:
                                # if the pressure variable is an array of all zeros (as in the case of pressure_depth
                                # for OPTAAs on surface-piercing profilers), fall back to int_ctd_pressure
                                if (len(np.unique(pressure)) == 1) & (np.unique(pressure)[0] == 0.0):
                                    try:
                                        pressure = ds['int_ctd_pressure']
                                        press = 'int_ctd_pressure'
                                    except KeyError:
                                        pass

                                # reject NaNs
                                p_nonan = pressure.values[~np.isnan(pressure.values)]

                                # reject fill values
                                p_nonan_nofv = p_nonan[p_nonan != pressure._FillValue]

                                # reject data outside of global ranges
                                [pg_min, pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if (len(p_nonan_nofv_gr) > 0) and (num_dims == 1):
                                    [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(
                                        p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                elif (len(p_nonan_nofv_gr) > 0) and (num_dims > 1):
                                    print('variable has more than 1 dimension')
                                    press_outliers = 'not calculated: variable has more than 1 dimension'
                                    pressure_mean = round(np.nanmean(p_nonan_nofv_gr), 2)
                                    pressure_max = round(np.nanmax(p_nonan_nofv_gr), 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None

                                if len(pressure) > 0 and len(p_nonan) == 0:
                                    notes.append('Pressure variable all NaNs')
                                elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
                                    notes.append('Pressure variable all fill values')
                                elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 \
                                        and len(p_nonan_nofv_gr) == 0:
                                    notes.append('Pressure variable outside of global ranges')
                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(ds[press].values.tolist()[0], 2)
                                pressure_max = round(ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure.units
                            except AttributeError:
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                if ('WFP' in node) or ('MOAS' in subsite) or ('SP' in node):
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')
                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from daPa to dbar for pressure comparison')
                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][method]['stream'][data_stream][
                            'file'].keys()
                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                fname] = OrderedDict(
                                file_downloaded=pd.to_datetime(splitter[0][0:15]).strftime('%Y-%m-%dT%H:%M:%S'),
                                file_coordinates=list(ds.coords.keys()),
                                sampling_rate_seconds=sampling_rt_sec,
                                sampling_rate_details=rates,
                                data_start=data_start,
                                data_stop=data_stop,
                                time_gaps=gap_list,
                                unique_timestamps=time_test,
                                n_timestamps=len_time,
                                n_days=n_days,
                                notes=notes,
                                ascending_timestamps=time_ascending,
                                pressure_comparison=dict(pressure_mean=pressure_mean, units=pressure_units,
                                                         num_outliers=press_outliers, diff=pressure_diff,
                                                         pressure_max=pressure_max, variable=press,
                                                         pressure_compare=pressure_compare),
                                vars_in_file=ds_variables,
                                vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
                                vars_not_in_db=unmatch2,
                                sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars:
                            if sv != 't_max':  # for ADCP
                                if sv != 'wavss_a_buoymotion_time':
                                    print(sv)
                                    try:
                                        var = ds[sv]
                                        vD = var.values
                                        if 'timedelta' not in str(var.values.dtype):
                                            # for OPTAA wavelengths: when multiple files are opened with
                                            # xr.open_mfdataset, xarray forces all variables to have the same number
                                            # of dimensions. wavelength_a and wavelength_c have 1 dimension in the
                                            # individual files, so force the analysis to treat them as 1-dimensional
                                            # (when there are multiple files for 1 deployment)
                                            if sv == 'wavelength_a' or sv == 'wavelength_c':
                                                [g_min, g_max] = cf.get_global_ranges(r, sv)
                                                vnum_dims = len(var.dims)
                                                if vnum_dims == 1:
                                                    n_all = len(var)
                                                    mean = list(vD)
                                                else:
                                                    vnum_dims = 1
                                                    n_all = len(vD[0])
                                                    mean = list(vD[0])
                                                num_outliers = None
                                                vmin = None
                                                vmax = None
                                                sd = None
                                                n_stats = 'not calculated'
                                                var_units = var.units
                                                n_nan = None
                                                n_fv = None
                                                n_grange = 'no global ranges'
                                                fv = var._FillValue
                                            else:
                                                vnum_dims = len(var.dims)
                                                if vnum_dims > 2:
                                                    print('variable has more than 2 dimensions')
                                                    num_outliers = None
                                                    mean = None
                                                    vmin = None
                                                    vmax = None
                                                    sd = None
                                                    n_stats = 'variable has more than 2 dimensions'
                                                    var_units = var.units
                                                    n_nan = None
                                                    n_fv = None
                                                    n_grange = None
                                                    fv = None
                                                    n_all = None
                                                else:
                                                    if vnum_dims > 1:
                                                        n_all = [len(vD), len(vD.flatten())]
                                                    else:
                                                        n_all = len(vD)
                                                    n_nan = int(np.sum(np.isnan(vD)))
                                                    fv = var._FillValue
                                                    var_nofv = var.where(var != fv)
                                                    n_fv = int(np.sum(np.isnan(var_nofv.values))) - n_nan
                                                    try:
                                                        var_units = var.units
                                                    except AttributeError:
                                                        var_units = 'no_units'
                                                    [g_min, g_max] = cf.get_global_ranges(r, sv)
                                                    if list(np.unique(np.isnan(var_nofv))) != [True]:
                                                        # reject data outside of global ranges
                                                        if g_min is not None and g_max is not None:
                                                            var_gr = var_nofv.where(
                                                                (var_nofv >= g_min) & (var_nofv <= g_max))
                                                            n_grange = int(np.sum(np.isnan(var_gr)) - n_fv - n_nan)
                                                        else:
                                                            n_grange = 'no global ranges'
                                                            var_gr = var_nofv

                                                        if list(np.unique(np.isnan(var_gr))) != [True]:
                                                            if sv == 'spkir_abj_cspp_downwelling_vector':
                                                                # don't remove outliers from dataset
                                                                [num_outliers, mean, vmin, vmax, sd, n_stats
                                                                 ] = cf.variable_statistics_spkir(var_gr)
                                                            else:
                                                                if vnum_dims > 1:
                                                                    var_gr = var_gr.values.flatten()
                                                                # drop NaNs before calculating stats
                                                                var_gr = var_gr[~np.isnan(var_gr)]
                                                                [num_outliers, mean, vmin, vmax, sd, n_stats
                                                                 ] = cf.variable_statistics(var_gr, 5)
                                                        else:
                                                            num_outliers = None
                                                            mean = None
                                                            vmin = None
                                                            vmax = None
                                                            sd = None
                                                            n_stats = 0
                                                            n_grange = None
                                                    else:
                                                        num_outliers = None
                                                        mean = None
                                                        vmin = None
                                                        vmax = None
                                                        sd = None
                                                        n_stats = 0
                                                        n_grange = None
                                    except KeyError:
                                        if sv == 'int_ctd_pressure':
                                            continue
                                        num_outliers = None
                                        mean = None
                                        vmin = None
                                        vmax = None
                                        sd = None
                                        n_stats = 'variable not found in file'
                                        var_units = None
                                        n_nan = None
                                        n_fv = None
                                        fv = None
                                        n_grange = None
                                        n_all = None
                                        # defaults so the record below can still be written
                                        g_min = None
                                        g_max = None
                                        vnum_dims = 1

                                    if vnum_dims > 1:
                                        sv = '{} (dims: {})'.format(sv, list(var.dims))

                                    if n_stats == 'variable not found in file' or 'timedelta' not in str(vD.dtype):
                                        data['deployments'][deployment]['method'][method]['stream'][data_stream][
                                            'file'][fname]['sci_var_stats'][sv] = dict(
                                            n_outliers=num_outliers, mean=mean, min=vmin, max=vmax, stdev=sd,
                                            n_stats=n_stats, units=var_units, n_nans=n_nan, n_fillvalues=n_fv,
                                            fill_value=str(fv), global_ranges=[g_min, g_max], n_grange=n_grange,
                                            n_all=n_all)

        sfile = os.path.join(save_dir, '{}-file_analysis.json'.format(r))
        with open(sfile, 'w') as outfile:
            json.dump(data, outfile)

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

        json_file_list.append(str(sfile))

    return json_file_list
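# Self-contained sketch of the sampling-rate test above: tabulate the deltas
# between consecutive timestamps, keep any delta covering >10% of the record,
# and report a delta as "the" rate only if it covers >50%. The function name is
# hypothetical; .dt.total_seconds() is used as a version-robust equivalent of
# the astype('timedelta64[s]') idiom in the block above.
def common_sampling_rate(times):
    df = pd.DataFrame({'time': pd.to_datetime(times)})
    df['diff'] = df['time'].diff().dt.total_seconds()
    counts = df.groupby('diff')['time'].count()
    pct = counts / float(len(df) - 1)
    rates = {int(k): '{:.2%}'.format(v) for k, v in pct.items() if v > 0.1}
    winner = next((k for k, v in rates.items() if float(v.strip('%')) > 50.0), None)
    return winner, rates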
def main(sDir, url_list, deployment_num):
    reviewlist = pd.read_csv(
        'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv')

    rd_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        if rd not in rd_list:
            rd_list.append(rd)

    json_file_list = []
    for r in rd_list:
        dependencies = []
        print('\n{}'.format(r))
        data = OrderedDict(deployments=OrderedDict())
        save_dir = os.path.join(sDir, r.split('-')[0], r)
        cf.create_dir(save_dir)

        # Deployment location test
        deploy_loc_test = cf.deploy_location_check(r)
        data['location_comparison'] = deploy_loc_test

        for u in url_list:
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))

            # complete the analysis by reference designator
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])

                # check for the OOI 1.0 datasets for review
                rl_filtered = reviewlist.loc[
                    (reviewlist['Reference Designator'] == r) & (reviewlist['status'] == 'for review')]
                review_deployments = rl_filtered['deploymentNumber'].tolist()
                review_deployments_int = ['deployment%04d' % int(x) for x in review_deployments]

                for rev_dep in review_deployments_int:
                    if deployment_num is not None:
                        if int(rev_dep[-4:]) != deployment_num:
                            print('\nskipping {}'.format(rev_dep))
                            continue

                    rdatasets = [s for s in udatasets if rev_dep in s]
                    rdatasets.sort()
                    if len(rdatasets) > 0:
                        datasets = []
                        for dss in rdatasets:  # filter out collocated data files
                            if catalog_rms == dss.split('/')[-1].split('_20')[0][15:]:
                                datasets.append(dss)
                            else:
                                drd = dss.split('/')[-1].split('_20')[0][15:42]
                                if drd not in dependencies and drd != r:
                                    dependencies.append(drd)

                        notes = []
                        time_ascending = ''
                        sci_vars_dict = {}

                        for i in range(len(datasets)):
                            ds = xr.open_dataset(datasets[i], mask_and_scale=False)
                            ds = ds.swap_dims({'obs': 'time'})
                            print('\nAppending data from {}: file {} of {}'.format(rev_dep, i + 1, len(datasets)))

                            # when opening multiple datasets, don't check that the timestamps are in ascending order
                            time_ascending = 'not_tested'

                            if i == 0:
                                fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
                                fname = fname.split('_20')[0]

                                # Get info from the data review database
                                dr_data = cf.refdes_datareview_json(refdes)
                                stream_vars = cf.return_stream_vars(data_stream)
                                sci_vars = cf.return_science_vars(data_stream)
                                node = refdes.split('-')[1]
                                if 'cspp' in data_stream or 'WFP' in node:
                                    sci_vars.append('int_ctd_pressure')

                                # Add pressure to the list of science variables
                                press = pf.pressure_var(ds, list(ds.coords.keys()))
                                if press is None:
                                    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
                                if press is not None:
                                    sci_vars.append(press)
                                sci_vars.append('time')
                                sci_vars = list(np.unique(sci_vars))
                                if 'ADCP' in r:
                                    sci_vars = [x for x in sci_vars if 'beam' not in x]

                                for sci_var in sci_vars:
                                    if sci_var == 'time':
                                        sci_vars_dict.update(
                                            {sci_var: dict(values=np.array([], dtype=np.datetime64), units=[], fv=[])})
                                    else:
                                        sci_vars_dict.update({sci_var: dict(values=np.array([]), units=[], fv=[])})

                                deploy_info = get_deployment_information(dr_data, int(deployment[-4:]))

                                # Grab deployment variables
                                deploy_start = str(deploy_info['start_date'])
                                deploy_stop = str(deploy_info['stop_date'])
                                deploy_lon = deploy_info['longitude']
                                deploy_lat = deploy_info['latitude']
                                deploy_depth = deploy_info['deployment_depth']

                                # Calculate days deployed
                                if deploy_stop != 'None':
                                    r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
                                    if deploy_stop.split('T')[1] == '00:00:00':
                                        r_deploy_stop = pd.to_datetime(deploy_stop)
                                    else:
                                        r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(
                                            hour=0, minute=0, second=0)
                                    n_days_deployed = (r_deploy_stop - r_deploy_start).days
                                else:
                                    n_days_deployed = None

                                # Add reference designator to dictionary
                                try:
                                    data['refdes']
                                except KeyError:
                                    data['refdes'] = refdes

                            # append data for the deployment into a dictionary
                            for s_v in sci_vars_dict.keys():
                                vv = ds[s_v]
                                try:
                                    if vv.units not in sci_vars_dict[s_v]['units']:
                                        sci_vars_dict[s_v]['units'].append(vv.units)
                                except AttributeError:
                                    pass
                                try:
                                    if vv._FillValue not in sci_vars_dict[s_v]['fv']:
                                        sci_vars_dict[s_v]['fv'].append(vv._FillValue)
                                except AttributeError:
                                    pass

                                if len(vv.dims) == 1:
                                    if s_v in ['wavelength_a', 'wavelength_c']:
                                        # for these two OPTAA variables, append only if the array differs from the
                                        # one already appended; if it's already there, don't append
                                        if np.sum(vv.values == sci_vars_dict[s_v]['values']) != len(vv.values):
                                            sci_vars_dict[s_v]['values'] = np.append(
                                                sci_vars_dict[s_v]['values'], vv.values)
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.append(
                                            sci_vars_dict[s_v]['values'], vv.values)
                                elif len(vv.dims) == 2:  # appending 2D datasets
                                    vD = vv.values.T
                                    if len(sci_vars_dict[s_v]['values']) == 0:
                                        sci_vars_dict[s_v]['values'] = vD
                                    else:
                                        sci_vars_dict[s_v]['values'] = np.concatenate(
                                            (sci_vars_dict[s_v]['values'], vD), axis=1)

                        deployments = data['deployments'].keys()
                        data_start = pd.to_datetime(min(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')
                        data_stop = pd.to_datetime(max(sci_vars_dict['time']['values'])).strftime('%Y-%m-%dT%H:%M:%S')

                        # Add deployment and info to dictionary and initialize delivery method sub-dictionary
                        if deployment not in deployments:
                            data['deployments'][deployment] = OrderedDict(
                                deploy_start=deploy_start, deploy_stop=deploy_stop,
                                n_days_deployed=n_days_deployed, lon=deploy_lon, lat=deploy_lat,
                                deploy_depth=deploy_depth, method=OrderedDict())

                        # Add delivery methods to dictionary and initialize stream sub-dictionary
                        methods = data['deployments'][deployment]['method'].keys()
                        if method not in methods:
                            data['deployments'][deployment]['method'][method] = OrderedDict(stream=OrderedDict())

                        # Add streams to dictionary and initialize file sub-dictionary
                        streams = data['deployments'][deployment]['method'][method]['stream'].keys()
                        if data_stream not in streams:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream] = OrderedDict(
                                file=OrderedDict())

                        # Get a list of data gaps >1 day
                        time_df = pd.DataFrame(sci_vars_dict['time']['values'], columns=['time'])
                        time_df = time_df.sort_values(by=['time'])
                        gap_list = cf.timestamp_gap_test(time_df)

                        # Calculate the sampling rate to the nearest second
                        time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
                        rates_df = time_df.groupby(['diff']).agg(['count'])
                        n_diff_calc = len(time_df) - 1
                        rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
                        for i, row in rates_df.iterrows():
                            percent = float(row['time']['count']) / float(n_diff_calc)
                            if percent > 0.1:
                                rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})

                        sampling_rt_sec = None
                        for k, v in rates['common_sampling_rates'].items():
                            if float(v.strip('%')) > 50.00:
                                sampling_rt_sec = k
                        if not sampling_rt_sec:
                            sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

                        # timestamp uniqueness is not tested when multiple files are combined
                        time_test = ''

                        # Count the number of days for which there is at least 1 timestamp
                        n_days = len(np.unique(sci_vars_dict['time']['values'].astype('datetime64[D]')))

                        # Compare variables in file to variables in Data Review Database
                        ds_variables = list(ds.data_vars.keys()) + list(ds.coords.keys())
                        ds_variables = eliminate_common_variables(ds_variables)
                        ds_variables = [x for x in ds_variables if 'qc' not in x]
                        [_, unmatch1] = compare_lists(stream_vars, ds_variables)
                        [_, unmatch2] = compare_lists(ds_variables, stream_vars)

                        # calculate mean pressure from data, excluding outliers +/- 3 SD
                        try:
                            pressure = sci_vars_dict[press]
                            if len(pressure) > 1:
                                # reject NaNs
                                p_nonan = pressure['values'][~np.isnan(pressure['values'])]

                                # reject fill values
                                p_nonan_nofv = p_nonan[p_nonan != pressure['fv'][0]]

                                # reject data outside of global ranges
                                [pg_min, pg_max] = cf.get_global_ranges(r, press)
                                if pg_min is not None and pg_max is not None:
                                    pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
                                    p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
                                else:
                                    p_nonan_nofv_gr = p_nonan_nofv

                                if len(p_nonan_nofv_gr) > 0:
                                    [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(
                                        p_nonan_nofv_gr, 3)
                                    pressure_mean = round(pressure_mean, 2)
                                    pressure_max = round(pressure_max, 2)
                                else:
                                    press_outliers = None
                                    pressure_mean = None
                                    pressure_max = None

                                if len(pressure) > 0 and len(p_nonan) == 0:
                                    notes.append('Pressure variable all NaNs')
                                elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
                                    notes.append('Pressure variable all fill values')
                                elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 \
                                        and len(p_nonan_nofv_gr) == 0:
                                    notes.append('Pressure variable outside of global ranges')
                            else:  # if there is only 1 data point
                                press_outliers = 0
                                pressure_mean = round(ds[press].values.tolist()[0], 2)
                                pressure_max = round(ds[press].values.tolist()[0], 2)

                            try:
                                pressure_units = pressure['units'][0]
                            except AttributeError:
                                pressure_units = 'no units attribute for pressure'

                            if pressure_mean:
                                if 'SF' in node:
                                    pressure_compare = int(round(pressure_max))
                                else:
                                    pressure_compare = int(round(pressure_mean))

                                if pressure_units == '0.001 dbar':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')
                                elif pressure_units == 'daPa':
                                    pressure_max = round((pressure_max / 1000), 2)
                                    pressure_mean = round((pressure_mean / 1000), 2)
                                    pressure_compare = round((pressure_compare / 1000), 2)
                                    notes.append('Pressure converted from daPa to dbar for pressure comparison')
                            else:
                                pressure_compare = None

                            if (not deploy_depth) or (not pressure_mean):
                                pressure_diff = None
                            else:
                                pressure_diff = pressure_compare - deploy_depth

                        except KeyError:
                            press = 'no seawater pressure in file'
                            pressure_diff = None
                            pressure_mean = None
                            pressure_max = None
                            pressure_compare = None
                            press_outliers = None
                            pressure_units = None

                        # Add files and info to dictionary
                        filenames = data['deployments'][deployment]['method'][method]['stream'][data_stream][
                            'file'].keys()
                        if fname not in filenames:
                            data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                fname] = OrderedDict(
                                file_downloaded=pd.to_datetime(splitter[0][0:15]).strftime('%Y-%m-%dT%H:%M:%S'),
                                file_coordinates=list(ds.coords.keys()),
                                sampling_rate_seconds=sampling_rt_sec,
                                sampling_rate_details=rates,
                                data_start=data_start,
                                data_stop=data_stop,
                                time_gaps=gap_list,
                                unique_timestamps=time_test,
                                n_timestamps=len(sci_vars_dict['time']['values']),
                                n_days=n_days,
                                notes=notes,
                                ascending_timestamps=time_ascending,
                                pressure_comparison=dict(pressure_mean=pressure_mean, units=pressure_units,
                                                         num_outliers=press_outliers, diff=pressure_diff,
                                                         pressure_max=pressure_max, variable=press,
                                                         pressure_compare=pressure_compare),
                                vars_in_file=ds_variables,
                                vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
                                vars_not_in_db=unmatch2,
                                sci_var_stats=OrderedDict())

                        # calculate statistics for science variables, excluding outliers +/- 5 SD
                        for sv in sci_vars_dict.keys():
                            if sv != 't_max':  # for ADCP
                                if sv != 'time':
                                    print(sv)
                                    var = sci_vars_dict[sv]
                                    vD = var['values']
                                    var_units = var['units']
                                    vnum_dims = len(np.shape(vD))

                                    # for OPTAA wavelengths, record the array itself
                                    if sv == 'wavelength_a' or sv == 'wavelength_c':
                                        [g_min, g_max] = cf.get_global_ranges(r, sv)
                                        n_all = len(var)
                                        mean = list(vD)
                                        num_outliers = None
                                        vmin = None
                                        vmax = None
                                        sd = None
                                        n_stats = 'not calculated'
                                        n_nan = None
                                        n_fv = None
                                        n_grange = 'no global ranges'
                                        fv = var['fv'][0]
                                    else:
                                        if vnum_dims > 2:
                                            print('variable has more than 2 dimensions')
                                            num_outliers = None
                                            mean = None
                                            vmin = None
                                            vmax = None
                                            sd = None
                                            n_stats = 'variable has more than 2 dimensions'
                                            n_nan = None
                                            n_fv = None
                                            n_grange = None
                                            fv = None
                                            n_all = None
                                        else:
                                            if vnum_dims > 1:
                                                n_all = [len(vD), len(vD.flatten())]
                                            else:
                                                n_all = len(vD)
                                            n_nan = int(np.sum(np.isnan(vD)))
                                            fv = var['fv'][0]
                                            vD[vD == fv] = np.nan  # turn fill values to NaNs
                                            n_fv = int(np.sum(np.isnan(vD))) - n_nan
                                            [g_min, g_max] = cf.get_global_ranges(r, sv)
                                            if list(np.unique(np.isnan(vD))) != [True]:
                                                # reject data outside of global ranges
                                                if g_min is not None and g_max is not None:
                                                    # turn data outside of global ranges to NaNs
                                                    vD[vD < g_min] = np.nan
                                                    vD[vD > g_max] = np.nan
                                                    n_grange = int(np.sum(np.isnan(vD)) - n_fv - n_nan)
                                                else:
                                                    n_grange = 'no global ranges'

                                                if list(np.unique(np.isnan(vD))) != [True]:
                                                    if sv == 'spkir_abj_cspp_downwelling_vector':
                                                        # don't remove outliers from dataset
                                                        [num_outliers, mean, vmin, vmax, sd, n_stats
                                                         ] = cf.variable_statistics_spkir(vD)
                                                    else:
                                                        if vnum_dims > 1:
                                                            var_gr = vD.flatten()
                                                        else:
                                                            var_gr = vD
                                                        # drop NaNs before calculating stats
                                                        var_gr = var_gr[~np.isnan(var_gr)]
                                                        [num_outliers, mean, vmin, vmax, sd, n_stats
                                                         ] = cf.variable_statistics(var_gr, 5)
                                                else:
                                                    num_outliers = None
                                                    mean = None
                                                    vmin = None
                                                    vmax = None
                                                    sd = None
                                                    n_stats = 0
                                                    n_grange = None
                                            else:
                                                num_outliers = None
                                                mean = None
                                                vmin = None
                                                vmax = None
                                                sd = None
                                                n_stats = 0
                                                n_grange = None

                                    if vnum_dims > 1:
                                        sv = '{} (dims: {})'.format(sv, list(np.shape(var['values'])))

                                    data['deployments'][deployment]['method'][method]['stream'][data_stream]['file'][
                                        fname]['sci_var_stats'][sv] = dict(
                                        n_outliers=num_outliers, mean=mean, min=vmin, max=vmax, stdev=sd,
                                        n_stats=n_stats, units=var_units, n_nans=n_nan, n_fillvalues=n_fv,
                                        fill_value=str(fv), global_ranges=[g_min, g_max], n_grange=n_grange,
                                        n_all=n_all)

                        sfile = os.path.join(save_dir, '{}-{}-file_analysis.json'.format(rev_dep, r))
                        with open(sfile, 'w') as outfile:
                            json.dump(data, outfile)
                        json_file_list.append(str(sfile))

        depfile = os.path.join(save_dir, '{}-dependencies.txt'.format(r))
        with open(depfile, 'w') as depf:
            depf.write(str(dependencies))

    return json_file_list
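# Sketch of the in-place screening used in the statistics loop above: convert
# fill values and out-of-range points to NaN so the NaN count at each stage
# separates missing data, fill values, and global-range failures. The function
# name is hypothetical; bounds and fill value are illustrative inputs.
def mask_fill_and_ranges(vD, fv, g_min, g_max):
    vD = np.asarray(vD, dtype=float).copy()
    n_nan = int(np.sum(np.isnan(vD)))          # NaNs present in the raw data
    vD[vD == fv] = np.nan
    n_fv = int(np.sum(np.isnan(vD))) - n_nan   # additional NaNs = fill values
    if g_min is not None and g_max is not None:
        vD[vD < g_min] = np.nan
        vD[vD > g_max] = np.nan
    n_grange = int(np.sum(np.isnan(vD))) - n_fv - n_nan  # out-of-range points
    return vD, n_nan, n_fv, n_grange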
print('The CTD cast was done {} km from the mooring location'.format(diff_loc))

# define pressure from the data file: surface buoy (SBD) and near-surface
# instrument frame (RID) nodes have no pressure sensor, so nominal depths are used
if 'SBD' in ds.node:
    press = np.empty(np.shape(ds['time']))
    press[:] = 1
elif 'RID' in ds.node:
    press = np.empty(np.shape(ds['time']))
    press[:] = 7
else:
    press = pf.pressure_var(ds, list(ds.data_vars.keys()))
    press = ds[press].values

if 'CTD' in ds.sensor:
    try:
        ctd_cond = np.squeeze(np.array(df['CNDC']))
    except KeyError:
        try:
            ctd_cond = np.squeeze(np.array(df['c1mS/cm'])) / 10
        except KeyError:
            try:
                ctd_cond = np.squeeze(
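# Sketch of the nominal-depth fallback above: assign an assumed fixed pressure
# for every timestamp on nodes without a pressure sensor (1 dbar for SBD,
# 7 dbar for RID, matching the constants above). The helper name is hypothetical.
def nominal_pressure(node, n_times):
    nominal = {'SBD': 1.0, 'RID': 7.0}
    for key, dbar in nominal.items():
        if key in node:
            return np.full(n_times, dbar)
    return None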
def main(url_list, sDir, plot_type):
    """
    url_list: paths to instrument data by delivery method
    sDir: path to the directory on your machine to save files
    plot_type: folder name for a plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate the data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type, ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)

            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                        y = ds[pressure].values
                        if ds[pressure].units not in y_unit:
                            y_unit.append(ds[pressure].units)
                        if ds[pressure].long_name not in y_name:
                            y_name.append(ds[pressure].long_name)
                        sh['pressure'] = np.append(sh['pressure'], y)

            if len(y_unit) != 1:
                print('pressure unit varies!')
            else:
                y_unit = y_unit[0]
            if len(y_name) != 1:
                print('pressure long name varies!')
            else:
                y_name = y_name[0]

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print('\nWorking on variable: {}'.format(sv))
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        x = vinfo['values']
                        y = vinfo['pressure']

                        # Check if the array is all NaNs
                        if sum(np.isnan(x)) == len(x):
                            print('Array of all NaNs - skipping plot.')
                            continue
                        # Check if the array is all fill values
                        elif len(x[x != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue
                        else:
                            # reject fill values
                            fv_ind = x != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            c_nofv = cm.rainbow(np.linspace(0, 1, len(t[fv_ind])))
                            x_nofv = x[fv_ind]
                            print(len(x) - len(x_nofv), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(x_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            c_nofv_nonan = c_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            x_nofv_nonan = x_nofv[nan_ind]
                            print(len(x_nofv) - len(x_nofv_nonan), ' NaNs')

                            # reject extreme values (|1e7|)
                            ev_ind = cf.reject_extreme_values(x_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            c_nofv_nonan_noev = c_nofv_nonan[ev_ind]
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            x_nofv_nonan_noev = x_nofv_nonan[ev_ind]
                            print(len(x_nofv_nonan) - len(x_nofv_nonan_noev), ' extreme values')

                            # reject values outside global ranges
                            global_min, global_max = cf.get_global_ranges(r, sv)
                            print('global ranges for : {}-{} {} - {}'.format(r, sv, global_min, global_max))
                            if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                                gr_ind = cf.reject_global_ranges(x_nofv_nonan_noev, global_min, global_max)
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                                x_nofv_nonan_noev_nogr = x_nofv_nonan_noev[gr_ind]
                            else:
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                                x_nofv_nonan_noev_nogr = x_nofv_nonan_noev

                            if len(x_nofv_nonan_noev) > 0:
                                if m == 'common_stream_placeholder':
                                    sname = '-'.join((r, sv))
                                else:
                                    sname = '-'.join((r, m, sv))

                                if sv != 'pressure':
                                    columns = ['tsec', 'dbar', str(sv)]
                                    bin_size = 10
                                    min_r = int(round(min(y_nofv_nonan_noev) - bin_size))
                                    max_r = int(round(max(y_nofv_nonan_noev) + bin_size))
                                    ranges = list(range(min_r, max_r, bin_size))
                                    groups, d_groups = gt.group_by_depth_range(
                                        t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                        x_nofv_nonan_noev_nogr, columns, ranges)

                                    y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
                                    tm = 1
                                    for ii in range(len(groups)):
                                        nan_ind = d_groups[ii + tm].notnull()
                                        xtime = d_groups[ii + tm][nan_ind]
                                        colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                                        ypres = d_groups[ii + tm + 1][nan_ind]
                                        nval = d_groups[ii + tm + 2][nan_ind]
                                        tm += 2
                                        l_arr.append(len(nval))  # count of data to filter out small groups
                                        y_avg.append(ypres.mean())
                                        n_avg.append(nval.mean())
                                        n_min.append(nval.min())
                                        n_max.append(nval.max())
                                        n_std = 3
                                        n0_std.append(nval.mean() + n_std * nval.std())
                                        n1_std.append(nval.mean() - n_std * nval.std())

                                    # Plot all data
                                    ylabel = y_name + " (" + y_unit + ")"
                                    xlabel = sv + " (" + sv_units + ")"
                                    clabel = 'Time'

                                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                               t_nofv_nonan_noev_nogr, ylabel, xlabel, clabel,
                                                               end_times, deployments, stdev=None)
                                    title_text = ' '.join((r, ms.split('-')[-1])) + '\n' + t0 + ' - ' + t1 + '\n' \
                                                 + str(bin_size) + ' m average and ' + str(n_std) + ' std shown'
                                    ax.set_title(title_text, fontsize=9)
                                    ax.plot(n_avg, y_avg, '-k')
                                    ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
                                    pf.save_fig(save_dir, sname)

                                    # Plot data with outliers removed
                                    fig, ax = pf.plot_profiles(x_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                               t_nofv_nonan_noev_nogr, ylabel, xlabel, clabel,
                                                               end_times, deployments, stdev=5)
                                    ax.set_title(' '.join((r, ms.split('-')[-1])) + '\n' + t0 + ' - ' + t1,
                                                 fontsize=9)
                                    sfile = '_'.join((sname, 'rmoutliers'))
                                    pf.save_fig(save_dir, sfile)
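# Self-contained sketch of the depth-binned envelope drawn above: average a
# variable in fixed-size pressure bins and build a +/- n_std standard-deviation
# band. pandas.cut stands in for the project's gt.group_by_depth_range helper;
# the function name and default bin size are assumptions.
def depth_bin_envelope(pressure, values, bin_size=10, n_std=3):
    df = pd.DataFrame({'dbar': pressure, 'val': values})
    edges = list(range(int(round(np.nanmin(pressure))) - bin_size,
                       int(round(np.nanmax(pressure))) + 2 * bin_size, bin_size))
    binned = df.groupby(pd.cut(df['dbar'], edges))
    y_avg = binned['dbar'].mean()   # mean pressure of each bin
    n_avg = binned['val'].mean()    # mean of the variable in each bin
    n_sd = binned['val'].std()
    return y_avg, n_avg, n_avg + n_std * n_sd, n_avg - n_std * n_sd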
def main(sDir, f, start_time, end_time):
    ff = pd.read_csv(os.path.join(sDir, f))
    url_list = ff['outputUrl'].tolist()
    for i, u in enumerate(url_list):
        print('\nUrl {} of {}: {}'.format(i + 1, len(url_list), u))
        main_sensor = u.split('/')[-2].split('-')[4]
        datasets = cf.get_nc_urls([u])
        datasets_sel = cf.filter_collocated_instruments(main_sensor, datasets)
        for ii, d in enumerate(datasets_sel):
            print('\nDataset {} of {}: {}'.format(ii + 1, len(datasets_sel), d))
            with xr.open_dataset(d, mask_and_scale=False) as ds:
                ds = ds.swap_dims({'obs': 'time'})
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(d)
                ds_vars = ds.data_vars.keys()
                if 'MOAS' in subsite and 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                    pressure = 'sci_water_pressure_dbar'
                else:
                    pressure = pf.pressure_var(ds, ds_vars)
                sci_vars = cf.return_science_vars(stream)
                sci_vars = [s for s in sci_vars if s not in [pressure]]  # remove pressure from sci_vars

                save_dir = os.path.join(sDir, subsite, refdes, 'xsection_plots', deployment)
                cf.create_dir(save_dir)

                t = ds['time'].values
                t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')
                title = ' '.join((deployment, refdes, method))
                y = ds[pressure]

                print('Plotting variables...')
                for var in sci_vars:
                    print(var)
                    z = ds[var]

                    # Plot all data
                    clabel = var + " (" + z.units + ")"
                    ylabel = pressure + " (" + y.units + ")"
                    fig, ax = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, stdev=None)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '_'.join((fname[0:-46], z.name))
                    pf.save_fig(save_dir, sfile)

                    # Plot data with outliers removed
                    fig, ax = pf.plot_xsection(subsite, t, y, z, clabel, ylabel, stdev=5)
                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                    sfile = '_'.join((fname[0:-46], z.name, 'rmoutliers'))
                    pf.save_fig(save_dir, sfile)
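if __name__ == '__main__':
    # Hypothetical invocation of the cross-section script above (sketch only:
    # the directory and summary-CSV name are placeholders for the file written
    # by a data download request, not real paths).
    sDir = '/Users/you/ooi_review'
    f = 'data_request_summary.csv'  # must contain an 'outputUrl' column
    start_time = None               # e.g. dt.datetime(2017, 1, 1)
    end_time = None
    main(sDir, f, start_time, end_time)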
def main(url_list, sDir, plot_type):
    """
    url_list: paths to instrument data by delivery method
    sDir: path to the directory on your machine to save files
    plot_type: folder name for a plot type
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    # separate different instruments
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        # get preferred stream information and read in the analysis file
        ps_df, n_streams = cf.get_preferred_stream_info(r)
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = cf.get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        # separate data files by methods
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a folder to save figures
            save_dir = os.path.join(sDir, array, subsite, r, plot_type, ms.split('-')[0])
            cf.create_dir(save_dir)

            # create a dictionary for science variables from analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print('\nAppending data file: {}'.format(fd.split('/')[-1]))
                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # sci variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)

                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                            else:
                                pressure = 'int_ctd_pressure'
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                        y = ds[pressure].values
                        sh['pressure'] = np.append(sh['pressure'], y)

                        try:
                            ds[pressure].units
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                        except AttributeError:
                            print('pressure attributes missing units')
                            if 'pressure unit missing' not in y_unit:
                                y_unit.append('pressure unit missing')
                        try:
                            ds[pressure].long_name
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        except AttributeError:
                            print('pressure attributes missing long_name')
                            if 'pressure long name missing' not in y_name:
                                y_name.append('pressure long name missing')

            # resolve pressure units and long name for labeling
            if len(y_unit) != 1:
                print('pressure unit varies')
                if 'dbar' in y_unit:
                    y_unit = 'dbar'
                print(y_unit)
            else:
                y_unit = y_unit[0]
            if len(y_name) != 1:
                print('pressure long name varies')
                if 'Seawater Pressure' in y_name:
                    y_name = 'Seawater Pressure'
                print(y_name)
            else:
                y_name = y_name[0]

            # create a folder to save variable statistics
            mDir = '/Users/leila/Documents/NSFEduSupport/github/data-review-tools/data_review/final_stats'
            save_dir_stat = os.path.join(mDir, array, subsite)
            cf.create_dir(save_dir_stat)
            stat_df = pd.DataFrame()

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']
                        title = ' '.join((r, ms))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                            continue
                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                            continue
                        else:
                            # reject fill values
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            print(len(z) - len(z_nofv), ' fill values')

                            # reject NaNs
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - len(z_nofv_nonan), ' NaNs')

                            # reject extreme values (|1e7|)
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(len(z_nofv_nonan) - len(z_nofv_nonan_noev), ' extreme values')

                            # reject values outside global ranges
                            global_min, global_max = cf.get_global_ranges(r, sv)
                            print('global ranges for : {}-{} {} - {}'.format(r, sv, global_min, global_max))
                            if isinstance(global_min, (int, float)) and isinstance(global_max, (int, float)):
                                gr_ind = cf.reject_global_ranges(z_nofv_nonan_noev, global_min, global_max)
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev[gr_ind]
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev[gr_ind]
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev[gr_ind]
                            else:
                                t_nofv_nonan_noev_nogr = t_nofv_nonan_noev
                                y_nofv_nonan_noev_nogr = y_nofv_nonan_noev
                                z_nofv_nonan_noev_nogr = z_nofv_nonan_noev

                            if len(z_nofv_nonan_noev) > 0:
                                if m == 'common_stream_placeholder':
                                    sname = '-'.join((r, sv))
                                else:
                                    sname = '-'.join((r, m, sv))
                                sname = '_'.join((sname, sv_units))

                                # Plot all data
                                clabel = sv + " (" + sv_units + ")"
                                ylabel = y_name + " (" + y_unit + ")"

                                fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev, y_nofv_nonan_noev,
                                                           z_nofv_nonan_noev, clabel, ylabel, stdev=None)
                                ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                pf.save_fig(save_dir, sname)

                                # Plot data with outliers removed
                                fig, ax = pf.plot_xsection(subsite, t_nofv_nonan_noev_nogr, y_nofv_nonan_noev_nogr,
                                                           z_nofv_nonan_noev_nogr, clabel, ylabel, stdev=5)
                                ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                sfile = '_'.join((sname, 'rmoutliers'))
                                pf.save_fig(save_dir, sfile)

                                # plot data with reviewer-annotated exclusion time ranges removed
                                dr = pd.read_csv('https://datareview.marine.rutgers.edu/notes/export')
                                drn = dr.loc[dr.type == 'exclusion']
                                if len(drn) != 0:
                                    subsite_node = '-'.join((subsite, r.split('-')[1]))
                                    drne = drn.loc[drn.reference_designator.isin([subsite, subsite_node, r])]
                                    t_ex = t_nofv_nonan_noev_nogr
                                    y_ex = y_nofv_nonan_noev_nogr
                                    z_ex = z_nofv_nonan_noev_nogr
                                    for i, row in drne.iterrows():
                                        sdate = cf.format_dates(row.start_date)
                                        edate = cf.format_dates(row.end_date)
                                        ts = np.datetime64(sdate)
                                        te = np.datetime64(edate)
                                        ind = np.where((t_ex < ts) | (t_ex > te), True, False)
                                        if len(ind) != 0:
                                            t_ex = t_ex[ind]
                                            z_ex = z_ex[ind]
                                            y_ex = y_ex[ind]

                                    fig, ax = pf.plot_xsection(subsite, t_ex, y_ex, z_ex, clabel, ylabel, stdev=None)
                                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                    sfile = '_'.join((sname, 'rmsuspectdata'))
                                    pf.save_fig(save_dir, sfile)
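# Sketch of the annotation-based exclusion above: keep only samples that fall
# outside a reviewer-annotated [start, end] window. The function name and the
# date strings callers would pass are illustrative.
def drop_excluded_range(t, y, z, start, end):
    ts, te = np.datetime64(start), np.datetime64(end)
    keep = (t < ts) | (t > te)   # boolean mask of samples outside the window
    return t[keep], y[keep], z[keep]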
def main(url_list, sDir, plot_type, deployment_num, start_time, end_time):
    """
    url_list: paths to instrument data by method
    sDir: path to the directory on your machine to save files
    plot_type: folder name for a plot type
    deployment_num: deployment number to plot (int), or None for all deployments
    start_time, end_time: optional datetimes bounding the plotted time range
    """
    rd_list = []
    ms_list = []
    for uu in url_list:
        elements = uu.split('/')[-2].split('-')
        rd = '-'.join((elements[1], elements[2], elements[3], elements[4]))
        ms = uu.split(rd + '-')[1].split('/')[0]
        if rd not in rd_list:
            rd_list.append(rd)
        if ms not in ms_list:
            ms_list.append(ms)

    ''' separate different instruments '''
    for r in rd_list:
        print('\n{}'.format(r))
        subsite = r.split('-')[0]
        array = subsite[0:2]
        main_sensor = r.split('-')[-1]

        ps_df, n_streams = cf.get_preferred_stream_info(r)

        # read in the analysis file
        dr_data = cf.refdes_datareview_json(r)

        # get end times of deployments
        deployments = []
        end_times = []
        for index, row in ps_df.iterrows():
            deploy = row['deployment']
            deploy_info = get_deployment_information(dr_data, int(deploy[-4:]))
            deployments.append(int(deploy[-4:]))
            end_times.append(pd.to_datetime(deploy_info['stop_date']))

        # get the list of data files and filter out collocated instruments and other streams
        datasets = []
        for u in url_list:
            print(u)
            splitter = u.split('/')[-2].split('-')
            rd_check = '-'.join((splitter[1], splitter[2], splitter[3], splitter[4]))
            if rd_check == r:
                udatasets = cf.get_nc_urls([u])
                datasets.append(udatasets)
        datasets = list(itertools.chain(*datasets))
        fdatasets = cf.filter_collocated_instruments(main_sensor, datasets)
        fdatasets = cf.filter_other_streams(r, ms_list, fdatasets)

        ''' separate the data files by methods '''
        for ms in ms_list:
            fdatasets_sel = [x for x in fdatasets if ms in x]

            # create a dictionary of science variables from the analysis file
            stream_sci_vars_dict = dict()
            for x in dr_data['instrument']['data_streams']:
                dr_ms = '-'.join((x['method'], x['stream_name']))
                if ms == dr_ms:
                    stream_sci_vars_dict[dr_ms] = dict(vars=dict())
                    sci_vars = dict()
                    for y in x['stream']['parameters']:
                        if y['data_product_type'] == 'Science Data':
                            sci_vars.update({y['name']: dict(db_units=y['unit'])})
                    if len(sci_vars) > 0:
                        stream_sci_vars_dict[dr_ms]['vars'] = sci_vars

            # initialize an empty data array for science variables in dictionary
            sci_vars_dict = cd.initialize_empty_arrays(stream_sci_vars_dict, ms)
            print('\nAppending data from files: {}'.format(ms))
            y_unit = []
            y_name = []
            for fd in fdatasets_sel:
                ds = xr.open_dataset(fd, mask_and_scale=False)
                print(fd)
                if start_time is not None and end_time is not None:
                    ds = ds.sel(time=slice(start_time, end_time))
                    if len(ds['time'].values) == 0:
                        print('No data to plot for specified time range: ({} to {})'.format(start_time, end_time))
                        continue

                fname, subsite, refdes, method, stream, deployment = cf.nc_attributes(fd)

                if deployment_num is not None:
                    # deployment strings look like 'deployment0001'
                    if int(deployment[-4:]) != deployment_num:
                        continue

                save_dir = os.path.join(sDir, array, subsite, refdes, plot_type, ms.split('-')[0], deployment)
                cf.create_dir(save_dir)

                for var in list(sci_vars_dict[ms]['vars'].keys()):
                    sh = sci_vars_dict[ms]['vars'][var]
                    if ds[var].units == sh['db_units']:
                        if ds[var]._FillValue not in sh['fv']:
                            sh['fv'].append(ds[var]._FillValue)
                        if ds[var].units not in sh['units']:
                            sh['units'].append(ds[var].units)

                        # time
                        t = ds['time'].values
                        t0 = pd.to_datetime(t.min()).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(t.max()).strftime('%Y-%m-%dT%H:%M:%S')

                        # science variable
                        z = ds[var].values
                        sh['t'] = np.append(sh['t'], t)
                        sh['values'] = np.append(sh['values'], z)
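                        # Note: np.append copies the accumulated array on every call. An
                        # equivalent list-based pattern (sketch; 'chunks' is illustrative,
                        # not part of this script) concatenates once after the file loop:
                        #   chunks.append(ds[var].values)            # inside the loop
                        #   sh['values'] = np.concatenate(chunks)    # once, afterwards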
                        # add pressure to dictionary of sci vars
                        if 'MOAS' in subsite:
                            if 'CTD' in main_sensor:  # for glider CTDs, pressure is a coordinate
                                pressure = 'sci_water_pressure_dbar'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                            else:
                                pressure = 'int_ctd_pressure'
                                y = ds[pressure].values
                                if ds[pressure].units not in y_unit:
                                    y_unit.append(ds[pressure].units)
                                if ds[pressure].long_name not in y_name:
                                    y_name.append(ds[pressure].long_name)
                        else:
                            pressure = pf.pressure_var(ds, ds.data_vars.keys())
                            y = ds[pressure].values
                            if ds[pressure].units not in y_unit:
                                y_unit.append(ds[pressure].units)
                            if ds[pressure].long_name not in y_name:
                                y_name.append(ds[pressure].long_name)
                        sh['pressure'] = np.append(sh['pressure'], y)

            if len(y_unit) != 1:
                print('pressure unit varies')
            else:
                y_unit = y_unit[0]
            if len(y_name) != 1:
                print('pressure long name varies')
            else:
                y_name = y_name[0]

            for m, n in sci_vars_dict.items():
                for sv, vinfo in n['vars'].items():
                    print(sv)
                    if len(vinfo['t']) < 1:
                        print('no variable data to plot')
                    else:
                        sv_units = vinfo['units'][0]
                        fv = vinfo['fv'][0]
                        t0 = pd.to_datetime(min(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t1 = pd.to_datetime(max(vinfo['t'])).strftime('%Y-%m-%dT%H:%M:%S')
                        t = vinfo['t']
                        z = vinfo['values']
                        y = vinfo['pressure']
                        title = ' '.join((r, ms.split('-')[1]))

                        # Check if the array is all NaNs
                        if sum(np.isnan(z)) == len(z):
                            print('Array of all NaNs - skipping plot.')
                        # Check if the array is all fill values
                        elif len(z[z != fv]) == 0:
                            print('Array of all fill values - skipping plot.')
                        else:
                            # reject fill values (np.sum counts the retained points)
                            fv_ind = z != fv
                            y_nofv = y[fv_ind]
                            t_nofv = t[fv_ind]
                            z_nofv = z[fv_ind]
                            print(len(z) - np.sum(fv_ind), ' fill values')

                            # reject NaNs (index against the fill-value-filtered array)
                            nan_ind = ~np.isnan(z_nofv)
                            t_nofv_nonan = t_nofv[nan_ind]
                            y_nofv_nonan = y_nofv[nan_ind]
                            z_nofv_nonan = z_nofv[nan_ind]
                            print(len(z_nofv) - np.sum(nan_ind), ' NaNs')

                            # reject extreme values
                            ev_ind = cf.reject_extreme_values(z_nofv_nonan)
                            t_nofv_nonan_noev = t_nofv_nonan[ev_ind]
                            colors = cm.rainbow(np.linspace(0, 1, len(t_nofv_nonan_noev)))
                            y_nofv_nonan_noev = y_nofv_nonan[ev_ind]
                            z_nofv_nonan_noev = z_nofv_nonan[ev_ind]
                            print(len(z_nofv_nonan) - np.sum(ev_ind), ' extreme values', '|1e7|')

                            if len(y_nofv_nonan_noev) > 0:
                                if m == 'common_stream_placeholder':
                                    sname = '-'.join((r, sv))
                                else:
                                    sname = '-'.join((r, m, sv))

                                # Plot all data
                                ylabel = y_name + " (" + y_unit + ")"
                                xlabel = sv + " (" + sv_units + ")"
                                clabel = sv + " (" + sv_units + ")"

                                fig, ax = pf.plot_profiles(z_nofv_nonan_noev, y_nofv_nonan_noev, colors,
                                                           xlabel, ylabel, stdev=None)
                                ax.set_title((title + '\n' + str(deployment_num) + ': ' + t0 + ' - ' + t1 + '\n'
                                              + 'used bin = 2 dbar to calculate an average profile (black line) '
                                              + 'and 3-STD envelope (shaded area)'), fontsize=9)

                                # group by depth range
                                columns = ['time', 'pressure', str(sv)]
                                # ranges = [0, 50, 100, 200, 400, 600]
                                ranges = list(range(int(round(min(y_nofv_nonan_noev))),
                                                    int(round(max(y_nofv_nonan_noev))), 1))
                                groups, d_groups = gt.group_by_depth_range(t_nofv_nonan_noev, y_nofv_nonan_noev,
                                                                           z_nofv_nonan_noev, columns, ranges)

                                # describe_file = '_'.join((sname, 'statistics.csv'))
                                # groups.describe().to_csv(save_dir + '/' + describe_file)
                                ind = groups.describe()[sv]['mean'].notnull()
                                groups.describe()[sv][ind].to_csv('{}/{}_statistics.csv'.format(save_dir, sname),
                                                                  index=True)
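                                # Sketch of the statistics export above, assuming groups behaves
                                # like a pandas GroupBy (standalone, hypothetical data):
                                #   df = pd.DataFrame({'bin': [1, 1, 2], 'val': [10.0, 12.0, np.nan]})
                                #   stats = df.groupby('bin').describe()['val']
                                #   stats[stats['mean'].notnull()].to_csv('val_statistics.csv')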
                                tm = 1
                                fig, ax = pyplot.subplots()  # a single axis holds all depth groups
                                pyplot.margins(y=.08, x=.02)
                                pyplot.grid()
                                y_avg, n_avg, n_min, n_max, n0_std, n1_std, l_arr = [], [], [], [], [], [], []
                                for ii in range(len(groups)):
                                    nan_ind = d_groups[ii + tm].notnull()
                                    xtime = d_groups[ii + tm][nan_ind]
                                    colors = cm.rainbow(np.linspace(0, 1, len(xtime)))
                                    ypres = d_groups[ii + tm + 1][nan_ind]
                                    nval = d_groups[ii + tm + 2][nan_ind]
                                    tm += 2

                                    # fig, ax = pf.plot_xsection(subsite, xtime, ypres, nval, clabel, ylabel, stdev=None)
                                    # ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)
                                    # pf.plot_profiles(nval, ypres, colors, ylabel, clabel, stdev=None)
                                    # ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)

                                    ind2 = cf.reject_outliers(nval, 5)
                                    xD = nval[ind2]
                                    yD = ypres[ind2]
                                    nZ = colors[ind2]
                                    outliers = str(len(nval) - len(xD))
                                    leg_text = ('removed {} outliers (SD=5)'.format(outliers),)

                                    ax.scatter(xD, yD, c=nZ, s=2, edgecolor='None')
                                    ax.set_xlabel(clabel, fontsize=9)
                                    ax.set_ylabel(ylabel, fontsize=9)
                                    ax.legend(leg_text, loc='best', fontsize=6)
                                    ax.set_title((title + '\n' + t0 + ' - ' + t1), fontsize=9)

                                    l_arr.append(len(nval))  # count of data to filter out small groups
                                    y_avg.append(ypres.mean())
                                    n_avg.append(nval.mean())
                                    n_min.append(nval.min())
                                    n_max.append(nval.max())
                                    n0_std.append(nval.mean() + 3 * nval.std())
                                    n1_std.append(nval.mean() - 3 * nval.std())

                                # invert once, after the loop (calling invert_yaxis per group toggles the axis)
                                ax.invert_yaxis()
                                ax.plot(n_avg, y_avg, '-k')
                                # ax.plot(n_min, y_avg, '-b')
                                # ax.plot(n_max, y_avg, '-b')
                                ax.fill_betweenx(y_avg, n0_std, n1_std, color='m', alpha=0.2)
                                sfile = '_'.join((sname, 'statistics'))
                                pf.save_fig(save_dir, sfile)
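# Example invocation (all values hypothetical; the real call site is not shown in this
# excerpt). deployment_num must be an int to match the comparison above:
#   sDir = '/path/to/output'
#   url_list = ['https://opendap.oceanobservatories.org/thredds/catalog/.../catalog.html']
#   main(url_list, sDir, 'profile_plots', 1, None, None)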