def test_generic_masked_bad_min_max_value(self):
    """Exercise generic_masked() against odd valid_min/valid_max/valid_range attrs.

    Covers: a zero-width valid window (everything masked), integer attrs on a
    byte variable that exceed the byte range, string-typed valid_min/valid_max
    attributes, and a float valid_range — none of which should raise.
    """
    _, tpath = tempfile.mkstemp(suffix='.nc', prefix='pocean-test')
    shutil.copy2(self.input_file, tpath)

    # try/finally so the temp copy is removed even when an assertion fails
    try:
        with EnhancedDataset(tpath, 'a') as ncd:
            v = ncd.variables['v_component_wind_true_direction_all_geometries']
            v.valid_min = 0.1
            v.valid_max = 0.1
            r = generic_masked(v[:], attrs=ncd.vatts(v.name))
            rflat = r.flatten()
            # valid_min == valid_max leaves a zero-width valid window,
            # so every value should end up masked
            assert rflat[~rflat.mask].size == 0

            # Create a byte variable with a float valid_min and valid_max
            # to make sure it doesn't error
            b = ncd.createVariable('imabyte', 'b')
            b.valid_min = 0
            b.valid_max = 600  # this is over a byte and thus invalid
            b[:] = 3
            r = generic_masked(b[:], attrs=ncd.vatts(b.name))
            assert np.all(r.mask == False)  # noqa

            b.valid_min = 0
            b.valid_max = 2
            r = generic_masked(b[:], attrs=ncd.vatts(b.name))
            assert np.all(r.mask == True)  # noqa

            # String-typed valid_min/valid_max attrs must not error
            c = ncd.createVariable('imanotherbyte', 'f4')
            c.setncattr('valid_min', '0b')
            c.setncattr('valid_max', '9b')
            c[:] = 3
            r = generic_masked(c[:], attrs=ncd.vatts(c.name))
            assert np.all(r.mask == False)  # noqa

            # valid_range masks values outside [0.0, 2.0]
            c = ncd.createVariable('imarange', 'f4')
            c.valid_range = [0.0, 2.0]
            c[:] = 3.0
            r = generic_masked(c[:], attrs=ncd.vatts(c.name))
            assert np.all(r.mask == True)  # noqa

            c.valid_range = [0.0, 2.0]
            c[:] = 1.0
            r = generic_masked(c[:], attrs=ncd.vatts(c.name))
            assert np.all(r.mask == False)  # noqa
    finally:
        if os.path.exists(tpath):
            os.remove(tpath)
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Denormalize this dataset into a flat pandas DataFrame.

    Produces one row per (time, z, station) combination; data variables
    matching that full size are carried through, size-1 variables are
    broadcast as scalars.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    # Stations
    svar = axv.station
    s = normalize_countable_array(svar)

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t)
    n_times = t.size

    # X
    x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name))

    # Y
    y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name))

    # Z
    z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name))
    n_z = z.size

    # denormalize table structure
    # every column ends up with length n_times * n_z * n_stations;
    # repeat/tile order determines which axis varies fastest
    t = np.repeat(t, s.size * n_z)
    z = np.tile(np.repeat(z, s.size), n_times)
    s = np.tile(s, n_z * n_times)
    y = np.tile(y, n_times * n_z)
    x = np.tile(x, n_times * n_z)

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, s),
    ])

    # Starts all-True and is AND-ed with each variable's mask, so only
    # rows masked in EVERY data variable remain flagged for dropping
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
            vdata = vdata[0]
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue

            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten a contiguous-ragged trajectory/profile dataset to a DataFrame.

    Reconstructs the per-observation trajectory/profile/t/x/y columns by
    expanding the CF ragged-array index and count variables over the sample
    (obs) dimension.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :raises ValueError: if no variable carries the "instance_dimension" or
        "sample_dimension" attribute
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    # The index variable (trajectory_index) is identified by having an
    # attribute with name of instance_dimension whose value is the instance
    # dimension name (trajectory in this example). The index variable must
    # have the profile dimension as its sole dimension, and must be type
    # integer. Each value in the index variable is the zero-based trajectory
    # index that the profile belongs to i.e. profile p belongs to trajectory
    # i=trajectory_index(p), as in section H.2.5.
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)
    if not r_index_var:
        raise ValueError(
            'Could not find the "instance_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__))
    else:
        r_index_var = r_index_var[0]
    p_dim = self.dimensions[r_index_var.dimensions[0]]  # Profile dimension
    # We should probably use this below to test for dimensionality of variables?
    # r_dim = self.dimensions[r_index_var.instance_dimension]  # Trajectory dimension

    # The count variable (row_size) contains the number of elements for
    # each profile, which must be written contiguously. The count variable
    # is identified by having an attribute with name sample_dimension whose
    # value is the sample dimension (obs in this example) being counted. It
    # must have the profile dimension as its sole dimension, and must be
    # type integer
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__))
    else:
        o_index_var = o_index_var[0]
    o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension

    profile_indexes = normalize_countable_array(axv.profile, count_if_none=p_dim.size)
    p = np.ma.masked_all(o_dim.size, dtype=profile_indexes.dtype)

    traj_indexes = normalize_countable_array(axv.trajectory)
    r = np.ma.masked_all(o_dim.size, dtype=traj_indexes.dtype)

    tvar = axv.t
    t = np.ma.masked_all(o_dim.size, dtype=tvar.dtype)

    xvar = axv.x
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)

    yvar = axv.y
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)

    si = 0  # Sample (obs) dimension

    zvar = axv.z
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

    # Expand per-profile values over each profile's contiguous run of obs
    for i in np.arange(profile_indexes.size):
        ei = si + o_index_var[i]
        p[si:ei] = profile_indexes[i]
        r[si:ei] = np.array(traj_indexes[r_index_var[i]])
        t[si:ei] = tvar[i]
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei

    # T
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X and Y
    x = generic_masked(x, minv=-180, maxv=180)
    y = generic_masked(y, minv=-90, maxv=90)

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.trajectory, r),
        (axes.profile, p)
    ])

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the traj and row index variables
    del extract_vars[o_index_var.name]
    del extract_vars[r_index_var.name]

    # Axes variables are already processed so skip them
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            # Expand per-profile values over the obs dimension
            vdata = np.ma.masked_all(o_dim.size, dtype=dvar.dtype)
            si = 0
            for j in np.arange(profile_indexes.size):
                ei = si + o_index_var[j]
                vdata[si:ei] = dvar[j]
                si = ei

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clear_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten a ragged timeseries/profile dataset to a pandas DataFrame.

    Expands per-station and per-profile values over the sample (obs)
    dimension using the CF "sample_dimension" count variable and, when
    present, the "instance_dimension" station index variable.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :raises ValueError: if the station variable has ndim > 1, or no variable
        carries the "sample_dimension" attribute
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    # Profile dimension
    p_var = self.filter_by_attrs(cf_role='profile_id')[0]
    p_dim = self.dimensions[p_var.dimensions[0]]

    # Station dimension
    s_var = self.filter_by_attrs(cf_role='timeseries_id')[0]
    if s_var.ndim == 1:
        s_dim = self.dimensions[s_var.dimensions[0]]
    elif s_var.ndim == 0:
        s_dim = None
    else:
        raise ValueError('Number of dimension on the station (timeseries_id) must be 0 or 1')

    # Station index
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)
    if not r_index_var:
        # A reduced netCDF file, set station to 0 so it pulls the first value
        # of the variable that identifies the stations
        r_index_var = [0]
    else:
        r_index_var = r_index_var[0]

    # Sample (obs) dimension
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__)
        )
    else:
        o_index_var = o_index_var[0]  # Sample dimension

    # Since this is a flat dataframe, everything is the length of the obs dimension
    row_sizes = o_index_var[:]
    o_dim = self.dimensions[o_index_var.sample_dimension]

    profile_indexes = normalize_countable_array(p_var, count_if_none=p_dim.size)
    p = np.repeat(profile_indexes, row_sizes)

    # FIX: a scalar station variable has no station dimension (s_dim is
    # None); fall back to a count of 1 instead of raising AttributeError
    stat_indexes = normalize_countable_array(
        s_var, count_if_none=s_dim.size if s_dim is not None else 1)
    r = np.ma.masked_all(o_dim.size, dtype=stat_indexes.dtype)

    # Lat and Lon are on the station dimension
    xvar = axv.x
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)
    yvar = axv.y
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)
    si = 0
    # NOTE(review): this loop walks stations but indexes o_index_var, whose
    # values are per-profile row sizes — correct when stations and profiles
    # align one-to-one; confirm against multi-profile-per-station files
    for i in np.arange(stat_indexes.size):
        ei = si + o_index_var[i]
        r[si:ei] = np.array(stat_indexes[r_index_var[i]])
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei
    x = generic_masked(x, minv=-180, maxv=180)
    y = generic_masked(y, minv=-90, maxv=90)

    # Time and Z are on the sample (obs) dimension
    tvar = axv.t
    t = get_masked_datetime_array(
        generic_masked(tvar[:].flatten(), attrs=self.vatts(tvar.name)),
        tvar
    )
    z = generic_masked(axv.z[:].flatten(), attrs=self.vatts(axv.z.name))

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, r),
        (axes.profile, p)
    ])

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the station and row index variables
    del extract_vars[o_index_var.name]
    # FIX: in the reduced-file case r_index_var is the plain int 0 (set
    # above) and has no .name — guard instead of raising AttributeError
    r_index_name = getattr(r_index_var, 'name', None)
    if r_index_name is not None and r_index_name in extract_vars:
        del extract_vars[r_index_name]

    # Axes variables are already processed so skip them
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            vdata = generic_masked(
                np.repeat(
                    dvar[:].flatten().astype(dvar.dtype),
                    row_sizes
                ),
                attrs=self.vatts(dnam)
            )

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clear_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten a multi-profile dataset to a DataFrame, one row per (profile, z).

    Per-profile values (t, x, y) are repeated across the z dimension.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    # Multiple profiles in the file
    pvar = axv.profile
    p_dim = self.dimensions[pvar.dimensions[0]]

    zvar = axv.z
    # The z size is whichever dimension of z is NOT the profile dimension
    zs = len(self.dimensions[[
        d for d in zvar.dimensions if d != p_dim.name
    ][0]])

    # Profiles
    p = normalize_countable_array(pvar)
    p = p.repeat(zs)

    # Z
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

    # T
    tvar = axv.t
    t = tvar[:].repeat(zs)
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X
    xvar = axv.x
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

    # Y
    yvar = axv.y
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.profile, p)
    ])

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimension
        if dvar.dimensions == pvar.dimensions:
            vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype), attrs=self.vatts(dnam))

        # Profile, z dimension
        elif dvar.dimensions == zvar.dimensions:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clear_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=False, clean_rows=False, **kwargs):
    """Flatten a station timeseries dataset to a pandas DataFrame.

    Station-level values (x, y, z) are repeated across the time dimension;
    for single-station files the time axis is repeated per station.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t)

    # X
    x = generic_masked(axv.x[:].repeat(t.size), attrs=self.vatts(axv.x.name))

    # Y
    y = generic_masked(axv.y[:].repeat(t.size), attrs=self.vatts(axv.y.name))

    # Z
    if axv.z is not None:
        z = generic_masked(axv.z[:].repeat(t.size), attrs=self.vatts(axv.z.name))
    else:
        z = None

    svar = axv.station
    s = normalize_countable_array(svar)
    s = np.repeat(s, t.size)

    # now repeat t per station
    # figure out if this is a single-station file by checking
    # the dimension size of the x dimension
    if axv.x.ndim == 1:
        t = np.repeat(t, len(svar))

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, s),
    ])

    # FIX: was np.ma.zeros(...), which starts all-False; since the flag is
    # only ever AND-ed with variable masks it could never become True and
    # clean_rows was a no-op. Start all-True (matching the sibling
    # to_dataframe implementations) so rows masked in every data variable
    # are droppable.
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clear_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """Flatten a contiguous-ragged trajectory/profile dataset to a DataFrame.

    Legacy implementation: locates the CF ragged-array index/count variables,
    expands per-profile values over the sample (obs) dimension, converts time
    via num2date, and adds a cumulative great-circle distance column.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :return: pandas.DataFrame
    """
    # The index variable (trajectory_index) is identified by having an
    # attribute with name of instance_dimension whose value is the instance
    # dimension name (trajectory in this example). The index variable must
    # have the profile dimension as its sole dimension, and must be type
    # integer. Each value in the index variable is the zero-based trajectory
    # index that the profile belongs to i.e. profile p belongs to trajectory
    # i=trajectory_index(p), as in section H.2.5.
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)[0]
    p_dim = self.dimensions[r_index_var.dimensions[0]]       # Profile dimension
    r_dim = self.dimensions[r_index_var.instance_dimension]  # Trajectory dimension

    # The count variable (row_size) contains the number of elements for
    # each profile, which must be written contiguously. The count variable
    # is identified by having an attribute with name sample_dimension whose
    # value is the sample dimension (obs in this example) being counted. It
    # must have the profile dimension as its sole dimension, and must be
    # type integer
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)[0]
    o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension

    try:
        rvar = self.filter_by_attrs(cf_role='trajectory_id')[0]
        traj_indexes = normalize_array(rvar)
        assert traj_indexes.size == r_dim.size
    except BaseException:
        logger.warning('Could not pull trajectory values a variable with "cf_role=trajectory_id", using a computed range.')
        traj_indexes = np.arange(r_dim.size)
    try:
        pvar = self.filter_by_attrs(cf_role='profile_id')[0]
        profile_indexes = normalize_array(pvar)
        assert profile_indexes.size == p_dim.size
    except BaseException:
        logger.warning('Could not pull profile values from a variable with "cf_role=profile_id", using a computed range.')
        profile_indexes = np.arange(p_dim.size)  # Profile dimension

    # When multiple axis candidates exist, pick the one on the expected
    # dimension with the matching 'axis' attribute
    tvars = self.t_axes()
    if len(tvars) > 1:
        tvar = [
            v for v in self.t_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 't'
        ][0]
    else:
        tvar = tvars[0]

    xvars = self.x_axes()
    if len(xvars) > 1:
        xvar = [
            v for v in self.x_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 'x'
        ][0]
    else:
        xvar = xvars[0]

    yvars = self.y_axes()
    if len(yvars) > 1:
        yvar = [
            v for v in self.y_axes()
            if v.dimensions == (p_dim.name,) and getattr(v, 'axis', '').lower() == 'y'
        ][0]
    else:
        yvar = yvars[0]

    zvars = self.z_axes()
    if len(zvars) > 1:
        zvar = [
            v for v in self.z_axes()
            if v.dimensions == (o_dim.name,) and getattr(v, 'axis', '').lower() == 'z'
        ][0]
    else:
        zvar = zvars[0]

    p = np.ma.masked_all(o_dim.size, dtype=profile_indexes.dtype)
    r = np.ma.masked_all(o_dim.size, dtype=traj_indexes.dtype)
    t = np.ma.masked_all(o_dim.size, dtype=tvar.dtype)
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)

    # Expand per-profile values over each profile's contiguous run of obs
    si = 0
    for i in np.arange(profile_indexes.size):
        ei = si + o_index_var[i]
        p[si:ei] = profile_indexes[i]
        r[si:ei] = traj_indexes[r_index_var[i]]
        t[si:ei] = tvar[i]
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei

    t_mask = False
    tfill = get_fill_value(tvar)
    if tfill is not None:
        t_mask = np.copy(np.ma.getmaskarray(t))
        t[t_mask] = 1

    t = np.ma.MaskedArray(
        nc4.num2date(t, tvar.units, getattr(tvar, 'calendar', 'standard'))
    )
    # Patch the time variable back to its original mask, since num2date
    # breaks any missing/fill values
    t[t_mask] = np.ma.masked

    # X and Y
    x = generic_masked(x, minv=-180, maxv=180).round(5)
    y = generic_masked(y, minv=-90, maxv=90).round(5)

    # Distance
    d = np.ma.zeros(o_dim.size, dtype=np.float64)
    d[1:] = great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance']
    d = generic_masked(np.cumsum(d), minv=0).round(2)

    # Sample dimension
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name)).round(5)

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'trajectory': r,
        'profile': p,
        'distance': d
    }

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for i, dvar in enumerate(extract_vars):

        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            vdata = np.ma.masked_all(o_dim.size, dtype=dvar.dtype)
            si = 0
            for j in np.arange(profile_indexes.size):
                ei = si + o_index_var[j]
                vdata[si:ei] = dvar[j]
                si = ei

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dvar.name)).round(3)

        else:
            logger.warning("Skipping variable {}... it didn't seem like a data variable".format(dvar))
            # FIX: without this continue the loop fell through and used
            # `vdata` from the previous iteration (or raised NameError on
            # the first unmatched variable)
            continue

        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten a trajectory dataset to a DataFrame, one row per observation.

    The trajectory id is repeated over whichever dimension of the time
    variable the trajectory variable does not share.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes, skip=[axes.profile, axes.station])

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t).flatten()

    # X
    x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name)).flatten()

    # Y
    y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name)).flatten()

    # Z
    z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name)).flatten()

    # Trajectories
    rvar = axv.trajectory
    p = normalize_countable_array(rvar)

    # The Dimension that the trajectory id variable doesn't have is what
    # the trajectory data needs to be repeated by
    dim_diff = self.dimensions[list(
        set(axv.t.dimensions).difference(set(rvar.dimensions)))[0]]
    if dim_diff:
        p = p.repeat(dim_diff.size)

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.trajectory, p)
    ])

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
            vdata = vdata[0]
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue

            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten a contiguous-ragged trajectory dataset to a DataFrame.

    Per-trajectory values are repeated by each trajectory's row size
    (the CF "sample_dimension" count variable) over the obs dimension.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :raises ValueError: if no variable carries the "sample_dimension" attribute
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__))
    else:
        o_index_var = o_index_var[0]
    o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension
    t_dim = o_index_var.dimensions  # Trajectory

    row_sizes = o_index_var[:]

    traj_data = normalize_countable_array(axv.trajectory)
    traj_data = np.repeat(traj_data, row_sizes)

    # time
    time_data = get_masked_datetime_array(axv.t[:], axv.t).flatten()

    df_data = OrderedDict([
        (axes.t, time_data),
        (axes.trajectory, traj_data)
    ])

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the time and row index variables
    del extract_vars[o_index_var.name]
    # NOTE(review): keys extract_vars by the AXIS name, not the time
    # variable's actual name (other implementations use axv.t.name) —
    # presumably they always match here; confirm
    del extract_vars[axes.t]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Trajectory dimensions
        if dvar.dimensions == t_dim:
            vdata = np.repeat(generic_masked(dvar[:], attrs=self.vatts(dnam)), row_sizes)

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clear_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    """Flatten an orthogonal profile dataset to a DataFrame, one row per (profile, z).

    Per-profile values (t, x, y) are repeated across the shared z dimension;
    the z values are tiled once per profile.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :param kwargs: may contain ``axes``, a mapping overriding default axis names
    :return: pandas.DataFrame
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    axv = get_mapped_axes_variables(self, axes)

    zvar = axv.z
    zs = len(self.dimensions[zvar.dimensions[0]])

    # Profiles
    pvar = axv.profile
    p = normalize_countable_array(pvar)
    ps = p.size
    p = p.repeat(zs)

    # Z
    z = generic_masked(zvar[:], attrs=self.vatts(zvar.name))
    try:
        z = np.tile(z, ps)
    except ValueError:
        # z already covers every profile; just flatten it
        z = z.flatten()

    # T
    tvar = axv.t
    t = tvar[:].repeat(zs)
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X
    xvar = axv.x
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

    # Y
    yvar = axv.y
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.profile, p)
    ])

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimension
        if dvar.dimensions == pvar.dimensions:
            vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Z dimension
        elif dvar.dimensions == zvar.dimensions:
            vdata = generic_masked(np.tile(dvar[:], ps).flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Profile, z dimension
        elif dvar.dimensions == pvar.dimensions + zvar.dimensions:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
                vdata = vdata[0]
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True):
    """Flatten a profile dataset to a DataFrame (legacy implementation).

    Repeats per-profile values across the z dimension and adds a cumulative
    great-circle distance column.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: drop rows where every extracted data variable is masked
    :return: pandas.DataFrame
    """
    zvar = self.z_axes()[0]
    zs = len(self.dimensions[zvar.dimensions[0]])

    # Profiles
    pvar = self.filter_by_attrs(cf_role='profile_id')[0]
    try:
        p = normalize_array(pvar)
    except ValueError:
        # Fall back to a computed 0..n-1 profile index
        p = np.asarray(list(range(len(pvar))), dtype=np.integer)
    ps = p.size
    p = p.repeat(zs)
    logger.debug(['profile data size: ', p.size])

    # Z
    z = generic_masked(zvar[:], attrs=self.vatts(zvar.name)).round(5)
    try:
        z = np.tile(z, ps)
    except ValueError:
        # z already covers every profile; just flatten it
        z = z.flatten()
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one
        t = np.array([t.isoformat()], dtype='datetime64')
    t = t.repeat(zs)
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Distance (cumulative along the flattened x/y track)
    d = np.ma.zeros(y.size, dtype=np.float64)
    d[1:] = great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance']
    d = generic_masked(np.cumsum(d), minv=0).round(2)
    logger.debug(['distance data size: ', d.size])

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'profile': p,
        'distance': d
    }

    # Starts all-True; AND-ed with each variable's mask below
    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for i, dvar in enumerate(extract_vars):
        vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=False, clean_rows=False):
    """Flatten a station timeseries dataset to a DataFrame (legacy implementation).

    Repeats station-level values (x, y, z) across the time dimension and
    converts any "... since ..." units variables with num2date.

    :param clean_cols: drop DataFrame columns that contain no data at all
    :param clean_rows: currently disabled (row-drop bookkeeping commented out)
    :return: pandas.DataFrame
    """
    # Don't pass around the attributes store them in the class

    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one
        t = np.array([t.isoformat()], dtype='datetime64')
    logger.debug(['time data size: ', t.size])

    svar = self.filter_by_attrs(cf_role='timeseries_id')[0]

    # Stations
    # TODO: Make sure there is a test for a file with multiple time variables
    try:
        s = normalize_array(svar)
    except ValueError:
        # Fall back to a computed 0..n-1 station index
        s = np.asarray(list(range(len(svar))), dtype=np.integer)
    s = np.repeat(s, t.size)
    logger.debug(['station data size: ', s.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(t.size), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(t.size), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Z
    zvar = self.z_axes()[0]
    z = generic_masked(zvar[:].repeat(t.size), attrs=self.vatts(zvar.name))
    logger.debug(['z data size: ', z.size])

    # now repeat t per station
    # figure out if this is a single-station file
    # do this by checking the dimensions of the Z var
    if zvar.ndim == 1:
        t = np.repeat(t, len(svar))

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'station': s,
    }

    #building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already in df_data, so skip them
    extract_vars = copy(self.variables)
    del extract_vars[svar.name]
    del extract_vars[xvar.name]
    del extract_vars[yvar.name]
    del extract_vars[zvar.name]
    del extract_vars[tvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        if dvar[:].flatten().size > t.size:
            logger.warning("Variable {} is not the correct size, skipping.".format(dnam))
            continue

        vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dnam))
        if vdata.size == 1:
            vdata = vdata[0]

        #building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Convert any variable with CF time units into datetimes;
        # variables without a units attribute are left untouched
        try:
            if re.match(r'.* since .*', dvar.units):
                vdata = nc4.num2date(vdata[:], dvar.units, getattr(dvar, 'calendar', 'standard'))
        except AttributeError:
            pass

        df_data[dnam] = vdata
        #logger.info('{} - {}'.format(dnam, vdata.shape))

    df = pd.DataFrame()
    for k, v in df_data.items():
        df[k] = v

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    #if clean_rows:
    #    df = df.iloc[~building_index_to_drop]

    return df