def get_vertical_attributes(df, axes=None):
    """ Use values in a dataframe to set vertical attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/

    The CRS, geospatial_bounds_vertical_crs, cannot be assumed because NCEI suggests any of
      * 'EPSG:5829' (instantaneous height above sea level),
      * 'EPSG:5831' (instantaneous depth below sea level), or
      * 'EPSG:5703' (NAVD88 height).
    Likewise, geospatial_vertical_positive cannot be assumed to be either 'up' or 'down'.
    Set these attributes separately according to the dataset.

    Note: values are cast from numpy.int to float

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary). z in meters.
    :return: nested dictionary of variable and global attributes
    """
    axes = get_default_axes(axes)

    minz = round(float(df[axes.z].min()), 6)
    maxz = round(float(df[axes.z].max()), 6)

    return {
        'variables': {
            axes.z: {
                'attributes': {
                    'actual_min': minz,
                    'actual_max': maxz,
                }
            },
        },
        'attributes': {
            'geospatial_vertical_min': minz,
            'geospatial_vertical_max': maxz,
            'geospatial_vertical_units': 'm',
        }
    }
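
# A minimal usage sketch (not from the original source, and assuming the default
# axes mapping so the depth column is named 'z'):
# >>> import pandas as pd
# >>> df = pd.DataFrame({'z': [0.0, 2.5, 10.0]})
# >>> attrs = get_vertical_attributes(df)
# >>> attrs['attributes']['geospatial_vertical_min']
# 0.0
# >>> attrs['attributes']['geospatial_vertical_max']
# 10.0
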
def calculated_metadata(self, df=None, geometries=True, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    if df is None:
        df = self.to_dataframe(clean_cols=clean_cols, clean_rows=clean_rows, axes=axes)
    return trajectory_profile_calculated_metadata(df, axes, geometries)
def get_calculated_attributes(df, axes=None, history=None):
    """ Functions to automate netCDF attribute generation from the data itself
    This is a wrapper for the other four functions, which could be called separately.

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :param history: text initializing audit trail for modifications to the original data (optional, string)
    :return: dictionary of global attributes
    """
    axes = get_default_axes(axes)
    attrs = get_geographic_attributes(df, axes)
    attrs = dict_update(attrs, get_vertical_attributes(df, axes))
    attrs = dict_update(attrs, get_temporal_attributes(df, axes))
    attrs = dict_update(attrs, get_creation_attributes(history))
    return attrs
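
# A usage sketch (hypothetical column values, assuming the default t/x/y/z axes):
# derive the full attribute dictionary from a small frame, then pass it to a
# writer through its `attributes` kwarg.
# >>> import pandas as pd
# >>> df = pd.DataFrame({
# ...     't': pd.date_range('2020-01-01', periods=3, freq='H'),
# ...     'x': [-70.1, -70.2, -70.3],
# ...     'y': [40.1, 40.2, 40.3],
# ...     'z': [0.0, 5.0, 10.0],
# ... })
# >>> attrs = get_calculated_attributes(df, history='Created from CTD casts')
# >>> attrs['attributes']['geospatial_vertical_max']
# 10.0
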
def get_temporal_attributes(df, axes=None):
    """ Use values in a dataframe to set temporal attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary). z in meters.
    :return: nested dictionary of variable and global attributes
    """
    axes = get_default_axes(axes)

    mint = df[axes.t].min()
    maxt = df[axes.t].max()

    times = pd.DatetimeIndex(unique_justseen(df[axes.t]))
    dt_index_diff = times[1:] - times[:-1]
    dt_counts = dt_index_diff.value_counts(sort=True)

    if dt_counts.size > 0 and dt_counts.values[0] / (len(times) - 1) > 0.75:
        mode_value = dt_counts.index[0]
    else:
        # Calculate a static resolution
        mode_value = ((maxt - mint) / len(times))

    return {
        'variables': {
            axes.t: {
                'attributes': {
                    'actual_min': mint.strftime('%Y-%m-%dT%H:%M:%SZ'),
                    'actual_max': maxt.strftime('%Y-%m-%dT%H:%M:%SZ'),
                }
            },
        },
        'attributes': {
            'time_coverage_start': mint.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'time_coverage_end': maxt.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'time_coverage_duration': (maxt - mint).round('1S').isoformat(),
            'time_coverage_resolution': mode_value.round('1S').isoformat()
        }
    }
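
# A sketch of the resolution heuristic above (hypothetical values): when more
# than 75% of the time deltas share one value, that modal delta becomes the
# time_coverage_resolution; otherwise a flat (duration / count) estimate is used.
# >>> import pandas as pd
# >>> t = pd.to_datetime(['2020-01-01 00:00', '2020-01-01 01:00',
# ...                     '2020-01-01 02:00', '2020-01-01 03:00'])
# >>> attrs = get_temporal_attributes(pd.DataFrame({'t': t}))
# >>> attrs['attributes']['time_coverage_resolution']
# 'P0DT1H0M0S'
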
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    # Multiple profiles in the file
    pvar = axv.profile
    p_dim = self.dimensions[pvar.dimensions[0]]

    zvar = axv.z
    zs = len(self.dimensions[[ d for d in zvar.dimensions if d != p_dim.name ][0]])

    # Profiles
    p = normalize_countable_array(pvar)
    p = p.repeat(zs)

    # Z
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

    # T
    tvar = axv.t
    t = tvar[:].repeat(zs)
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X
    xvar = axv.x
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

    # Y
    yvar = axv.y
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.profile, p)
    ])

    building_index_to_drop = np.ones(t.size, dtype=bool)

    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimension
        if dvar.dimensions == pvar.dimensions:
            vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype), attrs=self.vatts(dnam))

        # Profile, z dimension
        elif dvar.dimensions == zvar.dimensions:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clean_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
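
# A reading sketch (hypothetical file path; this method is assumed to live on a
# CFDataset subclass such as IncompleteMultidimensionalProfile): round-trip a
# file back into a flat frame, keeping fully-empty rows and columns for review.
# >>> with IncompleteMultidimensionalProfile('imp.nc') as ncd:  # doctest: +SKIP
# ...     df = ncd.to_dataframe(clean_cols=False, clean_rows=False)
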
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    _ = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)

    with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(axes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))
        trajectory[:] = np.array(unique_trajectories)

        # Calculate the max number of profiles
        unique_profiles = df[axes.profile].unique()
        num_profiles = len(unique_profiles)
        nc.createDimension(axes.profile, num_profiles)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
        profile[:] = np.array(unique_profiles)

        # Get unique obs by grouping on traj and profile and getting the max size
        num_obs = len(df)
        nc.createDimension(axes.sample, num_obs)

        # The trajectory this profile belongs to
        t_ind = nc.createVariable('trajectoryIndex', 'i4', (axes.profile,))
        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (axes.profile,))

        # Create all of the axis variables
        time = nc.createVariable(axes.t, 'f8', (axes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        # Axes variables are already processed so skip them
        data_columns = [ d for d in df.columns if d not in axes ]

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        for i, (_, trg) in enumerate(trajectory_groups):
            for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                time[j] = get_ncdata_from_series(pfg[axes.t], time)[0]
                latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                row_size[j] = len(pfg)
                t_ind[j] = i

        # Add back in the z axis that was removed when calculating data_columns
        data_columns = data_columns + [axes.z]
        skips = ['trajectoryIndex', 'rowSize']
        for c in [ d for d in data_columns if d not in skips ]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (axes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            try:
                v[:] = vvalues
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return ContiguousRaggedTrajectoryProfile(output, **kwargs)
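
# A writing sketch (hypothetical file name and data columns, assuming the default
# axes): any column not named in the axes mapping, plus z, is written along the
# sample dimension; per-profile coordinates are written along the profile dimension.
# >>> import pandas as pd
# >>> df = pd.DataFrame({
# ...     'trajectory': ['deployment-1'] * 4,
# ...     'profile': [1, 1, 2, 2],
# ...     't': pd.to_datetime(['2020-01-01'] * 2 + ['2020-01-02'] * 2),
# ...     'x': [-70.0] * 4,
# ...     'y': [40.0] * 4,
# ...     'z': [0.0, 10.0, 0.0, 10.0],
# ...     'temperature': [14.1, 12.9, 14.3, 13.0],
# ... })
# >>> ContiguousRaggedTrajectoryProfile.from_dataframe(df, 'crtp.nc')  # doctest: +SKIP
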
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [ d for d in df.columns if d not in axes ]

    unlimited = kwargs.pop('unlimited', False)

    with IncompleteMultidimensionalProfile(output, 'w') as nc:

        profile_group = df.groupby(axes.profile)

        if unlimited is True:
            max_profiles = None
        else:
            max_profiles = df[axes.profile].unique().size
        nc.createDimension(axes.profile, max_profiles)

        max_zs = profile_group.size().max()
        nc.createDimension(axes.z, max_zs)

        # Metadata variables
        nc.createVariable('crs', 'i4')

        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))

        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', (axes.profile,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.profile, axes.z), fill_value=df[axes.z].dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        for i, (uid, pdf) in enumerate(profile_group):
            profile[i] = uid

            time[i] = nc4.date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)

            latitude[i] = pdf[axes.y].iloc[0]
            longitude[i] = pdf[axes.x].iloc[0]

            zvalues = pdf[axes.z].fillna(z._FillValue).values
            sl = slice(0, zvalues.size)
            z[i, sl] = zvalues

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.profile, axes.z),
                        pdf[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(pdf[c], v)
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues

        # Set global attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [ d for d in df.columns if d not in axes ]

    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

        station_group = df.groupby(axes.station)
        num_stations = len(station_group)

        # Assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        nc.createDimension(axes.t, t.size)
        nc.createDimension(axes.station, num_stations)
        station = nc.createVariable(axes.station, get_dtype(df.station), (axes.station,))
        time = nc.createVariable(axes.t, 'f8', (axes.t,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.station,))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.station,))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.station,), fill_value=df[axes.z].dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        # tolist() converts to a python datetime object without timezone and has NaTs.
        g = t.tolist()
        # date2num converts NaTs to np.nan
        gg = nc4.date2num(g, units=cls.default_time_unit)
        # masked_invalid moves np.nan to a masked value
        time[:] = np.ma.masked_invalid(gg)

        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]

            # TODO: write a test for a Z with a _FillValue
            z[i] = sdf[axes.z].iloc[0]

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.station, axes.t),
                        sdf[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[i, :] = vvalues
                except BaseException:
                    L.debug('{} was not written. Likely a metadata variable'.format(v.name))

        # Set global attributes
        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
def get_geographic_attributes(df, axes=None):
    """ Use values in a dataframe to set geographic attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/

    The coordinate reference system (CRS) is assumed to be EPSG:4326, which is WGS84 and
    is used with GPS satellite navigation (http://spatialreference.org/ref/epsg/wgs-84/).
    This is NCEI's default. Coordinate values are latitude (decimal degrees_north) and
    longitude (decimal degrees_east). Longitude values are limited to [-180, 180).

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :return: nested dictionary of variable and global attributes
    """
    axes = get_default_axes(axes)

    carry_miny = round(float(df[axes.y].min()), 6)
    carry_maxy = round(float(df[axes.y].max()), 6)
    carry_minx = round(float(df[axes.x].min()), 6)
    carry_maxx = round(float(df[axes.x].max()), 6)

    notnull = df[axes.x].notnull() & df[axes.y].notnull()
    coords = list(zip(df.loc[notnull, axes.x], df.loc[notnull, axes.y]))

    if len(set(coords)) == 1:
        geoclass = Point
    elif len(coords) > 2:
        geoclass = Polygon
    else:
        geoclass = LineString

    p = geoclass(coords)
    dateline = LineString([(180, 90), (-180, -90)])
    # If we cross the dateline, normalize the coordinates before building the geometry
    if dateline.crosses(p):
        newx = (df.loc[notnull, axes.x] + 360) % 360
        p = geoclass(zip(newx, df.loc[notnull, axes.y]))

    p = fix_geom(p)

    geometry_bbox = box(*p.bounds).wkt
    geometry_wkt = p.convex_hull.wkt

    return {
        'variables': {
            axes.y: {
                'attributes': {
                    'actual_min': carry_miny,
                    'actual_max': carry_maxy,
                }
            },
            axes.x: {
                'attributes': {
                    'actual_min': carry_minx,
                    'actual_max': carry_maxx,
                }
            },
        },
        'attributes': {
            'geospatial_lat_min': carry_miny,
            'geospatial_lat_max': carry_maxy,
            'geospatial_lon_min': carry_minx,
            'geospatial_lon_max': carry_maxx,
            'geospatial_bbox': geometry_bbox,
            'geospatial_bounds': geometry_wkt,
            'geospatial_bounds_crs': 'EPSG:4326',
        }
    }
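
# A usage sketch (hypothetical coordinates, default axes mapping): three or more
# distinct positions produce a Polygon, so geospatial_bounds is its convex hull
# and geospatial_bbox its bounding-box envelope.
# >>> import pandas as pd
# >>> df = pd.DataFrame({'x': [-70.0, -70.5, -71.0], 'y': [40.0, 40.5, 40.0]})
# >>> attrs = get_geographic_attributes(df)
# >>> attrs['attributes']['geospatial_bounds_crs']
# 'EPSG:4326'
# >>> attrs['attributes']['geospatial_lon_min']
# -71.0
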
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with RaggedTimeseriesProfile(output, 'w') as nc:

        station_groups = df.groupby(axes.station)
        unique_stations = list(station_groups.groups.keys())
        num_stations = len(unique_stations)

        # Calculate the max number of profiles
        profile_groups = df.groupby(axes.profile)
        unique_profiles = list(profile_groups.groups.keys())
        num_profiles = len(unique_profiles)
        nc.createDimension(daxes.profile, num_profiles)

        if reduce_dims is True and num_stations == 1:
            # If a singular station, remove the dimension
            station_dimensions = ()
            s_ind = None
        else:
            station_dimensions = (daxes.station,)
            nc.createDimension(daxes.station, num_stations)
            # The station this profile belongs to
            s_ind = nc.createVariable('stationIndex', 'i4', (daxes.profile,))

        station = nc.createVariable(axes.station, get_dtype(unique_stations), station_dimensions)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # Get unique obs by grouping on traj and profile and getting the max size
        if unlimited is True:
            nc.createDimension(daxes.sample, None)
        else:
            nc.createDimension(daxes.sample, len(df))

        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

        # Axes variables are already processed so skip them
        data_columns = [ d for d in df.columns if d not in axes ]
        data_columns += [axes.t, axes.z]  # time isn't really special, it's dimensioned by obs

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        for i, (sname, srg) in enumerate(station_groups):
            station[i] = sname
            latitude[i] = df[axes.y][df[axes.station] == sname].dropna().iloc[0]
            longitude[i] = df[axes.x][df[axes.station] == sname].dropna().iloc[0]

        for j, (pname, pfg) in enumerate(profile_groups):
            profile[j] = pname
            row_size[j] = len(pfg)
            if s_ind is not None:
                s_ind[j] = np.argwhere(station[:] == pfg[axes.station].dropna().iloc[0]).item()

        # Add back in the z axis that was removed when calculating data_columns
        # and ignore variables that were stored in the profile index
        skips = ['stationIndex', 'rowSize']
        for c in [ d for d in data_columns if d not in skips ]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            try:
                if unlimited is True:
                    v[:] = vvalues
                else:
                    v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return RaggedTimeseriesProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [ d for d in df.columns if d not in axes ]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

        trajectory_group = df.groupby(axes.trajectory)

        if unlimited is True:
            max_obs = None
        else:
            max_obs = trajectory_group.size().max()
        nc.createDimension(axes.sample, max_obs)

        num_trajectories = len(trajectory_group)
        if reduce_dims is True and num_trajectories == 1:
            # If a singular trajectory, we can reduce that dimension if it is of size 1
            def ts(t_index, size):
                return np.s_[0:size]
            default_dimensions = (axes.sample,)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
        else:
            def ts(t_index, size):
                return np.s_[t_index, 0:size]
            default_dimensions = (axes.trajectory, axes.sample)
            nc.createDimension(axes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))

        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(cls.default_fill_value))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        for i, (uid, gdf) in enumerate(trajectory_group):
            trajectory[i] = uid

            # tolist() converts to a python datetime object without timezone and has NaTs.
            g = gdf[axes.t].tolist()
            # date2num converts NaTs to np.nan
            gg = nc4.date2num(g, units=cls.default_time_unit)
            # masked_invalid moves np.nan to a masked value
            time[ts(i, gg.size)] = np.ma.masked_invalid(gg)

            lats = gdf[axes.y].fillna(get_fill_value(latitude)).values
            latitude[ts(i, lats.size)] = lats

            lons = gdf[axes.x].fillna(get_fill_value(longitude)).values
            longitude[ts(i, lons.size)] = lons

            zs = gdf[axes.z].fillna(get_fill_value(z)).values
            z[ts(i, zs.size)] = zs

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions,
                        gdf[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(gdf[c], v)
                v[ts(i, vvalues.size)] = vvalues

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalTrajectory(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes

    # Should never be a CR file with one trajectory, so we ignore the "reduce_dims" kwarg
    _ = kwargs.pop('reduce_dims', False)  # noqa
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with ContiguousRaggedTrajectory(output, 'w') as nc:

        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(daxes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))

        # Get unique obs by grouping on traj and getting the max size
        if unlimited is True:
            nc.createDimension(daxes.sample, None)
        else:
            nc.createDimension(daxes.sample, len(df))

        # Number of observations in each trajectory
        row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory,))

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Variables defined on only the trajectory axis
        traj_vars = kwargs.pop('traj_vars', [])
        traj_columns = [ p for p in traj_vars if p in df.columns ]
        for c in traj_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.trajectory,),
                    df[c],
                    zlib=True,
                    complevel=1
                )

        for i, (trajid, trg) in enumerate(trajectory_groups):
            trajectory[i] = trajid
            row_size[i] = len(trg)

            # Save any trajectory variables using the first value found
            # in the column.
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    continue
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(trg[c], v)[0]
                try:
                    v[i] = vvalues
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

        # Add all of the columns based on the sample dimension. Take all columns and remove the
        # trajectory, rowSize and other trajectory based columns.
        sample_columns = [ f for f in df.columns if f not in traj_columns + ['rowSize', axes.trajectory] ]
        for c in sample_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            try:
                if unlimited is True:
                    v[:] = vvalues
                else:
                    v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return ContiguousRaggedTrajectory(output, **kwargs)
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__)
        )
    else:
        o_index_var = o_index_var[0]
        o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension
        t_dim = o_index_var.dimensions

    # Trajectory
    row_sizes = o_index_var[:]
    traj_data = normalize_countable_array(axv.trajectory)
    traj_data = np.repeat(traj_data, row_sizes)

    # Time
    time_data = get_masked_datetime_array(axv.t[:], axv.t).flatten()

    df_data = OrderedDict([
        (axes.t, time_data),
        (axes.trajectory, traj_data)
    ])

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the time and row index variables
    del extract_vars[o_index_var.name]
    del extract_vars[axes.t]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Trajectory dimensions
        if dvar.dimensions == t_dim:
            vdata = np.repeat(generic_masked(dvar[:], attrs=self.vatts(dnam)), row_sizes)

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clean_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [ d for d in df.columns if d not in axes ]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    # Make a new index that is the Cartesian product of all of the values from all of the
    # values of the old index. This is so we don't have to iterate over anything. The full
    # column of data will be able to be shaped to the size of the final unique sized
    # dimensions.
    index_order = [axes.t, axes.z, axes.station]
    df = df.set_index(index_order)
    df = df.reindex(pd.MultiIndex.from_product(df.index.levels, names=index_order))

    unique_z = df.index.get_level_values(axes.z).unique().values
    unique_t = df.index.get_level_values(axes.t).unique().tolist()  # tolist converts to Timestamp
    all_stations = df.index.get_level_values(axes.station)
    unique_s = all_stations.unique()

    with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

        if reduce_dims is True and unique_s.size == 1:
            # If a singular station, we can reduce that dimension if it is of size 1
            default_dimensions = (daxes.t, daxes.z)
            station_dimensions = ()
        else:
            default_dimensions = (daxes.t, daxes.z, daxes.station)
            station_dimensions = (daxes.station,)
            nc.createDimension(daxes.station, unique_s.size)

        station = nc.createVariable(axes.station, get_dtype(unique_s), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # Assign over a loop because VLEN variables (strings) have to be assigned by integer index
        # and we need to find the lat/lon based on the station index
        for si, st in enumerate(unique_s):
            station[si] = st
            latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
            longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        if unlimited is True:
            nc.createDimension(daxes.t, None)
        else:
            nc.createDimension(daxes.t, len(unique_t))
        time = nc.createVariable(axes.t, 'f8', (daxes.t,))
        time[:] = date2num(unique_t, units=cls.default_time_unit).astype('f8')

        nc.createDimension(daxes.z, unique_z.size)
        z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z,))
        z[:] = unique_z

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Variables defined on only the time axis and not the depth axis
        detach_z_vars = kwargs.pop('detach_z', [])
        detach_z_columns = [ p for p in detach_z_vars if p in data_columns ]
        for c in detach_z_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions[0::2],  # this removes the second dimension (z)
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {}'.format(axes.t, axes.x, axes.y)
                })
            else:
                v = nc.variables[var_name]

            # Because we need access to the fill values here, we ask not to return
            # the values with them already filled.
            vvalues = get_ncdata_from_series(df[c], v, fillna=False)

            # Reshape to the full array, with Z
            vvalues = vvalues.reshape(len(unique_t), unique_z.size, unique_s.size)

            # The Z axis is always the second axis, take the mean over that axis
            vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()

            # Now reshape to the array without Z
            vvalues = vvalues.reshape(len(unique_t), unique_s.size)

            try:
                v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        full_columns = [ f for f in data_columns if f not in detach_z_columns ]
        for c in full_columns:
            # Create variable if it doesn't exist
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                })
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            v[:] = vvalues.reshape(v.shape)

        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    zvar = axv.z
    zs = len(self.dimensions[zvar.dimensions[0]])

    # Profiles
    pvar = axv.profile
    p = normalize_countable_array(pvar)
    ps = p.size
    p = p.repeat(zs)

    # Z
    z = generic_masked(zvar[:], attrs=self.vatts(zvar.name))
    try:
        z = np.tile(z, ps)
    except ValueError:
        z = z.flatten()

    # T
    tvar = axv.t
    t = tvar[:].repeat(zs)
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X
    xvar = axv.x
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

    # Y
    yvar = axv.y
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.profile, p)
    ])

    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimension
        if dvar.dimensions == pvar.dimensions:
            vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Z dimension
        elif dvar.dimensions == zvar.dimensions:
            vdata = generic_masked(np.tile(dvar[:], ps).flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Profile, z dimension
        elif dvar.dimensions == pvar.dimensions + zvar.dimensions:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
                vdata = vdata[0]
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [ d for d in df.columns if d not in axes ]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

        trajectory_group = df.groupby(axes.trajectory)

        if unlimited is True:
            max_obs = None
        else:
            max_obs = trajectory_group.size().max()
        nc.createDimension(daxes.sample, max_obs)

        num_trajectories = len(trajectory_group)
        if reduce_dims is True and num_trajectories == 1:
            # If a singular trajectory, we can reduce that dimension if it is of size 1
            def ts(t_index, size):
                return np.s_[0:size]
            default_dimensions = (daxes.sample,)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
        else:
            def ts(t_index, size):
                return np.s_[t_index, 0:size]
            default_dimensions = (daxes.trajectory, daxes.sample)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))

        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(cls.default_fill_value))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                })

        for i, (uid, gdf) in enumerate(trajectory_group):
            trajectory[i] = uid

            times = get_ncdata_from_series(gdf[axes.t], time)
            time[ts(i, times.size)] = times

            lats = get_ncdata_from_series(gdf[axes.y], latitude)
            latitude[ts(i, lats.size)] = lats

            lons = get_ncdata_from_series(gdf[axes.x], longitude)
            longitude[ts(i, lons.size)] = lons

            zs = gdf[axes.z].fillna(get_fill_value(z)).values
            z[ts(i, zs.size)] = zs

            for c in data_columns:
                # Variables were created above, so just look them up
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(gdf[c], v)
                slicer = ts(i, vvalues.size)
                v[slicer] = vvalues

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalTrajectory(output, **kwargs)
def test_get_default_axes(self):
    assert get_default_axes() == (
        'trajectory',
        'station',
        'profile',
        'obs',
        't',
        'x',
        'y',
        'z',
    )

    new_defaults = {
        'trajectory': 'a',
        'station': 'b',
        'profile': 'c',
        'sample': 'h',
        't': 'd',
        'x': 'e',
        'y': 'f',
        'z': 'g',
    }
    assert get_default_axes(new_defaults) == (
        'a',
        'b',
        'c',
        'h',
        'd',
        'e',
        'f',
        'g',
    )

    new_defaults = {
        'trajectory': 'a',
        'station': 'b',
        'profile': 'c'
    }
    assert get_default_axes(new_defaults) == (
        'a',
        'b',
        'c',
        'obs',
        't',
        'x',
        'y',
        'z',
    )

    # Time is not a valid axis key
    bad_defaults = {
        'time': 'a'
    }
    with self.assertRaises(TypeError):
        get_default_axes(bad_defaults)

    # Can't have duplicate values
    bad_defaults = {
        'x': 'a',
        'y': 'a'
    }
    with self.assertRaises(ValueError):
        get_default_axes(bad_defaults)

    # ...but you can with the sample dimension
    bad_defaults = {
        't': 'time',
        'sample': 'time'
    }
    assert get_default_axes(bad_defaults) == (
        'trajectory',
        'station',
        'profile',
        'time',
        'time',
        'x',
        'y',
        'z',
    )
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [ d for d in df.columns if d not in axes ]

    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with IncompleteMultidimensionalProfile(output, 'w') as nc:

        profile_group = df.groupby(axes.profile)

        if unlimited is True:
            max_profiles = None
        else:
            max_profiles = df[axes.profile].unique().size
        nc.createDimension(daxes.profile, max_profiles)

        max_zs = profile_group.size().max()
        nc.createDimension(daxes.z, max_zs)

        # Metadata variables
        nc.createVariable('crs', 'i4')

        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))

        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', (daxes.profile,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (daxes.profile,))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (daxes.profile,))
        z = nc.createVariable(
            axes.z,
            get_dtype(df[axes.z]),
            (daxes.profile, daxes.z),
            fill_value=df[axes.z].dtype.type(cls.default_fill_value)
        )

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.profile, daxes.z),
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                })

        # Write values for each profile within profile_group
        for i, (uid, pdf) in enumerate(profile_group):
            profile[i] = uid

            time[i] = date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)

            latitude[i] = pdf[axes.y].iloc[0]
            longitude[i] = pdf[axes.x].iloc[0]

            zvalues = pdf[axes.z].fillna(z._FillValue).values
            sl = slice(0, zvalues.size)
            z[i, sl] = zvalues

            for c in data_columns:
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(pdf[c], v)
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues

        # Set global attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalProfile(output, **kwargs)
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    # Profile dimension
    p_var = self.filter_by_attrs(cf_role='profile_id')[0]
    p_dim = self.dimensions[p_var.dimensions[0]]

    # Station dimension
    s_var = self.filter_by_attrs(cf_role='timeseries_id')[0]
    if s_var.ndim == 1:
        s_dim = self.dimensions[s_var.dimensions[0]]
    elif s_var.ndim == 0:
        s_dim = None
    else:
        raise ValueError('Number of dimensions on the station (timeseries_id) variable must be 0 or 1')

    # Station index
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)
    if not r_index_var:
        # A reduced netCDF file, set station to 0 so it pulls the first value
        # of the variable that identifies the stations
        r_index_var = [0]
    else:
        r_index_var = r_index_var[0]

    # Sample (obs) dimension
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__)
        )
    else:
        o_index_var = o_index_var[0]  # Sample dimension

    # Since this is a flat dataframe, everything is the length of the obs dimension
    row_sizes = o_index_var[:]
    o_dim = self.dimensions[o_index_var.sample_dimension]

    profile_indexes = normalize_countable_array(p_var, count_if_none=p_dim.size)
    p = np.repeat(profile_indexes, row_sizes)

    stat_indexes = normalize_countable_array(s_var, count_if_none=s_dim.size)
    r = np.ma.masked_all(o_dim.size, dtype=stat_indexes.dtype)

    # Lat and Lon are on the station dimension
    xvar = axv.x
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)
    yvar = axv.y
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)
    si = 0
    for i in np.arange(stat_indexes.size):
        ei = si + o_index_var[i]
        r[si:ei] = np.array(stat_indexes[r_index_var[i]])
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei
    x = generic_masked(x, minv=-180, maxv=180)
    y = generic_masked(y, minv=-90, maxv=90)

    # Time and Z are on the sample (obs) dimension
    tvar = axv.t
    t = get_masked_datetime_array(
        generic_masked(tvar[:].flatten(), attrs=self.vatts(tvar.name)),
        tvar
    )
    z = generic_masked(axv.z[:].flatten(), attrs=self.vatts(axv.z.name))

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, r),
        (axes.profile, p)
    ])

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the station and row index variables
    del extract_vars[o_index_var.name]
    del extract_vars[r_index_var.name]
    # Axes variables are already processed so skip them
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            vdata = generic_masked(
                np.repeat(dvar[:].flatten().astype(dvar.dtype), row_sizes),
                attrs=self.vatts(dnam)
            )

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clean_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    # The index variable (trajectory_index) is identified by having an
    # attribute with name of instance_dimension whose value is the instance
    # dimension name (trajectory in this example). The index variable must
    # have the profile dimension as its sole dimension, and must be type
    # integer. Each value in the index variable is the zero-based trajectory
    # index that the profile belongs to i.e. profile p belongs to trajectory
    # i=trajectory_index(p), as in section H.2.5.
    r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)
    if not r_index_var:
        raise ValueError(
            'Could not find the "instance_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__)
        )
    else:
        r_index_var = r_index_var[0]
        p_dim = self.dimensions[r_index_var.dimensions[0]]  # Profile dimension
        # We should probably use this below to test for dimensionality of variables?
        # r_dim = self.dimensions[r_index_var.instance_dimension]  # Trajectory dimension

    # The count variable (row_size) contains the number of elements for
    # each profile, which must be written contiguously. The count variable
    # is identified by having an attribute with name sample_dimension whose
    # value is the sample dimension (obs in this example) being counted. It
    # must have the profile dimension as its sole dimension, and must be
    # type integer
    o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
    if not o_index_var:
        raise ValueError(
            'Could not find the "sample_dimension" attribute on any variables, '
            'is this a valid {}?'.format(self.__class__.__name__)
        )
    else:
        o_index_var = o_index_var[0]
        o_dim = self.dimensions[o_index_var.sample_dimension]  # Sample dimension

    profile_indexes = normalize_countable_array(axv.profile, count_if_none=p_dim.size)
    p = np.ma.masked_all(o_dim.size, dtype=profile_indexes.dtype)

    traj_indexes = normalize_countable_array(axv.trajectory)
    r = np.ma.masked_all(o_dim.size, dtype=traj_indexes.dtype)

    tvar = axv.t
    t = np.ma.masked_all(o_dim.size, dtype=tvar.dtype)

    xvar = axv.x
    x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)

    yvar = axv.y
    y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)

    si = 0

    # Sample (obs) dimension
    zvar = axv.z
    z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

    for i in np.arange(profile_indexes.size):
        ei = si + o_index_var[i]
        p[si:ei] = profile_indexes[i]
        r[si:ei] = np.array(traj_indexes[r_index_var[i]])
        t[si:ei] = tvar[i]
        x[si:ei] = xvar[i]
        y[si:ei] = yvar[i]
        si = ei

    # T
    nt = get_masked_datetime_array(t, tvar).flatten()

    # X and Y
    x = generic_masked(x, minv=-180, maxv=180)
    y = generic_masked(y, minv=-90, maxv=90)

    df_data = OrderedDict([
        (axes.t, nt),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.trajectory, r),
        (axes.profile, p)
    ])

    building_index_to_drop = np.ones(o_dim.size, dtype=bool)

    extract_vars = copy(self.variables)
    # Skip the traj and row index variables
    del extract_vars[o_index_var.name]
    del extract_vars[r_index_var.name]
    # Axes variables are already processed so skip them
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):

        # Profile dimensions
        if dvar.dimensions == (p_dim.name,):
            vdata = np.ma.masked_all(o_dim.size, dtype=dvar.dtype)
            si = 0
            for j in np.arange(profile_indexes.size):
                ei = si + o_index_var[j]
                vdata[si:ei] = dvar[j]
                si = ei

        # Sample dimensions
        elif dvar.dimensions == (o_dim.name,):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        else:
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".format(dnam))
                    continue
            else:
                L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clean_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes, skip=[axes.profile, axes.station])

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t).flatten()

    # X
    x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name)).flatten()

    # Y
    y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name)).flatten()

    # Z
    z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name)).flatten()

    # Trajectories
    rvar = axv.trajectory
    p = normalize_countable_array(rvar)

    # The dimension that the trajectory id variable doesn't have is what
    # the trajectory data needs to be repeated by
    dim_diff = self.dimensions[list(set(axv.t.dimensions).difference(set(rvar.dimensions)))[0]]
    if dim_diff:
        p = p.repeat(dim_diff.size)

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.trajectory, p)
    ])

    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
            vdata = vdata[0]
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    svar = axv.station
    s = normalize_countable_array(svar)

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t)
    n_times = t.size

    # X
    x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name))

    # Y
    y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name))

    # Z
    z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name))
    n_z = z.size

    # Denormalize the table structure
    t = np.repeat(t, s.size * n_z)
    z = np.tile(np.repeat(z, s.size), n_times)
    s = np.tile(s, n_z * n_times)
    y = np.tile(y, n_times * n_z)
    x = np.tile(x, n_times * n_z)

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, s),
    ])

    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
            vdata = vdata[0]
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes

    _ = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(daxes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))
        trajectory[:] = np.array(unique_trajectories)

        # Calculate the max number of profiles
        unique_profiles = df[axes.profile].unique()
        num_profiles = len(unique_profiles)
        nc.createDimension(daxes.profile, num_profiles)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
        profile[:] = np.array(unique_profiles)

        # Get unique obs by grouping on traj and profile and getting the max size
        num_obs = len(df)
        nc.createDimension(daxes.sample, num_obs)

        # The trajectory this profile belongs to
        t_ind = nc.createVariable('trajectoryIndex', 'i4', (daxes.profile,))
        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

        # Create all of the axis variables
        time = nc.createVariable(axes.t, 'f8', (daxes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (daxes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (daxes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        # Axes variables are already processed so skip them
        data_columns = [ d for d in df.columns if d not in axes ]

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Variables defined on only the profile axis
        profile_vars = kwargs.pop('profile_vars', [])
        profile_columns = [ p for p in profile_vars if p in data_columns ]
        for c in profile_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.profile,),
                    df[c],
                    zlib=True,
                    complevel=1
                )

        for i, (_, trg) in enumerate(trajectory_groups):
            for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                time[j] = get_ncdata_from_series(pfg[axes.t], time).astype('f8')[0]
                latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                row_size[j] = len(pfg)
                t_ind[j] = i

                # Save any profile variables on the "profile" index using the first value found
                # in the column.
                for c in profile_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(pfg[c], v)[0]
                    try:
                        v[j] = vvalues
                    except BaseException:
                        L.exception('Failed to add {}'.format(c))
                        continue

        # Add back in the z axis that was removed when calculating data_columns
        # and ignore variables that were stored in the profile index
        sample_columns = [ f for f in data_columns + [axes.z] if f not in profile_columns ]
        skips = ['trajectoryIndex', 'rowSize']
        for c in [ d for d in sample_columns if d not in skips ]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            try:
                v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return ContiguousRaggedTrajectoryProfile(output, **kwargs)
def to_dataframe(self, clean_cols=False, clean_rows=False, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))

    axv = get_mapped_axes_variables(self, axes)

    # T
    t = get_masked_datetime_array(axv.t[:], axv.t)

    # X
    x = generic_masked(axv.x[:].repeat(t.size), attrs=self.vatts(axv.x.name))

    # Y
    y = generic_masked(axv.y[:].repeat(t.size), attrs=self.vatts(axv.y.name))

    # Z
    if axv.z is not None:
        z = generic_masked(axv.z[:].repeat(t.size), attrs=self.vatts(axv.z.name))
    else:
        z = None

    svar = axv.station
    s = normalize_countable_array(svar)
    s = np.repeat(s, t.size)

    # Now repeat t per station. Figure out if this is a single-station file
    # by checking the dimensionality of the x variable.
    if axv.x.ndim == 1:
        t = np.repeat(t, len(svar))

    df_data = OrderedDict([
        (axes.t, t),
        (axes.x, x),
        (axes.y, y),
        (axes.z, z),
        (axes.station, s),
    ])

    # Start with all rows marked for removal; variables with unmasked values
    # will unmark the rows they cover below
    building_index_to_drop = np.ones(t.size, dtype=bool)

    # Axes variables are already processed so skip them
    extract_vars = copy(self.variables)
    for ncvar in axv._asdict().values():
        if ncvar is not None and ncvar.name in extract_vars:
            del extract_vars[ncvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

        # Carry through size 1 variables
        if vdata.size == 1:
            if vdata[0] is np.ma.masked:
                L.warning("Skipping variable {} that is completely masked".format(dnam))
                continue
        else:
            if dvar[:].flatten().size != t.size:
                L.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue

        # Mark rows with data so we don't remove them with clean_rows
        if vdata.size == building_index_to_drop.size:
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        # Handle scalars here at the end
        if vdata.size == 1:
            vdata = vdata[0]

        df_data[dnam] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
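# Read-back sketch for the method above (assuming it belongs to the same
# OrthogonalMultidimensionalTimeseries class written further below; the file
# name is a placeholder). clean_cols drops columns that are entirely NaN and
# clean_rows drops rows where every data variable is masked.
from pocean.dsg import OrthogonalMultidimensionalTimeseries

with OrthogonalMultidimensionalTimeseries('example.nc') as ncd:
    df = ncd.to_dataframe(clean_cols=True, clean_rows=True)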
@classmethod
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [d for d in df.columns if d not in axes]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    # Downcast anything from int64 to int32
    df = downcast_dataframe(df)

    # Make a new index that is the Cartesian product of all of the values from
    # all of the levels of the old index. This is so we don't have to iterate
    # over anything. The full column of data will be able to be shaped to the
    # size of the final unique sized dimensions.
    index_order = [axes.t, axes.z, axes.station]
    df = df.set_index(index_order)
    df = df.reindex(pd.MultiIndex.from_product(df.index.levels, names=index_order))

    unique_z = df.index.get_level_values(axes.z).unique().values
    unique_t = df.index.get_level_values(axes.t).unique().tolist()  # tolist converts to Timestamp
    all_stations = df.index.get_level_values(axes.station)
    unique_s = all_stations.unique()

    with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

        if reduce_dims is True and unique_s.size == 1:
            # If there is only a single station, we can reduce that dimension
            # if it is of size 1
            def ts():
                return np.s_[:, :]
            default_dimensions = (axes.t, axes.z)
            station_dimensions = ()
        else:
            def ts():
                return np.s_[:, :, :]
            default_dimensions = (axes.t, axes.z, axes.station)
            station_dimensions = (axes.station,)
            nc.createDimension(axes.station, unique_s.size)

        station = nc.createVariable(axes.station, get_dtype(unique_s), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # Assign in a loop because VLEN variables (strings) have to be assigned
        # by integer index and we need to find the lat/lon based on station index
        for si, st in enumerate(unique_s):
            station[si] = st
            latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
            longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        if unlimited is True:
            nc.createDimension(axes.t, None)
        else:
            nc.createDimension(axes.t, len(unique_t))
        time = nc.createVariable(axes.t, 'f8', (axes.t,))
        time[:] = nc4.date2num(unique_t, units=cls.default_time_unit)

        nc.createDimension(axes.z, unique_z.size)
        z = nc.createVariable(axes.z, get_dtype(unique_z), (axes.z,))
        z[:] = unique_z

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        for c in data_columns:
            # Create the variable if it doesn't exist
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                })
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            v[ts()] = vvalues.reshape(len(unique_t), unique_z.size, unique_s.size)

        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
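# The Cartesian-product reindex above is the trick that lets whole columns be
# reshaped to (t, z, station) without iterating. A standalone pandas sketch
# with toy data (names are illustrative):
import pandas as pd

toy = pd.DataFrame({
    't': pd.to_datetime(['2020-01-01', '2020-01-01', '2020-01-02']),
    'z': [0.0, 10.0, 0.0],
    'station': ['s1', 's1', 's1'],
    'temp': [15.0, 14.0, 15.5],
}).set_index(['t', 'z', 'station'])

# Reindex to every (t, z, station) combination; combinations missing from the
# data become NaN rows, so each column can be reshaped cleanly.
full = toy.reindex(pd.MultiIndex.from_product(toy.index.levels, names=['t', 'z', 'station']))
assert len(full) == 2 * 2 * 1  # 2 times x 2 depths x 1 station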
@classmethod
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [d for d in df.columns if d not in axes]

    reduce_dims = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimensions to avoid a dimension and a coordinate having
        # the same name, which is not supported in xarray
        changed_axes = {k: '{}_dim'.format(v) for k, v in axes._asdict().items()}
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

        station_group = df.groupby(axes.station)
        num_stations = len(station_group)
        has_z = axes.z is not None

        if reduce_dims is True and num_stations == 1:
            # If there is only a single station, we can reduce that dimension
            # if it is of size 1
            def ts(i):
                return np.s_[:]
            default_dimensions = (daxes.t,)
            station_dimensions = ()
        else:
            def ts(i):
                return np.s_[i, :]
            default_dimensions = (daxes.station, daxes.t)
            station_dimensions = (daxes.station,)
            nc.createDimension(daxes.station, num_stations)

        # Set the coordinates attribute correctly
        coordinates = [axes.t, axes.x, axes.y]
        if has_z is True:
            coordinates.insert(1, axes.z)
        coordinates = ' '.join(coordinates)

        # Assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        nc.createDimension(daxes.t, t.size)
        time = nc.createVariable(axes.t, 'f8', (daxes.t,))
        station = nc.createVariable(axes.station, get_dtype(df[axes.station]), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)
        if has_z is True:
            z = nc.createVariable(
                axes.z,
                get_dtype(df[axes.z]),
                station_dimensions,
                fill_value=df[axes.z].dtype.type(cls.default_fill_value)
            )

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        time[:] = get_ncdata_from_series(t, time)

        # Create vars based on the full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': coordinates
                })

        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]

            if has_z is True:
                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]

            # Write each data column for this station
            for c in data_columns:
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[ts(i)] = vvalues
                except BaseException:
                    L.debug('{} was not written. Likely a metadata variable'.format(v.name))

        # Set global attributes
        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
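# Usage sketch for the timeseries writer above, exercising its two optional
# behaviors. The column names and single-station DataFrame are illustrative
# assumptions.
import pandas as pd
from pocean.dsg import OrthogonalMultidimensionalTimeseries

single = pd.DataFrame({
    'station': ['buoy1'] * 3,
    't': pd.to_datetime(['2020-01-01T00:00', '2020-01-01T01:00', '2020-01-01T02:00']),
    'x': [-70.0] * 3,
    'y': [40.0] * 3,
    'z': [0.0] * 3,
    'temperature': [15.2, 15.1, 15.3],
})

# reduce_dims=True drops the size-1 station dimension from data variables;
# unique_dims=True renames dimensions (e.g. 'station' -> 'station_dim') so
# xarray can open the file without a dimension/coordinate name clash.
OrthogonalMultidimensionalTimeseries.from_dataframe(
    single, 'ts.nc', reduce_dims=True, unique_dims=True
)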