def nc_attributes(self, axes):
    """Return the default attributes for this feature type, merged over the parent's.

    Sets the CF ``featureType``/``cdm_data_type`` globals, marks the station
    variable with its ``cf_role``, and stamps the CF axis hints on x/y/z/t.
    """
    inherited = super(OrthogonalMultidimensionalTimeseriesProfile, self).nc_attributes()
    defaults = {
        'global': {
            'featureType': 'timeSeriesProfile',
            'cdm_data_type': 'TimeseriesProfile'
        },
        axes.station: {
            'cf_role': 'timeseries_id',
            'long_name': 'station identifier'
        },
    }
    # Spatial axes only carry their CF "axis" letter.
    for ax_name, ax_letter in ((axes.x, 'X'), (axes.y, 'Y'), (axes.z, 'Z')):
        defaults[ax_name] = {'axis': ax_letter}
    defaults[axes.t] = {
        'units': self.default_time_unit,
        'standard_name': 'time',
        'axis': 'T'
    }
    return dict_update(inherited, defaults)
def create_dataset(file, reader_class, deployments_path, subset, template, profile_id_type, **filter_args):
    """Process a single deployment file into a netCDF dataset.

    Locates the deployment directory containing ``file`` under
    ``deployments_path``, merges config-file filters with the keyword filters
    passed in, runs the reader, and writes the resulting netCDF file.

    :param file: path to the raw data file to process
    :param reader_class: reader implementation handed to ``process_dataset``
    :param deployments_path: root directory containing deployment folders
    :param subset: forwarded to ``create_netcdf``
    :param template: attribute template name or path, forwarded to ``read_attrs``
    :param profile_id_type: forwarded to ``create_netcdf``
    :param filter_args: optional filter overrides; ``None`` values are ignored
    :return: result of ``create_netcdf``, or ``1`` if processing produced no data
    """
    # Remove None filters from the arguments
    filter_args = {k: v for k, v in filter_args.items() if v is not None}
    # Figure out the netCDF output path based on the file and the deployments_path.
    # Walk up the file's parents until we hit deployments_path; the last parent
    # seen before that is the individual deployment directory.
    dep_path = Path(deployments_path)
    file_path = Path(file)
    individual_dep_path = None
    for pp in file_path.parents:
        if dep_path == pp:
            break
        individual_dep_path = pp
    # NOTE(review): if `file` is not located under `deployments_path`,
    # individual_dep_path can remain None and the next line raises — confirm
    # callers always pass a file inside the deployments tree.
    config_path = individual_dep_path / 'config'
    # Extract the filters from the config and override with passed in filters that are not None
    attrs = read_attrs(config_path, template=template)
    file_filters = attrs.pop('filters', {})
    filters = dict_update(file_filters, filter_args)
    processed_df, mode = process_dataset(file, reader_class, **filters)
    if processed_df is None:
        # Nothing usable was produced; signal failure with a non-zero code.
        return 1
    output_path = individual_dep_path / mode / 'netcdf'
    return create_netcdf(attrs, processed_df, output_path, mode, profile_id_type, subset=subset)
def nc_attributes(self):
    """Return default attributes for this timeseries type, layered over the parent's."""
    inherited = super(OrthogonalMultidimensionalTimeseries, self).nc_attributes()
    overrides = {}
    overrides['global'] = {
        'featureType': 'timeseries',
        'cdm_data_type': 'Timeseries'
    }
    overrides['station'] = {
        'cf_role': 'timeseries_id',
        'long_name': 'station identifier'
    }
    overrides['time'] = {
        'units': self.default_time_unit,
        'standard_name': 'time',
        'axis': 'T'
    }
    # CF axis hints for the coordinate variables.
    overrides['latitude'] = {'axis': 'Y'}
    overrides['longitude'] = {'axis': 'X'}
    overrides['z'] = {'axis': 'Z'}
    return dict_update(inherited, overrides)
def nc_attributes(self, axes, daxes):
    """Return the default attributes for a CF profile file, merged over the parent's."""
    inherited = super(IncompleteMultidimensionalProfile, self).nc_attributes()
    defaults = {
        'global': {
            'featureType': 'profile',
            'cdm_data_type': 'Profile'
        },
        axes.profile: {
            'cf_role': 'profile_id',
            'long_name': 'profile identifier'
        },
    }
    # Spatial axes only carry their CF "axis" letter.
    for ax_name, ax_letter in ((axes.x, 'X'), (axes.y, 'Y'), (axes.z, 'Z')):
        defaults[ax_name] = {'axis': ax_letter}
    defaults[axes.t] = {
        'units': self.default_time_unit,
        'standard_name': 'time',
        'axis': 'T'
    }
    return dict_update(inherited, defaults)
def nc_attributes(self, axes, daxes):
    """Return the default attributes for a contiguous ragged trajectory file."""
    inherited = super(ContiguousRaggedTrajectory, self).nc_attributes()
    defaults = {
        'global': {
            'featureType': 'trajectory',
            'cdm_data_type': 'Trajectory'
        },
        axes.trajectory: {
            'cf_role': 'trajectory_id',
            'long_name': 'trajectory identifier',
            'ioos_category': 'identifier'
        },
    }
    # Spatial axes only carry their CF "axis" letter.
    for ax_name, ax_letter in ((axes.x, 'X'), (axes.y, 'Y'), (axes.z, 'Z')):
        defaults[ax_name] = {'axis': ax_letter}
    defaults[axes.t] = {
        'units': self.default_time_unit,
        'standard_name': 'time',
        'axis': 'T'
    }
    # rowSize links each trajectory to its span along the sample dimension.
    defaults['rowSize'] = {
        'sample_dimension': daxes.sample
    }
    return dict_update(inherited, defaults)
def get_calculated_attributes(df, axes=None, history=None):
    """Derive netCDF global attributes automatically from the data itself.

    Wraps the four individual attribute helpers (geographic, vertical,
    temporal, creation), which may also be called separately.

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :param history: text initializing the audit trail for modifications to the
        original data (optional, string)
    :return: dictionary of global attributes
    """
    axes = get_default_axes(axes)
    combined = get_geographic_attributes(df, axes)
    # Later helpers take precedence over earlier ones on key collisions.
    for extra in (
        get_vertical_attributes(df, axes),
        get_temporal_attributes(df, axes),
        get_creation_attributes(history),
    ):
        combined = dict_update(combined, extra)
    return combined
def read_attrs(config_path=None, template=None):
    """Load netCDF attributes from a template plus optional per-deployment config.

    Precedence (lowest to highest): template defaults, then
    ``instruments.json``, then ``deployment.json`` from ``config_path``.

    :param config_path: directory holding instruments.json / deployment.json (optional)
    :param template: a template name (resolved under ./templates) or a direct
        path to a JSON file; defaults to 'trajectory'
    :return: merged attribute dictionary
    """
    def cfg_file(name):
        # Helper for building paths inside the config directory; only called
        # when config_path is truthy.
        return os.path.join(
            config_path,
            name
        )
    template = template or 'trajectory'
    if os.path.isfile(template):
        # The caller passed an explicit path to a template file.
        default_attrs_path = template
    else:
        # Resolve a named template shipped alongside this module.
        template_dir = os.path.join(os.path.dirname(__file__), 'templates')
        default_attrs_path = os.path.join(template_dir, '{}.json'.format(template))
        if not os.path.isfile(default_attrs_path):
            # Unknown template name: warn and fall back to the trajectory template.
            L.error("Template path {} not found, using defaults.".format(default_attrs_path))
            default_attrs_path = os.path.join(template_dir, 'trajectory.json')
    # Load in template defaults
    defaults = dict(MetaInterface.from_jsonfile(default_attrs_path))
    # Load instruments
    ins = {}
    if config_path:
        ins_attrs_path = cfg_file("instruments.json")
        if os.path.isfile(ins_attrs_path):
            ins = dict(MetaInterface.from_jsonfile(ins_attrs_path))
    # Load deployment attributes (including some global attributes)
    deps = {}
    if config_path:
        deps_attrs_path = cfg_file("deployment.json")
        if os.path.isfile(deps_attrs_path):
            deps = dict(MetaInterface.from_jsonfile(deps_attrs_path))
    # Update, highest precedence updates last
    one = dict_update(defaults, ins)
    two = dict_update(one, deps)
    return two
def nc_attributes(self, axes, daxes):
    """Return default attributes for a ragged timeseries-profile file, merged over the parent's."""
    inherited = super(RaggedTimeseriesProfile, self).nc_attributes()
    defaults = {
        'global': {
            'featureType': 'timeSeriesProfile',
            'cdm_data_type': 'TimeseriesProfile',
            'cdm_timeseries_variables': axes.station,
            'cdm_profile_variables': axes.profile,
            'subsetVariables': '{x},{y},{t},{station}'.format(**axes._asdict())
        },
        axes.station: {
            'cf_role': 'timeseries_id',
            'long_name': 'station identifier',
            'ioos_category': 'identifier'
        },
        axes.profile: {
            'cf_role': 'profile_id',
            'long_name': 'profile identifier',
            'ioos_category': 'identifier'
        },
    }
    # Spatial axes only carry their CF "axis" letter.
    for ax_name, ax_letter in ((axes.x, 'X'), (axes.y, 'Y'), (axes.z, 'Z')):
        defaults[ax_name] = {'axis': ax_letter}
    defaults[axes.t] = {
        'units': self.default_time_unit,
        'standard_name': 'time',
        'axis': 'T'
    }
    # Ragged-array index variables tying profiles to stations and samples.
    defaults['stationIndex'] = {
        'long_name': 'which station this profile belongs to',
        'instance_dimension': daxes.station
    }
    defaults['rowSize'] = {
        'long_name': 'number of obs in this profile',
        'sample_dimension': daxes.sample
    }
    return dict_update(inherited, defaults)
def nc_attributes(self):
    """Return default trajectory attributes layered over the parent class defaults."""
    inherited = super(IncompleteMultidimensionalTrajectory, self).nc_attributes()
    overrides = {}
    overrides['global'] = {
        'featureType': 'trajectory',
        'cdm_data_type': 'Trajectory'
    }
    overrides['trajectory'] = {
        'cf_role': 'trajectory_id',
        'long_name': 'trajectory identifier'
    }
    overrides['distance'] = {
        'long_name': 'Great circle distance between trajectory points',
        'standard_name': 'distance_between_trajectory_points',
        'units': 'm'
    }
    return dict_update(inherited, overrides)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional CF profile file.

    :param df: DataFrame containing the axis columns named in ``axes``
        (profile, t, x, y, z); every other column becomes a data variable
        dimensioned (profile, z).
    :param output: path of the netCDF file to create
    :param kwargs: ``axes``, ``unlimited``, ``unique_dims`` and ``attributes``
        are consumed here; the remainder is forwarded to the returned instance.
    :return: a read handle on the newly written file
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [d for d in df.columns if d not in axes]
    unlimited = kwargs.pop('unlimited', False)
    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimensions to avoid a dimension and coordinate having the
        # same name, which is not supported in xarray
        changed_axes = {
            k: '{}_dim'.format(v)
            for k, v in axes._asdict().items()
        }
        daxes = get_default_axes(changed_axes)
    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))
    with IncompleteMultidimensionalProfile(output, 'w') as nc:
        profile_group = df.groupby(axes.profile)
        if unlimited is True:
            # None makes the profile dimension unlimited in netCDF
            max_profiles = None
        else:
            max_profiles = df[axes.profile].unique().size
        nc.createDimension(daxes.profile, max_profiles)
        # The z dimension is sized to the largest profile; shorter profiles
        # are padded with the fill value.
        max_zs = profile_group.size().max()
        nc.createDimension(daxes.z, max_zs)
        # Metadata variables
        nc.createVariable('crs', 'i4')
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile, ))
        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', (daxes.profile, ))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (daxes.profile, ))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (daxes.profile, ))
        z = nc.createVariable(
            axes.z,
            get_dtype(df[axes.z]),
            (daxes.profile, daxes.z),
            fill_value=df[axes.z].dtype.type(cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))
        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(nc, var_name, (daxes.profile, daxes.z), df[c], zlib=True, complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
        # Write values for each profile within profile_group
        for i, (uid, pdf) in enumerate(profile_group):
            profile[i] = uid
            # Each profile gets a single time/lat/lon taken from its first row
            time[i] = date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)
            latitude[i] = pdf[axes.y].iloc[0]
            longitude[i] = pdf[axes.x].iloc[0]
            zvalues = pdf[axes.z].fillna(z._FillValue).values
            sl = slice(0, zvalues.size)
            z[i, sl] = zvalues
            for c in data_columns:
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(pdf[c], v)
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues
        # Set global attributes
        nc.update_attributes(attributes)
    return IncompleteMultidimensionalProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an orthogonal multidimensional timeseries file.

    Legacy variant with hard-coded column names: expects columns named
    'station', 't', 'x', 'y', 'z'; all other columns become data variables
    dimensioned (station, time).

    :param df: DataFrame with the reserved columns above
    :param output: path of the netCDF file to create
    :param kwargs: ``attributes`` is consumed; the rest is forwarded to the
        returned instance.
    """
    reserved_columns = ['station', 't', 'x', 'y', 'z']
    data_columns = [d for d in df.columns if d not in reserved_columns]
    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:
        station_group = df.groupby('station')
        num_stations = len(station_group)
        # assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf.t
        # Metadata variables
        nc.createVariable('crs', 'i4')
        # Create all of the variables
        nc.createDimension('time', t.size)
        nc.createDimension('station', num_stations)
        station = nc.createVariable('station', get_dtype(df.station), ('station',))
        time = nc.createVariable('time', 'f8', ('time',))
        latitude = nc.createVariable('latitude', get_dtype(df.y), ('station',))
        longitude = nc.createVariable('longitude', get_dtype(df.x), ('station',))
        z = nc.createVariable('z', get_dtype(df.z), ('station',), fill_value=df.z.dtype.type(cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(), kwargs.pop('attributes', {}))
        logger.info(df.t.values.dtype)
        # Shared time coordinate taken from the first station's group
        time[:] = nc4.date2num(t.tolist(), units=cls.default_time_unit)
        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf.y.iloc[0]
            longitude[i] = sdf.x.iloc[0]
            # TODO: write a test for a Z with a _FillValue
            z[i] = sdf.z.iloc[0]
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    if var_name not in attributes:
                        attributes[var_name] = {}
                    if sdf[c].dtype == np.dtype('datetime64[ns]'):
                        # Datetime columns are stored as f8 time offsets
                        fv = np.dtype('f8').type(cls.default_fill_value)
                        v = nc.createVariable(var_name, 'f8', ('station', 'time',), fill_value=fv)
                        tvalues = pd.Series(nc4.date2num(sdf[c].tolist(), units=cls.default_time_unit))
                        attributes[var_name] = dict_update(attributes[var_name], {
                            'units': cls.default_time_unit
                        })
                    elif np.issubdtype(sdf[c].dtype, 'S') or sdf[c].dtype == object:
                        # AttributeError: cannot set _FillValue attribute for VLEN or compound variable
                        v = nc.createVariable(var_name, get_dtype(sdf[c]), ('station', 'time',))
                    else:
                        v = nc.createVariable(var_name, get_dtype(sdf[c]), ('station', 'time',), fill_value=sdf[c].dtype.type(cls.default_fill_value))
                    attributes[var_name] = dict_update(attributes[var_name], {
                        'coordinates' : 'time latitude longitude z',
                    })
                else:
                    v = nc.variables[var_name]
                # NOTE(review): for stations after the first, `tvalues` still
                # holds the previous station's converted times (it is only
                # recomputed when the variable is first created) — confirm
                # whether datetime data columns are expected with >1 station.
                if sdf[c].dtype == np.dtype('datetime64[ns]'):
                    vvalues = tvalues.fillna(v._FillValue).values
                elif hasattr(v, '_FillValue'):
                    vvalues = sdf[c].fillna(v._FillValue).values
                else:
                    # Use an empty string... better than nothing!
                    vvalues = sdf[c].fillna('').values
                try:
                    v[i, :] = vvalues
                except BaseException:
                    logger.error('{} NOPE'.format(v.name))
        # Set global attributes
        nc.update_attributes(attributes)
    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a contiguous ragged trajectory-profile file.

    Profiles are instance variables; observations are laid out contiguously
    along the sample dimension, with ``rowSize`` giving each profile's length
    and ``trajectoryIndex`` linking profiles to trajectories.

    :param df: DataFrame with trajectory/profile/t/x/y/z axis columns
    :param output: path of the netCDF file to create
    :param kwargs: ``axes`` and ``attributes`` are consumed; ``reduce_dims``
        and ``unlimited`` are accepted but ignored here.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    _ = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)
    with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:
        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(axes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))
        trajectory[:] = np.array(unique_trajectories)
        # Calculate the max number of profiles
        unique_profiles = df[axes.profile].unique()
        num_profiles = len(unique_profiles)
        nc.createDimension(axes.profile, num_profiles)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
        profile[:] = np.array(unique_profiles)
        # Get unique obs by grouping on traj and profile and getting the max size
        num_obs = len(df)
        nc.createDimension(axes.sample, num_obs)
        # The trajectory this profile belongs to
        t_ind = nc.createVariable('trajectoryIndex', 'i4', (axes.profile,))
        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (axes.profile,))
        # Create all of the axis variables
        time = nc.createVariable(axes.t, 'f8', (axes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))
        # Axes variables are already processed so skip them
        data_columns = [d for d in df.columns if d not in axes]
        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))
        for i, (_, trg) in enumerate(trajectory_groups):
            # NOTE(review): `j` restarts at 0 for every trajectory, so with
            # multiple trajectories later ones overwrite the profile-indexed
            # variables of earlier ones — confirm whether a running profile
            # index was intended here.
            for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                # Each profile gets a single time/lat/lon from its first row
                time[j] = get_ncdata_from_series(pfg[axes.t], time)[0]
                latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                row_size[j] = len(pfg)
                t_ind[j] = i
        # Add back in the z axes that was removed when calculating data_columns
        data_columns = data_columns + [axes.z]
        skips = ['trajectoryIndex', 'rowSize']
        for c in [d for d in data_columns if d not in skips]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (axes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]
            vvalues = get_ncdata_from_series(df[c], v)
            try:
                v[:] = vvalues
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue
        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')
        # Set attributes
        nc.update_attributes(attributes)
    return ContiguousRaggedTrajectoryProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional CF profile file.

    Older variant: dimensions reuse the axis names directly (no separate
    ``daxes``) and data variables are created lazily per profile group.

    :param df: DataFrame with the axis columns named in ``axes``
    :param output: path of the netCDF file to create
    :param kwargs: ``axes``, ``unlimited`` and ``attributes`` are consumed;
        the rest is forwarded to the returned instance.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [d for d in df.columns if d not in axes]
    unlimited = kwargs.pop('unlimited', False)
    with IncompleteMultidimensionalProfile(output, 'w') as nc:
        profile_group = df.groupby(axes.profile)
        if unlimited is True:
            # None makes the profile dimension unlimited in netCDF
            max_profiles = None
        else:
            max_profiles = df[axes.profile].unique().size
        nc.createDimension(axes.profile, max_profiles)
        # z dimension sized to the longest profile; shorter ones are padded
        max_zs = profile_group.size().max()
        nc.createDimension(axes.z, max_zs)
        # Metadata variables
        nc.createVariable('crs', 'i4')
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', (axes.profile,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.profile, axes.z), fill_value=df[axes.z].dtype.type(cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))
        for i, (uid, pdf) in enumerate(profile_group):
            profile[i] = uid
            # Each profile gets a single time/lat/lon taken from its first row
            time[i] = nc4.date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)
            latitude[i] = pdf[axes.y].iloc[0]
            longitude[i] = pdf[axes.x].iloc[0]
            zvalues = pdf[axes.z].fillna(z._FillValue).values
            sl = slice(0, zvalues.size)
            z[i, sl] = zvalues
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.profile, axes.z),
                        pdf[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates' : '{} {} {} {}'.format(
                            axes.t, axes.z, axes.x, axes.y
                        )
                    })
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(pdf[c], v)
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues
        # Set global attributes
        nc.update_attributes(attributes)
    return IncompleteMultidimensionalProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an orthogonal multidimensional timeseries file.

    All stations are assumed to share one identical time coordinate; data
    variables are dimensioned (station, time).

    :param df: DataFrame with the axis columns named in ``axes``
    :param output: path of the netCDF file to create
    :param kwargs: ``axes`` and ``attributes`` are consumed; the rest is
        forwarded to the returned instance.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [d for d in df.columns if d not in axes]
    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:
        station_group = df.groupby(axes.station)
        num_stations = len(station_group)
        # assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]
        # Metadata variables
        nc.createVariable('crs', 'i4')
        # Create all of the variables
        nc.createDimension(axes.t, t.size)
        nc.createDimension(axes.station, num_stations)
        station = nc.createVariable(axes.station, get_dtype(df.station), (axes.station, ))
        time = nc.createVariable(axes.t, 'f8', (axes.t, ))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.station, ))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.station, ))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.station, ), fill_value=df[axes.z].dtype.type(
            cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))
        # tolist() converts to a python datetime object without timezone and has NaTs.
        g = t.tolist()
        # date2num converts NaTs to np.nan
        gg = nc4.date2num(g, units=cls.default_time_unit)
        # masked_invalid moves np.nan to a masked value
        time[:] = np.ma.masked_invalid(gg)
        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]
            # TODO: write a test for a Z with a _FillValue
            z[i] = sdf[axes.z].iloc[0]
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc, var_name, (axes.station, axes.t), sdf[c], zlib=True, complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[i, :] = vvalues
                except BaseException:
                    # Shape-mismatched columns are treated as metadata, not data
                    L.debug(
                        '{} was not written. Likely a metadata variable'.
                        format(v.name))
        # Set global attributes
        nc.update_attributes(attributes)
    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an orthogonal multidimensional timeseries file.

    Newer variant supporting a missing z axis, unique dimension names, and
    collapsing the station dimension when there is exactly one station.

    :param df: DataFrame with the axis columns named in ``axes``
    :param output: path of the netCDF file to create
    :param kwargs: ``axes``, ``reduce_dims``, ``unlimited``, ``unique_dims``
        and ``attributes`` are consumed; the rest is forwarded to the
        returned instance.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [d for d in df.columns if d not in axes]
    reduce_dims = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)
    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimensions to avoid a dimension and coordinate having the
        # same name, which is not supported in xarray
        changed_axes = {
            k: '{}_dim'.format(v)
            for k, v in axes._asdict().items()
        }
        daxes = get_default_axes(changed_axes)
    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))
    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:
        station_group = df.groupby(axes.station)
        num_stations = len(station_group)
        has_z = axes.z is not None
        if reduce_dims is True and num_stations == 1:
            # If a station, we can reduce that dimension if it is of size 1
            def ts(i):
                return np.s_[:]
            default_dimensions = (daxes.t, )
            station_dimensions = ()
        else:
            def ts(i):
                return np.s_[i, :]
            default_dimensions = (daxes.station, daxes.t)
            station_dimensions = (daxes.station, )
            nc.createDimension(daxes.station, num_stations)
        # Set the coordinates attribute correctly
        coordinates = [axes.t, axes.x, axes.y]
        if has_z is True:
            coordinates.insert(1, axes.z)
        coordinates = ' '.join(coordinates)
        # assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]
        # Metadata variables
        nc.createVariable('crs', 'i4')
        # Create all of the variables
        nc.createDimension(daxes.t, t.size)
        time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
        station = nc.createVariable(axes.station, get_dtype(df[axes.station]), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)
        if has_z is True:
            z = nc.createVariable(axes.z, get_dtype(df[axes.z]), station_dimensions, fill_value=df[axes.z].dtype.type(
                cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))
        # Shared time coordinate taken from the first station's group
        time[:] = get_ncdata_from_series(t, time)
        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(nc, var_name, default_dimensions, df[c], zlib=True, complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}),
                    {'coordinates': coordinates})
        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]
            if has_z is True:
                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[ts(i)] = vvalues
                except BaseException:
                    # Shape-mismatched columns are treated as metadata, not data
                    L.debug(
                        '{} was not written. Likely a metadata variable'.
                        format(v.name))
        # Set global attributes
        nc.update_attributes(attributes)
    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional trajectory file.

    Data variables are dimensioned (trajectory, sample); the trajectory
    dimension is dropped when ``reduce_dims`` is set and there is exactly one
    trajectory.

    :param df: DataFrame with the axis columns named in ``axes``
    :param output: path of the netCDF file to create
    :param kwargs: ``axes``, ``reduce_dims``, ``unlimited`` and ``attributes``
        are consumed; the rest is forwarded to the returned instance.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [d for d in df.columns if d not in axes]
    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)
    with IncompleteMultidimensionalTrajectory(output, 'w') as nc:
        trajectory_group = df.groupby(axes.trajectory)
        if unlimited is True:
            # None makes the sample dimension unlimited in netCDF
            max_obs = None
        else:
            max_obs = trajectory_group.size().max()
        nc.createDimension(axes.sample, max_obs)
        num_trajectories = len(trajectory_group)
        if reduce_dims is True and num_trajectories == 1:
            # If a singular trajectory, we can reduce that dimension if it is of size 1
            def ts(t_index, size):
                return np.s_[0:size]
            default_dimensions = (axes.sample, )
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
        else:
            def ts(t_index, size):
                return np.s_[t_index, 0:size]
            default_dimensions = (axes.trajectory, axes.sample)
            nc.createDimension(axes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory, ))
        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(
            cls.default_fill_value))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(
            cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(
            cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(
            cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))
        for i, (uid, gdf) in enumerate(trajectory_group):
            trajectory[i] = uid
            # tolist() converts to a python datetime object without timezone and has NaTs.
            g = gdf[axes.t].tolist()
            # date2num converts NaTs to np.nan
            gg = nc4.date2num(g, units=cls.default_time_unit)
            # masked_invalid moves np.nan to a masked value
            time[ts(i, gg.size)] = np.ma.masked_invalid(gg)
            lats = gdf[axes.y].fillna(get_fill_value(latitude)).values
            latitude[ts(i, lats.size)] = lats
            lons = gdf[axes.x].fillna(get_fill_value(longitude)).values
            longitude[ts(i, lons.size)] = lons
            zs = gdf[axes.z].fillna(get_fill_value(z)).values
            z[ts(i, zs.size)] = zs
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc, var_name, default_dimensions, gdf[c], zlib=True, complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(gdf[c], v)
                v[ts(i, vvalues.size)] = vvalues
        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')
        # Set attributes
        nc.update_attributes(attributes)
    return IncompleteMultidimensionalTrajectory(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a contiguous ragged trajectory file.

    Observations for each trajectory are stored contiguously along the sample
    dimension with ``rowSize`` holding each trajectory's length. Columns
    listed in the ``traj_vars`` kwarg are written once per trajectory.

    :param df: DataFrame with the axis columns named in ``axes``
    :param output: path of the netCDF file to create
    :param kwargs: ``axes``, ``reduce_dims`` (ignored), ``unlimited``,
        ``unique_dims``, ``traj_vars`` and ``attributes`` are consumed; the
        rest is forwarded to the returned instance.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    # Should never be a CR file with one trajectory so we ignore the "reduce_dims" attribute
    _ = kwargs.pop('reduce_dims', False)  # noqa
    unlimited = kwargs.pop('unlimited', False)
    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimensions to avoid a dimension and coordinate having the
        # same name, which is not supported in xarray
        changed_axes = {
            k: '{}_dim'.format(v)
            for k, v in axes._asdict().items()
        }
        daxes = get_default_axes(changed_axes)
    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))
    with ContiguousRaggedTrajectory(output, 'w') as nc:
        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(daxes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory, ))
        # Get unique obs by grouping on traj getting the max size
        if unlimited is True:
            nc.createDimension(daxes.sample, None)
        else:
            nc.createDimension(daxes.sample, len(df))
        # Number of observations in each trajectory
        row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory, ))
        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))
        # Variables defined on only the trajectory axis
        traj_vars = kwargs.pop('traj_vars', [])
        traj_columns = [p for p in traj_vars if p in df.columns]
        for c in traj_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                create_ncvar_from_series(nc, var_name, (daxes.trajectory, ), df[c], zlib=True, complevel=1)
        for i, (trajid, trg) in enumerate(trajectory_groups):
            trajectory[i] = trajid
            row_size[i] = len(trg)
            # Save any trajectory variables using the first value found
            # in the column.
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    continue
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(trg[c], v)[0]
                try:
                    v[i] = vvalues
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue
        # Add all of the columns based on the sample dimension. Take all columns and remove the
        # trajectory, rowSize and other trajectory based columns.
        sample_columns = [f for f in df.columns if f not in traj_columns + ['rowSize', axes.trajectory]]
        for c in sample_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(nc, var_name, (daxes.sample, ), df[c], zlib=True, complevel=1)
            else:
                v = nc.variables[var_name]
            vvalues = get_ncdata_from_series(df[c], v)
            try:
                if unlimited is True:
                    v[:] = vvalues
                else:
                    v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue
        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')
        # Set attributes
        nc.update_attributes(attributes)
    return ContiguousRaggedTrajectory(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional trajectory file.

    Legacy variant with hard-coded column names: expects 'trajectory', 't',
    'x', 'y', 'z' (and optionally 'distance'); all other columns become data
    variables dimensioned (trajectory, obs).

    :param df: DataFrame with the reserved columns above
    :param output: path of the netCDF file to create
    :param kwargs: ``attributes`` is consumed; the rest is forwarded to the
        returned instance.
    """
    reserved_columns = ['trajectory', 't', 'x', 'y', 'z', 'distance']
    data_columns = [d for d in df.columns if d not in reserved_columns]
    with IncompleteMultidimensionalTrajectory(output, 'w') as nc:
        trajectory_group = df.groupby('trajectory')
        # obs dimension sized to the longest trajectory; shorter ones padded
        max_obs = trajectory_group.size().max()
        unique_trajectories = df.trajectory.unique()
        nc.createDimension('trajectory', unique_trajectories.size)
        nc.createDimension('obs', max_obs)
        # Metadata variables
        nc.createVariable('crs', 'i4')
        trajectory = nc.createVariable('trajectory', get_dtype(df.trajectory), ('trajectory',))
        # Create all of the variables
        time = nc.createVariable('time', 'i4', ('trajectory', 'obs'), fill_value=int(cls.default_fill_value))
        z = nc.createVariable('z', get_dtype(df.z), ('trajectory', 'obs'), fill_value=df.z.dtype.type(cls.default_fill_value))
        latitude = nc.createVariable('latitude', get_dtype(df.y), ('trajectory', 'obs'), fill_value=df.y.dtype.type(cls.default_fill_value))
        longitude = nc.createVariable('longitude', get_dtype(df.x), ('trajectory', 'obs'), fill_value=df.x.dtype.type(cls.default_fill_value))
        if 'distance' in df:
            distance = nc.createVariable('distance', get_dtype(df.distance), ('trajectory', 'obs'), fill_value=df.distance.dtype.type(cls.default_fill_value))
        attributes = dict_update(nc.nc_attributes(), kwargs.pop('attributes', {}))
        for i, (uid, gdf) in enumerate(trajectory_group):
            trajectory[i] = uid
            # tolist() converts to a python datetime object without timezone
            g = gdf.t.fillna(999999).tolist()  # 999999 is a dummy value
            NaTs = gdf.t.isnull()
            # Mask out the rows that were NaT before the dummy fill
            timenums = np.ma.MaskedArray(nc4.date2num(g, units=cls.default_time_unit))
            timenums.mask = NaTs
            time[i, :] = timenums
            latitude[i, :] = gdf.y.fillna(latitude._FillValue).values
            longitude[i, :] = gdf.x.fillna(longitude._FillValue).values
            z[i, :] = gdf.z.fillna(z._FillValue).values
            if 'distance' in gdf:
                distance[i, :] = gdf.distance.fillna(distance._FillValue).values
            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    if np.issubdtype(gdf[c].dtype, 'S') or gdf[c].dtype == object:
                        # AttributeError: cannot set _FillValue attribute for VLEN or compound variable
                        v = nc.createVariable(var_name, get_dtype(gdf[c]), ('trajectory', 'obs'))
                    else:
                        v = nc.createVariable(var_name, get_dtype(gdf[c]), ('trajectory', 'obs'), fill_value=gdf[c].dtype.type(cls.default_fill_value))
                    if var_name not in attributes:
                        attributes[var_name] = {}
                    attributes[var_name] = dict_update(attributes[var_name], {
                        'coordinates' : 'time latitude longitude z',
                    })
                else:
                    v = nc.variables[var_name]
                if hasattr(v, '_FillValue'):
                    vvalues = gdf[c].fillna(v._FillValue).values
                else:
                    # Use an empty string... better than nothing!
                    vvalues = gdf[c].fillna('').values
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues
        # Set global attributes
        nc.update_attributes(attributes)
    return IncompleteMultidimensionalTrajectory(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a CF ragged (indexed) timeseries-profile netCDF file.

    Stations live on the station dimension, profiles on the profile dimension
    (linked back to stations via ``stationIndex``), and all data columns on the
    flat sample dimension with per-profile lengths recorded in ``rowSize``.
    Returns a read handle to the written file.

    Recognized kwargs: ``axes``, ``reduce_dims``, ``unlimited``, ``unique_dims``,
    ``attributes``.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)
    unique_dims = kwargs.pop('unique_dims', False)

    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with RaggedTimeseriesProfile(output, 'w') as nc:
        station_groups = df.groupby(axes.station)
        unique_stations = list(station_groups.groups.keys())
        num_stations = len(unique_stations)

        # Calculate the max number of profiles
        profile_groups = df.groupby(axes.profile)
        unique_profiles = list(profile_groups.groups.keys())
        num_profiles = len(unique_profiles)
        nc.createDimension(daxes.profile, num_profiles)

        if reduce_dims is True and num_stations == 1:
            # If a singular station, remove the dimension
            station_dimensions = ()
            s_ind = None
        else:
            station_dimensions = (daxes.station,)
            nc.createDimension(daxes.station, num_stations)
            # The station this profile belongs to
            s_ind = nc.createVariable('stationIndex', 'i4', (daxes.profile,))

        station = nc.createVariable(axes.station, get_dtype(unique_stations), station_dimensions)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # Get unique obs by grouping on traj and profile and getting the max size
        if unlimited is True:
            nc.createDimension(daxes.sample, None)
        else:
            nc.createDimension(daxes.sample, len(df))

        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

        # Axes variables are already processed so skip them
        data_columns = [ d for d in df.columns if d not in axes ]
        data_columns += [axes.t, axes.z]  # time isn't really special, its dimensioned by obs

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        for i, (sname, srg) in enumerate(station_groups):
            station[i] = sname
            latitude[i] = df[axes.y][df[axes.station] == sname].dropna().iloc[0]
            longitude[i] = df[axes.x][df[axes.station] == sname].dropna().iloc[0]

        for j, (pname, pfg) in enumerate(profile_groups):
            profile[j] = pname
            row_size[j] = len(pfg)
            if s_ind is not None:
                # np.asscalar was removed in NumPy 1.23; ndarray.item() is the
                # documented equivalent (asscalar was a wrapper around it).
                s_ind[j] = np.argwhere(station[:] == pfg[axes.station].dropna().iloc[0]).item()

        # Add back in the z axes that was removed when calculating data_columns
        # and ignore variables that were stored in the profile index
        skips = ['stationIndex', 'rowSize']
        for c in [ d for d in data_columns if d not in skips ]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.sample,),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]
            vvalues = get_ncdata_from_series(df[c], v)
            try:
                if unlimited is True:
                    v[:] = vvalues
                else:
                    v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return RaggedTimeseriesProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional trajectory netCDF file.

    Axis columns are resolved through ``axes``; all other columns become data
    variables on (trajectory, sample) — or just (sample,) when ``reduce_dims``
    collapses a single trajectory.  Returns a read handle to the written file.

    Recognized kwargs: ``axes``, ``reduce_dims``, ``unlimited``, ``unique_dims``,
    ``attributes``.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [ d for d in df.columns if d not in axes ]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

        trajectory_group = df.groupby(axes.trajectory)

        if unlimited is True:
            max_obs = None
        else:
            max_obs = trajectory_group.size().max()
        nc.createDimension(daxes.sample, max_obs)

        num_trajectories = len(trajectory_group)
        if reduce_dims is True and num_trajectories == 1:
            # If a singular trajectory, we can reduce that dimension if it is of size 1
            def ts(t_index, size):
                return np.s_[0:size]
            default_dimensions = (daxes.sample,)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
        else:
            # ts() returns the slice used to write one trajectory's values
            def ts(t_index, size):
                return np.s_[t_index, 0:size]
            default_dimensions = (daxes.trajectory, daxes.sample)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))

        # Create all of the variables
        time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(cls.default_fill_value))
        z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': '{} {} {} {}'.format(
                        axes.t, axes.z, axes.x, axes.y
                    )
                })

        for i, (uid, gdf) in enumerate(trajectory_group):
            trajectory[i] = uid

            times = get_ncdata_from_series(gdf[axes.t], time)
            time[ts(i, times.size)] = times

            lats = get_ncdata_from_series(gdf[axes.y], latitude)
            latitude[ts(i, lats.size)] = lats

            lons = get_ncdata_from_series(gdf[axes.x], longitude)
            longitude[ts(i, lons.size)] = lons

            zs = gdf[axes.z].fillna(get_fill_value(z)).values
            z[ts(i, zs.size)] = zs

            for c in data_columns:
                # Variables were all created above, so just look them up here
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(gdf[c], v)
                slicer = ts(i, vvalues.size)
                v[slicer] = vvalues

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalTrajectory(output, **kwargs)
def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH):
    """Write a single glider profile DataFrame to a netCDF file under *output_path*.

    The file is first assembled in a temp file and only moved into place on
    success; the temp file is always cleaned up.  Returns the final file path.

    :param attrs: global/variable attribute map (must contain 'glider',
        'trajectory_date' and a 'variables' mapping)
    :param profile: DataFrame with t/x/y/z columns plus a 'profile' column
    :param output_path: destination directory for the finished file
    :param mode: deployment mode string used in the filename
    :param profile_id_type: how the numeric profile id is derived (epoch time,
        file count, or the value already in the frame)
    :raises ValueError: if *profile_id_type* is not a known ProfileIdTypes value
    """
    # Create the temp file *before* entering the try block: if mkstemp itself
    # fails, the finally clause must not reference an unbound tmp_handle.
    tmp_handle, tmp_path = tempfile.mkstemp(suffix='.nc', prefix='gutils_glider_netcdf_')
    try:
        profile_time = profile.t.dropna().iloc[0]

        # Figure out which profile index to use (epoch or integer)
        if profile_id_type == ProfileIdTypes.EPOCH:
            # We are using the epoch as the profile_index!
            profile_index = calendar.timegm(profile_time.utctimetuple())
        elif profile_id_type == ProfileIdTypes.COUNT:
            # Get all existing netCDF outputs and find out the index of this netCDF file. That
            # will be the profile_id of this file. This is effectively keeping a tally of netCDF
            # files that have been created and only works if NETCDF FILES ARE WRITTEN IN
            # ASCENDING ORDER.
            # There is a race condition here if files are being written in parallel and one
            # should be sure that when this function is being run there can be no more files
            # written. This file being written is the last profile available.
            netcdf_files_same_mode = list(glob(
                os.path.join(
                    output_path,
                    '*_{}.nc'.format(mode)
                )
            ))
            profile_index = len(netcdf_files_same_mode)
        elif profile_id_type == ProfileIdTypes.FRAME:
            profile_index = profile.profile.iloc[0]
        else:
            raise ValueError('{} is not a valid profile type'.format(profile_id_type))

        # Create final filename
        filename = "{0}_{1:010d}_{2:%Y%m%dT%H%M%S}Z_{3}.nc".format(
            attrs['glider'],
            profile_index,
            profile_time,
            mode
        )
        output_file = os.path.join(output_path, filename)

        # Add in the trajectory dimension to make pocean happy
        traj_name = '{}-{}'.format(
            attrs['glider'],
            attrs['trajectory_date']
        )
        profile = profile.assign(trajectory=traj_name)

        # We add this back in later
        profile.drop('profile', axis=1, inplace=True)

        # Compute U/V scalar values
        uv_txy = get_uv_data(profile)
        if 'u_orig' in profile.columns and 'v_orig' in profile.columns:
            profile.drop(['u_orig', 'v_orig'], axis=1, inplace=True)

        # Compute profile scalar values
        profile_txy = get_profile_data(profile, method=None)

        # Calculate some geographic global attributes
        attrs = dict_update(attrs, get_geographic_attributes(profile))
        # Calculate some vertical global attributes
        attrs = dict_update(attrs, get_vertical_attributes(profile))
        # Calculate some temporal global attributes
        attrs = dict_update(attrs, get_temporal_attributes(profile))
        # Set the creation dates and history
        attrs = dict_update(attrs, get_creation_attributes(profile))

        # Changing column names here from the default 't z x y'
        axes = {
            't': 'time',
            'z': 'depth',
            'x': 'lon',
            'y': 'lat',
            'sample': 'time'
        }
        profile = profile.rename(columns=axes)

        # Use pocean to create NetCDF file
        with IncompleteMultidimensionalTrajectory.from_dataframe(
                profile,
                tmp_path,
                axes=axes,
                reduce_dims=True,
                mode='a') as ncd:

            # We only want to apply metadata from the `attrs` map if the variable is already in
            # the netCDF file or it is a scalar variable (no shape defined). This avoids
            # creating measured variables that were not measured in this profile.
            prof_attrs = attrs.copy()

            vars_to_update = OrderedDict()
            for vname, vobj in prof_attrs['variables'].items():
                if vname in ncd.variables or ('shape' not in vobj and 'type' in vobj):
                    if 'shape' in vobj:
                        # Assign coordinates
                        vobj['attributes']['coordinates'] = '{} {} {} {}'.format(
                            axes.get('t'),
                            axes.get('z'),
                            axes.get('x'),
                            axes.get('y'),
                        )
                    vars_to_update[vname] = vobj
                else:
                    # L.debug("Skipping missing variable: {}".format(vname))
                    pass

            prof_attrs['variables'] = vars_to_update
            ncd.apply_meta(prof_attrs)

            # Set trajectory value
            ncd.id = traj_name
            ncd.variables['trajectory'][0] = traj_name

            # Set profile_* data
            set_profile_data(ncd, profile_txy, profile_index)

            # Set *_uv data
            set_uv_data(ncd, uv_txy)

        # Move to final destination
        safe_makedirs(os.path.dirname(output_file))
        os.chmod(tmp_path, 0o664)
        shutil.move(tmp_path, output_file)

        L.info('Created: {}'.format(output_file))
        return output_file
    finally:
        os.close(tmp_handle)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a CF contiguous ragged trajectory-profile netCDF file.

    Per-profile scalars (time, lat, lon, and any column listed in
    ``profile_vars``) live on the profile dimension; everything else lives on
    the flat sample dimension with lengths in ``rowSize`` and trajectory links
    in ``trajectoryIndex``.  Returns a read handle to the written file.

    Recognized kwargs: ``axes``, ``unique_dims``, ``attributes``, ``profile_vars``
    (``reduce_dims`` and ``unlimited`` are accepted but ignored here).
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes

    # These kwargs are accepted for API parity with the other writers but unused
    _ = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

        trajectory_groups = df.groupby(axes.trajectory)
        unique_trajectories = list(trajectory_groups.groups.keys())
        num_trajectories = len(unique_trajectories)
        nc.createDimension(daxes.trajectory, num_trajectories)
        trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory, ))
        trajectory[:] = np.array(unique_trajectories)

        # Calculate the max number of profiles
        unique_profiles = df[axes.profile].unique()
        num_profiles = len(unique_profiles)
        nc.createDimension(daxes.profile, num_profiles)
        profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile, ))
        profile[:] = np.array(unique_profiles)

        # Get unique obs by grouping on traj and profile and getting the max size
        num_obs = len(df)
        nc.createDimension(daxes.sample, num_obs)

        # The trajectory this profile belongs to
        t_ind = nc.createVariable('trajectoryIndex', 'i4', (daxes.profile, ))
        # Number of observations in each profile
        row_size = nc.createVariable('rowSize', 'i4', (daxes.profile, ))

        # Create all of the axis variables
        time = nc.createVariable(axes.t, 'f8', (daxes.profile, ), fill_value=np.dtype('f8').type(cls.default_fill_value))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (daxes.profile, ), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (daxes.profile, ), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

        # Axes variables are already processed so skip them
        data_columns = [d for d in df.columns if d not in axes]

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Variables defined on only the profile axis
        profile_vars = kwargs.pop('profile_vars', [])
        profile_columns = [p for p in profile_vars if p in data_columns]
        for c in profile_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.profile, ),
                    df[c],
                    zlib=True,
                    complevel=1
                )

        # NOTE(review): j restarts at 0 for each trajectory, so with more than
        # one trajectory, later trajectories would overwrite the profile rows
        # of earlier ones — confirm callers only pass one trajectory or that
        # profile ids are grouped per trajectory as intended.
        for i, (_, trg) in enumerate(trajectory_groups):
            for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                time[j] = get_ncdata_from_series(pfg[axes.t], time).astype('f8')[0]
                latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                row_size[j] = len(pfg)
                t_ind[j] = i

                # Save any profile variables on the "profile" index using the first value found
                # in the column.
                for c in profile_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(pfg[c], v)[0]
                    try:
                        v[j] = vvalues
                    except BaseException:
                        L.exception('Failed to add {}'.format(c))
                        continue

        # Add back in the z axes that was removed when calculating data_columns
        # and ignore variables that were stored in the profile index
        sample_columns = [ f for f in data_columns + [axes.z] if f not in profile_columns ]
        skips = ['trajectoryIndex', 'rowSize']
        for c in [d for d in sample_columns if d not in skips]:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    (daxes.sample, ),
                    df[c],
                    zlib=True,
                    complevel=1
                )
            else:
                v = nc.variables[var_name]
            vvalues = get_ncdata_from_series(df[c], v)
            try:
                v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        # Metadata variables
        if 'crs' not in nc.variables:
            nc.createVariable('crs', 'i4')

        # Set attributes
        nc.update_attributes(attributes)

    return ContiguousRaggedTrajectoryProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a CF orthogonal multidimensional timeseries-profile file.

    The frame is re-indexed onto the full Cartesian product of (t, z, station)
    so each data column reshapes directly onto the orthogonal dimensions.
    Columns listed in ``detach_z`` are averaged over z and stored without the
    depth dimension.  Returns a read handle to the written file.

    Recognized kwargs: ``axes``, ``reduce_dims``, ``unlimited``, ``unique_dims``,
    ``attributes``, ``detach_z``.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [d for d in df.columns if d not in axes]

    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having the same name
        # which is not supported in xarray
        changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    # Make a new index that is the Cartesian product of all of the values from all of the
    # values of the old index. This is so don't have to iterate over anything. The full column
    # of data will be able to be shaped to the size of the final unique sized dimensions.
    index_order = [axes.t, axes.z, axes.station]
    df = df.set_index(index_order)
    df = df.reindex(
        pd.MultiIndex.from_product(df.index.levels, names=index_order))

    unique_z = df.index.get_level_values(axes.z).unique().values
    unique_t = df.index.get_level_values(
        axes.t).unique().tolist()  # tolist converts to Timestamp
    all_stations = df.index.get_level_values(axes.station)
    unique_s = all_stations.unique()

    with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

        if reduce_dims is True and unique_s.size == 1:
            # If a singular station, we can reduce that dimension if it is of size 1
            default_dimensions = (daxes.t, daxes.z)
            station_dimensions = ()
        else:
            default_dimensions = (daxes.t, daxes.z, daxes.station)
            station_dimensions = (daxes.station, )
            nc.createDimension(daxes.station, unique_s.size)

        station = nc.createVariable(axes.station, get_dtype(unique_s), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # Assign over loop because VLEN variables (strings) have to be assigned by integer index
        # and we need to find the lat/lon based on station index
        for si, st in enumerate(unique_s):
            station[si] = st
            latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
            longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        if unlimited is True:
            nc.createDimension(daxes.t, None)
        else:
            nc.createDimension(daxes.t, len(unique_t))
        time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
        time[:] = date2num(unique_t, units=cls.default_time_unit).astype('f8')

        nc.createDimension(daxes.z, unique_z.size)
        z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z, ))
        z[:] = unique_z

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        # Variables defined on only the time axis and not the depth axis
        detach_z_vars = kwargs.pop('detach_z', [])
        detach_z_columnms = [p for p in detach_z_vars if p in data_columns]
        for c in detach_z_columnms:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions[0::2],  # this removes the second dimension (z)
                    df[c],
                    zlib=True,
                    complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}), {
                        'coordinates': '{} {} {}'.format(axes.t, axes.x, axes.y)
                    })
            else:
                v = nc.variables[var_name]

            # Because we need access to the fillvalues here, we ask not to return
            # the values with them already filled.
            vvalues = get_ncdata_from_series(df[c], v, fillna=False)
            # Reshape to the full array, with Z
            vvalues = vvalues.reshape(len(unique_t), unique_z.size, unique_s.size)
            # The Z axis is always the second axis, take the mean over that axis
            vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()
            # Now reshape to the array without Z
            vvalues = vvalues.reshape(len(unique_t), unique_s.size)
            try:
                v[:] = vvalues.reshape(v.shape)
            except BaseException:
                L.exception('Failed to add {}'.format(c))
                continue

        full_columns = [ f for f in data_columns if f not in detach_z_columnms ]
        for c in full_columns:
            # Create variable if it doesn't exist
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
            else:
                v = nc.variables[var_name]

            vvalues = get_ncdata_from_series(df[c], v)
            v[:] = vvalues.reshape(v.shape)

        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as a CF orthogonal multidimensional timeseries-profile file.

    The frame is re-indexed onto the full Cartesian product of (t, z, station)
    so every data column can be reshaped straight onto the final orthogonal
    dimensions without iterating per group.  Returns a read handle to the
    written file.

    Recognized kwargs: ``axes``, ``reduce_dims``, ``unlimited``, ``attributes``.
    """
    axes = get_default_axes(kwargs.pop('axes', {}))
    reduce_dims = kwargs.pop('reduce_dims', False)
    unlimited = kwargs.pop('unlimited', False)
    data_columns = [c for c in df.columns if c not in axes]

    # Downcast anything from int64 to int32
    df = downcast_dataframe(df)

    # Re-index on the Cartesian product of all index values so the full column
    # of data can be shaped onto the final unique-sized dimensions directly.
    index_order = [axes.t, axes.z, axes.station]
    df = df.set_index(index_order)
    df = df.reindex(pd.MultiIndex.from_product(df.index.levels, names=index_order))

    unique_z = df.index.get_level_values(axes.z).unique().values
    unique_t = df.index.get_level_values(axes.t).unique().tolist()  # tolist converts to Timestamp
    all_stations = df.index.get_level_values(axes.station)
    unique_s = all_stations.unique()

    with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:
        squeeze_station = reduce_dims is True and unique_s.size == 1
        if squeeze_station:
            # A lone station can be squeezed out of the data dimensions
            def ts():
                return np.s_[:, :]
            default_dimensions = (axes.t, axes.z)
            station_dimensions = ()
        else:
            def ts():
                return np.s_[:, :, :]
            default_dimensions = (axes.t, axes.z, axes.station)
            station_dimensions = (axes.station, )
            nc.createDimension(axes.station, unique_s.size)

        station = nc.createVariable(axes.station, get_dtype(unique_s), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

        # VLEN (string) variables must be assigned one integer index at a time,
        # and the lat/lon for each station is looked up by its station value.
        for idx, sname in enumerate(unique_s):
            station[idx] = sname
            latitude[idx] = df[axes.y][all_stations == sname].dropna().iloc[0]
            longitude[idx] = df[axes.x][all_stations == sname].dropna().iloc[0]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Time coordinate (dimension is unlimited when requested)
        nc.createDimension(axes.t, None if unlimited is True else len(unique_t))
        time = nc.createVariable(axes.t, 'f8', (axes.t, ))
        time[:] = nc4.date2num(unique_t, units=cls.default_time_unit)

        # Depth coordinate
        nc.createDimension(axes.z, unique_z.size)
        z = nc.createVariable(axes.z, get_dtype(unique_z), (axes.z, ))
        z[:] = unique_z

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        for column in data_columns:
            var_name = cf_safe_name(column)
            if var_name in nc.variables:
                ncvar = nc.variables[var_name]
            else:
                # Create the variable on first sight and record its coordinates
                ncvar = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[column],
                    zlib=True,
                    complevel=1)
                attributes[var_name] = dict_update(
                    attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })

            values = get_ncdata_from_series(df[column], ncvar)
            ncvar[ts()] = values.reshape(len(unique_t), unique_z.size, unique_s.size)

        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
def from_dataframe(cls, df, output, **kwargs):
    """Write *df* to *output* as an incomplete multidimensional profile netCDF file.

    Expects the reserved columns ``trajectory``, ``profile``, ``t``, ``x``,
    ``y``, ``z`` and optionally ``distance``; every other column becomes a data
    variable on the (profile, z) grid.  Returns a read handle to the written file.
    """
    reserved_columns = [
        'trajectory', 'profile', 't', 'x', 'y', 'z', 'distance'
    ]
    data_columns = [d for d in df.columns if d not in reserved_columns]

    with IncompleteMultidimensionalProfile(output, 'w') as nc:

        profile_group = df.groupby('profile')
        # The z dimension is sized by the deepest profile; shorter ones are padded
        max_zs = profile_group.size().max()
        unique_profiles = df.profile.unique()
        nc.createDimension('profile', unique_profiles.size)
        nc.createDimension('z', max_zs)

        # Metadata variables
        nc.createVariable('crs', 'i4')
        profile = nc.createVariable('profile', get_dtype(df.profile), ('profile', ))

        # Create all of the variables
        time = nc.createVariable('time', 'i4', ('profile', ))
        latitude = nc.createVariable('latitude', get_dtype(df.y), ('profile', ))
        longitude = nc.createVariable('longitude', get_dtype(df.x), ('profile', ))
        if 'distance' in df:
            distance = nc.createVariable('distance', get_dtype(df.distance), ('profile', ))
        z = nc.createVariable('z', get_dtype(df.z), ('profile', 'z'), fill_value=df.z.dtype.type(cls.default_fill_value))

        attributes = dict_update(nc.nc_attributes(), kwargs.pop('attributes', {}))

        for i, (uid, pdf) in enumerate(profile_group):
            profile[i] = uid

            # Per-profile scalars are taken from the first row of the group
            time[i] = nc4.date2num(pdf.t.iloc[0], units=cls.default_time_unit)
            latitude[i] = pdf.y.iloc[0]
            longitude[i] = pdf.x.iloc[0]

            if 'distance' in pdf:
                distance[i] = pdf.distance.iloc[0]

            zvalues = pdf.z.fillna(z._FillValue).values
            sl = slice(0, zvalues.size)
            z[i, sl] = zvalues

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    if np.issubdtype(pdf[c].dtype, 'S') or pdf[c].dtype == object:
                        # AttributeError: cannot set _FillValue attribute for VLEN or compound variable
                        v = nc.createVariable(var_name, get_dtype(pdf[c]), ('profile', 'z'))
                    else:
                        v = nc.createVariable(var_name, get_dtype(pdf[c]), ('profile', 'z'), fill_value=pdf[c].dtype.type(cls.default_fill_value))
                    if var_name not in attributes:
                        attributes[var_name] = {}
                    attributes[var_name] = dict_update(attributes[var_name], {
                        'coordinates': 'time latitude longitude z',
                    })
                else:
                    v = nc.variables[var_name]

                if hasattr(v, '_FillValue'):
                    vvalues = pdf[c].fillna(v._FillValue).values
                else:
                    # Use an empty string... better than nothing!
                    vvalues = pdf[c].fillna('').values

                # Only the first len(pdf) z slots are written; the rest stay at the fill value
                sl = slice(0, vvalues.size)
                v[i, sl] = vvalues

        # Set global attributes
        nc.update_attributes(attributes)

    return IncompleteMultidimensionalProfile(output, **kwargs)