def test_is_mine(klass, fp):
    with CFDataset.load(fp) as dsg:
        assert dsg.__class__ == klass

    allsubs = list(all_subclasses(CFDataset))
    subs = [s for s in allsubs if s != klass]
    with CFDataset(fp) as dsg:
        logger.debug('\nTesting {}'.format(klass.__name__))
        assert klass.is_mine(dsg) is True
        for s in subs:
            if hasattr(s, 'is_mine'):
                logger.debug(' * Trying {}...'.format(s.__name__))
                assert s.is_mine(dsg) is False
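
# A minimal sketch of driving test_is_mine with pytest parametrization; the
# pairing below and the 'resources/om-timeseries.nc' fixture path are
# hypothetical, not from the source.
import pytest

@pytest.mark.parametrize('klass,fp', [
    (OrthogonalMultidimensionalTimeseries, 'resources/om-timeseries.nc'),  # hypothetical fixture
])
def test_is_mine_parametrized(klass, fp):
    test_is_mine(klass, fp)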
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    data_columns = [d for d in df.columns if d not in axes]

    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

        station_group = df.groupby(axes.station)
        num_stations = len(station_group)

        # assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        nc.createDimension(axes.t, t.size)
        nc.createDimension(axes.station, num_stations)
        station = nc.createVariable(axes.station, get_dtype(df[axes.station]), (axes.station,))
        time = nc.createVariable(axes.t, 'f8', (axes.t,))
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.station,))
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.station,))
        z = nc.createVariable(
            axes.z,
            get_dtype(df[axes.z]),
            (axes.station,),
            fill_value=df[axes.z].dtype.type(cls.default_fill_value)
        )

        attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

        # tolist() converts to a python datetime object without timezone and has NaTs.
        g = t.tolist()
        # date2num converts NaTs to np.nan
        gg = nc4.date2num(g, units=cls.default_time_unit)
        # masked_invalid moves np.nan to a masked value
        time[:] = np.ma.masked_invalid(gg)

        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]

            # TODO: write a test for a Z with a _FillValue
            z[i] = sdf[axes.z].iloc[0]

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.station, axes.t),
                        sdf[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(axes.t, axes.z, axes.x, axes.y)
                    })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[i, :] = vvalues
                except BaseException:
                    L.debug('{} was not written. Likely a metadata variable'.format(v.name))

        # Set global attributes
        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
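
# A usage sketch for from_dataframe, assuming the default axis names
# ('t', 'x', 'y', 'z', 'station') from get_default_axes; the column values
# and the 'example_om_timeseries.nc' output path are hypothetical.
def _example_om_timeseries_write():
    import pandas as pd
    df = pd.DataFrame({
        'station': ['s1', 's1', 's2', 's2'],
        't': pd.to_datetime(['2020-01-01', '2020-01-02'] * 2),
        'x': [-70.0, -70.0, -71.0, -71.0],  # one longitude per station
        'y': [40.0, 40.0, 41.0, 41.0],      # one latitude per station
        'z': [0.0, 0.0, 5.0, 5.0],          # one depth per station
        'temperature': [10.1, 10.2, 11.5, 11.7],
    })
    with OrthogonalMultidimensionalTimeseries.from_dataframe(df, 'example_om_timeseries.nc') as ncd:
        print(ncd)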
def from_dataframe(cls, df, output, **kwargs):
    axes = get_default_axes(kwargs.pop('axes', {}))
    daxes = axes
    data_columns = [d for d in df.columns if d not in axes]

    reduce_dims = kwargs.pop('reduce_dims', False)
    _ = kwargs.pop('unlimited', False)

    unique_dims = kwargs.pop('unique_dims', False)
    if unique_dims is True:
        # Rename the dimension to avoid a dimension and coordinate having
        # the same name, which is not supported in xarray
        changed_axes = {k: '{}_dim'.format(v) for k, v in axes._asdict().items()}
        daxes = get_default_axes(changed_axes)

    # Downcast anything from int64 to int32
    # Convert any timezone aware datetimes to native UTC times
    df = downcast_dataframe(nativize_times(df))

    with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

        station_group = df.groupby(axes.station)
        num_stations = len(station_group)
        has_z = axes.z is not None

        if reduce_dims is True and num_stations == 1:
            # If a station, we can reduce that dimension if it is of size 1
            def ts(i):
                return np.s_[:]
            default_dimensions = (daxes.t,)
            station_dimensions = ()
        else:
            def ts(i):
                return np.s_[i, :]
            default_dimensions = (daxes.station, daxes.t)
            station_dimensions = (daxes.station,)
            nc.createDimension(daxes.station, num_stations)

        # Set the coordinates attribute correctly
        coordinates = [axes.t, axes.x, axes.y]
        if has_z is True:
            coordinates.insert(1, axes.z)
        coordinates = ' '.join(coordinates)

        # assume all groups are the same size and have identical times
        _, sdf = list(station_group)[0]
        t = sdf[axes.t]

        # Metadata variables
        nc.createVariable('crs', 'i4')

        # Create all of the variables
        nc.createDimension(daxes.t, t.size)
        time = nc.createVariable(axes.t, 'f8', (daxes.t,))
        station = nc.createVariable(axes.station, get_dtype(df[axes.station]), station_dimensions)
        latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
        longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)
        if has_z is True:
            z = nc.createVariable(
                axes.z,
                get_dtype(df[axes.z]),
                station_dimensions,
                fill_value=df[axes.z].dtype.type(cls.default_fill_value)
            )

        attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

        time[:] = get_ncdata_from_series(t, time)

        # Create vars based on full dataframe (to get all variables)
        for c in data_columns:
            var_name = cf_safe_name(c)
            if var_name not in nc.variables:
                v = create_ncvar_from_series(
                    nc,
                    var_name,
                    default_dimensions,
                    df[c],
                    zlib=True,
                    complevel=1
                )
                attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                    'coordinates': coordinates
                })

        for i, (uid, sdf) in enumerate(station_group):
            station[i] = uid
            latitude[i] = sdf[axes.y].iloc[0]
            longitude[i] = sdf[axes.x].iloc[0]

            if has_z is True:
                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]

            for c in data_columns:
                # Write into the variable created above
                var_name = cf_safe_name(c)
                v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(sdf[c], v)
                try:
                    v[ts(i)] = vvalues
                except BaseException:
                    L.debug('{} was not written. Likely a metadata variable'.format(v.name))

        # Set global attributes
        nc.update_attributes(attributes)

    return OrthogonalMultidimensionalTimeseries(output, **kwargs)
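
# A sketch of the newer keyword arguments, assuming a single-station frame
# like the one above; 'example_single.nc' is a hypothetical path.
# reduce_dims=True collapses the size-1 station dimension, and
# unique_dims=True renames dimensions (e.g. 't' -> 't_dim') so a dimension
# and a coordinate variable never share a name, which xarray cannot open.
def _example_reduced_single_station(df):
    return OrthogonalMultidimensionalTimeseries.from_dataframe(
        df,
        'example_single.nc',
        reduce_dims=True,
        unique_dims=True
    )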
def to_dataframe(self, clean_cols=True, clean_rows=True):
    zvar = self.z_axes()[0]
    zs = len(self.dimensions[zvar.dimensions[0]])

    # Profiles
    pvar = self.filter_by_attrs(cf_role='profile_id')[0]
    try:
        p = normalize_array(pvar)
    except ValueError:
        p = np.asarray(list(range(len(pvar))), dtype=int)
    ps = p.size
    p = p.repeat(zs)
    logger.debug(['profile data size: ', p.size])

    # Z
    z = generic_masked(zvar[:], attrs=self.vatts(zvar.name)).round(5)
    try:
        z = np.tile(z, ps)
    except ValueError:
        z = z.flatten()
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one
        t = np.array([t.isoformat()], dtype='datetime64')
    t = t.repeat(zs)
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Distance
    d = np.ma.zeros(y.size, dtype=np.float64)
    d[1:] = great_distance(
        start_latitude=y[0:-1],
        end_latitude=y[1:],
        start_longitude=x[0:-1],
        end_longitude=x[1:]
    )['distance']
    d = generic_masked(np.cumsum(d), minv=0).round(2)
    logger.debug(['distance data size: ', d.size])

    df_data = {'t': t, 'x': x, 'y': y, 'z': z, 'profile': p, 'distance': d}

    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for i, dvar in enumerate(extract_vars):
        vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
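
# A usage sketch; `ProfileDataset` is a hypothetical stand-in for whichever
# CFDataset subclass defines this method, and 'example_profiles.nc' is a
# hypothetical file. The frame carries t/x/y/z, the 'profile' id, and a
# cumulative great-circle 'distance', plus every data/ancillary variable.
def _example_profile_read():
    with ProfileDataset('example_profiles.nc') as ncd:
        return ncd.to_dataframe(clean_cols=True, clean_rows=True)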
def to_dataframe(self, clean_cols=True, clean_rows=True):
    # Z
    zvar = self.z_axes()[0]
    z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
    z = z.flatten().round(5)
    logger.debug(['z data size: ', z.size])

    # T
    tvar = self.t_axes()[0]
    t = np.ma.MaskedArray(
        nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    ).flatten()
    # Patch the time variable back to its original mask, since num2date
    # breaks any missing/fill values
    if hasattr(tvar[0], 'mask'):
        t.mask = tvar[:].mask
    logger.debug(['time data size: ', t.size])

    # X
    xvar = self.x_axes()[0]
    x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
    logger.debug(['y data size: ', y.size])

    # Trajectories
    pvar = self.filter_by_attrs(cf_role='trajectory_id')[0]
    try:
        p = normalize_array(pvar)
    except BaseException:
        logger.exception('Could not pull trajectory values from the variable, using indexes.')
        p = np.asarray(list(range(len(pvar))), dtype=int)

    # The dimension that the trajectory id variable doesn't have is what
    # the trajectory data needs to be repeated by
    dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
    if dim_diff:
        p = p.repeat(dim_diff.size)
    logger.debug(['trajectory data size: ', p.size])

    # Distance
    d = np.append(
        [0],
        great_distance(
            start_latitude=y[0:-1],
            end_latitude=y[1:],
            start_longitude=x[0:-1],
            end_longitude=x[1:]
        )['distance']
    )
    d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
    logger.debug(['distance data size: ', d.size])

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'trajectory': p,
        'distance': d
    }

    building_index_to_drop = np.ones(t.size, dtype=bool)
    extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
    for i, dvar in enumerate(extract_vars):
        vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
        building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
        df_data[dvar.name] = vdata

    df = pd.DataFrame(df_data)

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    if clean_rows:
        df = df.iloc[~building_index_to_drop]

    return df
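
# A usage sketch for the trajectory variant; `TrajectoryDataset` and
# 'example_trajectory.nc' are hypothetical stand-ins. Passing
# clean_rows=False keeps rows whose data variables are fully masked,
# which can be useful when auditing a file's contents.
def _example_trajectory_read():
    with TrajectoryDataset('example_trajectory.nc') as ncd:
        return ncd.to_dataframe(clean_cols=False, clean_rows=False)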
def to_dataframe(self, clean_cols=False, clean_rows=False):
    # Don't pass around the attributes; store them in the class

    # T
    tvar = self.t_axes()[0]
    t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
    if isinstance(t, datetime):
        # Size one
        t = np.array([t.isoformat()], dtype='datetime64')
    logger.debug(['time data size: ', t.size])

    # Stations
    # TODO: Make sure there is a test for a file with multiple time variables
    svar = self.filter_by_attrs(cf_role='timeseries_id')[0]
    try:
        s = normalize_array(svar)
    except ValueError:
        s = np.asarray(list(range(len(svar))), dtype=int)
    s = np.repeat(s, t.size)
    logger.debug(['station data size: ', s.size])

    # X
    xvar = self.x_axes()[0]
    x = generic_masked(xvar[:].repeat(t.size), attrs=self.vatts(xvar.name)).round(5)
    logger.debug(['x data size: ', x.size])

    # Y
    yvar = self.y_axes()[0]
    y = generic_masked(yvar[:].repeat(t.size), attrs=self.vatts(yvar.name)).round(5)
    logger.debug(['y data size: ', y.size])

    # Z
    zvar = self.z_axes()[0]
    z = generic_masked(zvar[:].repeat(t.size), attrs=self.vatts(zvar.name))
    logger.debug(['z data size: ', z.size])

    # Now repeat t per station. Figure out if this is a single-station
    # file by checking the dimensions of the Z var.
    if zvar.ndim == 1:
        t = np.repeat(t, len(svar))

    df_data = {
        't': t,
        'x': x,
        'y': y,
        'z': z,
        'station': s,
    }

    # building_index_to_drop = np.ones(t.size, dtype=bool)

    extract_vars = copy(self.variables)
    del extract_vars[svar.name]
    del extract_vars[xvar.name]
    del extract_vars[yvar.name]
    del extract_vars[zvar.name]
    del extract_vars[tvar.name]

    for i, (dnam, dvar) in enumerate(extract_vars.items()):
        if dvar[:].flatten().size > t.size:
            logger.warning("Variable {} is not the correct size, skipping.".format(dnam))
            continue

        vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dnam))
        if vdata.size == 1:
            vdata = vdata[0]
        # building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

        try:
            if re.match(r'.* since .*', dvar.units):
                vdata = nc4.num2date(vdata[:], dvar.units, getattr(dvar, 'calendar', 'standard'))
        except AttributeError:
            pass

        df_data[dnam] = vdata
        # logger.info('{} - {}'.format(dnam, vdata.shape))

    df = pd.DataFrame()
    for k, v in df_data.items():
        df[k] = v

    # Drop all data columns with no data
    if clean_cols:
        df = df.dropna(axis=1, how='all')

    # Drop all data rows with no data variable data
    # if clean_rows:
    #     df = df.iloc[~building_index_to_drop]

    return df
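
# A usage sketch for the timeseries variant; `TimeseriesDataset` and
# 'example_timeseries.nc' are hypothetical stand-ins. Note that clean_rows
# currently has no effect in this method because the row-dropping index
# above is commented out.
def _example_timeseries_read():
    with TimeseriesDataset('example_timeseries.nc') as ncd:
        return ncd.to_dataframe(clean_cols=True)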