Code example #1
def get_vertical_attributes(df, axes=None):
    """ Use values in a dataframe to set vertical attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/
    The CRS, geospatial_bounds_vertical_crs, cannot be assumed because NCEI suggests any of
      * 'EPSG:5829' (instantaneous height above sea level),
      * 'EPSG:5831' (instantaneous depth below sea level), or
      * 'EPSG:5703' (NAVD88 height).
    Likewise, geospatial_vertical_positive cannot be assumed to be either 'up' or 'down'.
    Set these attributes separately according to the dataset.
    Note: values are cast from numpy.int to float

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary). z in meters.
    :return: nested dictionary of variable and global attributes
    """
    axes = get_default_axes(axes)
    minz = round(float(df[axes.z].min()), 6)
    maxz = round(float(df[axes.z].max()), 6)

    return {
        'variables': {
            axes.z: {
                'attributes': {
                    'actual_min': minz,
                    'actual_max': maxz,
                }
            },
        },
        'attributes': {
            'geospatial_vertical_min': minz,
            'geospatial_vertical_max': maxz,
            'geospatial_vertical_units': 'm',
        }
    }
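
A minimal usage sketch (hypothetical values; assumes the default axis mapping, where the z column is named 'z'):

import pandas as pd

df = pd.DataFrame({'z': [0.5, 1.5, 2.5]})  # depths in meters
attrs = get_vertical_attributes(df)
# attrs['attributes']['geospatial_vertical_min'] -> 0.5
# attrs['attributes']['geospatial_vertical_max'] -> 2.5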
Code example #2
File: cr.py Project: pyoceans/pocean-core
    def calculated_metadata(self,
                            df=None,
                            geometries=True,
                            clean_cols=True,
                            clean_rows=True,
                            **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        if df is None:
            df = self.to_dataframe(clean_cols=clean_cols,
                                   clean_rows=clean_rows,
                                   axes=axes)
        return trajectory_profile_calculated_metadata(df, axes, geometries)
Code example #3
def get_calculated_attributes(df, axes=None, history=None):
    """ Functions to automate netCDF attribute generation from the data itself
    This is a wrapper for the other four functions, which could be called separately.

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :param history: text initializing audit trail for modifications to the original data (optional, string)
    :return: dictionary of global attributes
    """

    axes = get_default_axes(axes)
    attrs = get_geographic_attributes(df, axes)
    attrs = dict_update(attrs, get_vertical_attributes(df, axes))
    attrs = dict_update(attrs, get_temporal_attributes(df, axes))
    attrs = dict_update(attrs, get_creation_attributes(history))

    return attrs
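
A hedged usage sketch (column names follow the default axes; the history string is illustrative):

import pandas as pd

df = pd.DataFrame({
    't': pd.to_datetime(['2020-01-01 00:00', '2020-01-01 01:00']),
    'x': [-70.0, -69.9],
    'y': [40.0, 40.1],
    'z': [0.0, 5.0],
})
attrs = get_calculated_attributes(df, history='Converted from raw CSV')
# attrs combines per-variable actual_min/actual_max entries with global
# geospatial_*, time_coverage_* and creation/history attributes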
Code example #4
def get_temporal_attributes(df, axes=None):
    """ Use values in a dataframe to set temporal attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary). z in meters.
    :return: nested dictionary of variable and global attributes
    """

    axes = get_default_axes(axes)
    mint = df[axes.t].min()
    maxt = df[axes.t].max()

    times = pd.DatetimeIndex(unique_justseen(df[axes.t]))
    dt_index_diff = times[1:] - times[:-1]
    dt_counts = dt_index_diff.value_counts(sort=True)

    if dt_counts.size > 0 and dt_counts.values[0] / (len(times) - 1) > 0.75:
        mode_value = dt_counts.index[0]
    else:
        # Calculate a static resolution
        mode_value = ((maxt - mint) / len(times))

    return {
        'variables': {
            axes.t: {
                'attributes': {
                    'actual_min': mint.strftime('%Y-%m-%dT%H:%M:%SZ'),
                    'actual_max': maxt.strftime('%Y-%m-%dT%H:%M:%SZ'),
                }
            },
        },
        'attributes': {
            'time_coverage_start': mint.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'time_coverage_end': maxt.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'time_coverage_duration': (maxt - mint).round('1S').isoformat(),
            'time_coverage_resolution': mode_value.round('1S').isoformat()
        }
    }
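
The resolution heuristic above reports the most common time step when it covers more than 75% of the successive differences, and otherwise falls back to the mean spacing. A standalone sketch of the same check:

import pandas as pd

times = pd.DatetimeIndex(['2020-01-01 00:00', '2020-01-01 01:00',
                          '2020-01-01 02:00', '2020-01-01 04:00'])
counts = (times[1:] - times[:-1]).value_counts(sort=True)
# The 1-hour step covers 2 of 3 differences (~67%), below the 0.75
# threshold, so the static (maxt - mint) / len(times) value is used.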
Code example #5
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        # Multiple profiles in the file
        pvar = axv.profile
        p_dim = self.dimensions[pvar.dimensions[0]]

        zvar = axv.z
        zs = len(self.dimensions[[
            d for d in zvar.dimensions if d != p_dim.name
        ][0]])

        # Profiles
        p = normalize_countable_array(pvar)
        p = p.repeat(zs)

        # Z
        z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

        # T
        tvar = axv.t
        t = tvar[:].repeat(zs)
        nt = get_masked_datetime_array(t, tvar).flatten()

        # X
        xvar = axv.x
        x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

        # Y
        yvar = axv.y
        y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

        df_data = OrderedDict([(axes.t, nt), (axes.x, x), (axes.y, y),
                               (axes.z, z), (axes.profile, p)])

        building_index_to_drop = np.ones(t.size, dtype=bool)

        extract_vars = copy(self.variables)
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):

            # Profile dimension
            if dvar.dimensions == pvar.dimensions:
                vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype),
                                       attrs=self.vatts(dnam))

            # Profile, z dimension
            elif dvar.dimensions == zvar.dimensions:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))

            else:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                # Carry through size 1 variables
                if vdata.size == 1:
                    if vdata[0] is np.ma.masked:
                        L.warning(
                            "Skipping variable {} that is completely masked".
                            format(dnam))
                        continue
                else:
                    L.warning(
                        "Skipping variable {} since it didn't match any dimension sizes"
                        .format(dnam))
                    continue

            # Mark rows with data so we don't remove them with clean_rows
            if vdata.size == building_index_to_drop.size:
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Handle scalars here at the end
            if vdata.size == 1:
                vdata = vdata[0]

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
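
Hypothetical round trip (the class and file path are assumptions; this method reads one of pocean's multidimensional profile files back into a flat table):

with IncompleteMultidimensionalProfile('profiles.nc') as ncd:  # illustrative path
    df = ncd.to_dataframe(clean_cols=True, clean_rows=True)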
Code example #6
File: cr.py Project: lizferguson5/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(axes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(axes.profile, num_profiles)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
            profile[:] = np.array(unique_profiles)

            # The sample dimension holds one entry per observation row
            num_obs = len(df)
            nc.createDimension(axes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4', (axes.profile,))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (axes.profile,))

            # Create all of the axis variables
            time = nc.createVariable(axes.t, 'f8', (axes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t], time)[0]
                    latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

            # Add back in the z axis that was removed when calculating data_columns
            data_columns = data_columns + [axes.z]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
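
A hedged end-to-end sketch (all values and the output path are illustrative):

import pandas as pd

df = pd.DataFrame({
    'trajectory': ['tr1'] * 4,
    'profile':    ['p1', 'p1', 'p2', 'p2'],
    't': pd.to_datetime(['2020-01-01 00:00'] * 2 + ['2020-01-01 06:00'] * 2),
    'x': [-70.0] * 4,
    'y': [40.0] * 4,
    'z': [0.0, 10.0, 0.0, 10.0],
    'temperature': [15.0, 12.1, 14.8, 12.0],
})
with ContiguousRaggedTrajectoryProfile.from_dataframe(df, 'crtp.nc') as ncd:
    print(ncd.variables['rowSize'][:])  # two observations per profile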
Code example #7
File: im.py Project: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [ d for d in df.columns if d not in axes ]

        unlimited = kwargs.pop('unlimited', False)

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            profile_group = df.groupby(axes.profile)

            if unlimited is True:
                max_profiles = None
            else:
                max_profiles = df[axes.profile].unique().size
            nc.createDimension(axes.profile, max_profiles)

            max_zs = profile_group.size().max()
            nc.createDimension(axes.z, max_zs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', (axes.profile,))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,))
            z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.profile, axes.z), fill_value=df[axes.z].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

            for i, (uid, pdf) in enumerate(profile_group):
                profile[i] = uid

                time[i] = nc4.date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)
                latitude[i] = pdf[axes.y].iloc[0]
                longitude[i] = pdf[axes.x].iloc[0]

                zvalues = pdf[axes.z].fillna(z._FillValue).values
                sl = slice(0, zvalues.size)
                z[i, sl] = zvalues
                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(
                            nc,
                            var_name,
                            (axes.profile, axes.z),
                            pdf[c],
                            zlib=True,
                            complevel=1
                        )
                        attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                            'coordinates' : '{} {} {} {}'.format(
                                axes.t, axes.z, axes.x, axes.y
                            )
                        })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(pdf[c], v)

                    sl = slice(0, vvalues.size)
                    v[i, sl] = vvalues

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)
Code example #8
File: om.py Project: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(axes.t, t.size)
            nc.createDimension(axes.station, num_stations)
            station = nc.createVariable(axes.station, get_dtype(df.station),
                                        (axes.station, ))

            time = nc.createVariable(axes.t, 'f8', (axes.t, ))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         (axes.station, ))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          (axes.station, ))
            z = nc.createVariable(axes.z,
                                  get_dtype(df[axes.z]), (axes.station, ),
                                  fill_value=df[axes.z].dtype.type(
                                      cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            # tolist() converts to timezone-naive python datetime objects, preserving NaTs
            g = t.tolist()
            # date2num converts NaTs to np.nan
            gg = nc4.date2num(g, units=cls.default_time_unit)
            # masked_invalid masks the np.nan values
            time[:] = np.ma.masked_invalid(gg)

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(nc,
                                                     var_name,
                                                     (axes.station, axes.t),
                                                     sdf[c],
                                                     zlib=True,
                                                     complevel=1)
                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates':
                                '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                     axes.y)
                            })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[i, :] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
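
Because this writer assumes every station shares one identical time vector (see the comment above), a conforming input could look like this (values illustrative):

import pandas as pd

times = pd.date_range('2020-01-01', periods=3, freq='H')
df = pd.concat([
    pd.DataFrame({'station': s, 't': times, 'x': x, 'y': y, 'z': 0.0,
                  'temperature': [10.0, 10.5, 11.0]})
    for s, x, y in [('s1', -70.0, 40.0), ('s2', -71.0, 41.0)]
], ignore_index=True)
OrthogonalMultidimensionalTimeseries.from_dataframe(df, 'omt.nc')  # illustrative path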
Code example #9
def get_geographic_attributes(df, axes=None):
    """ Use values in a dataframe to set geographic attributes for the eventual netCDF file
    Attribute names come from https://www.nodc.noaa.gov/data/formats/netcdf/v2.0/
    The coordinate reference system (CRS) is assumed to be EPSG:4326, which is WGS84 and is used with
    GPS satellite navigation (http://spatialreference.org/ref/epsg/wgs-84/).  This is NCEI's default.
    Coordinate values are latitude (decimal degrees_north) and longitude (decimal degrees_east).
    Longitude values are limited to [-180, 180).

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :return: nested dictionary of variable and global attributes
    """
    axes = get_default_axes(axes)

    carry_miny = round(float(df[axes.y].min()), 6)
    carry_maxy = round(float(df[axes.y].max()), 6)
    carry_minx = round(float(df[axes.x].min()), 6)
    carry_maxx = round(float(df[axes.x].max()), 6)

    notnull = df[axes.x].notnull() & df[axes.y].notnull()
    coords = list(zip(df.loc[notnull, axes.x], df.loc[notnull, axes.y]))

    if len(set(coords)) == 1:
        geoclass = Point
    elif len(coords) > 2:
        geoclass = Polygon
    else:
        geoclass = LineString

    p = geoclass(coords)
    dateline = LineString([(180, 90), (-180, -90)])
    # If the geometry crosses the dateline, normalize the coordinates before building it
    if dateline.crosses(p):
        newx = (df.loc[notnull, axes.x] + 360) % 360
        p = geoclass(zip(newx, df.loc[notnull, axes.y]))
        p = fix_geom(p)

    geometry_bbox = box(*p.bounds).wkt
    geometry_wkt = p.convex_hull.wkt

    return {
        'variables': {
            axes.y: {
                'attributes': {
                    'actual_min': carry_miny,
                    'actual_max': carry_maxy,
                }
            },
            axes.x: {
                'attributes': {
                    'actual_min': carry_minx,
                    'actual_max': carry_maxx,
                }
            },
        },
        'attributes': {
            'geospatial_lat_min': carry_miny,
            'geospatial_lat_max': carry_maxy,
            'geospatial_lon_min': carry_minx,
            'geospatial_lon_max': carry_maxx,
            'geospatial_bbox': geometry_bbox,
            'geospatial_bounds': geometry_wkt,
            'geospatial_bounds_crs': 'EPSG:4326',
        }
    }
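
A minimal usage sketch (hypothetical coordinates; assumes the default axis mapping for x and y):

import pandas as pd

df = pd.DataFrame({'x': [-70.0, -69.5, -69.0], 'y': [40.0, 41.0, 40.5]})
attrs = get_geographic_attributes(df)
# attrs['attributes']['geospatial_lon_min'] -> -70.0
# attrs['attributes']['geospatial_bounds'] holds the convex hull as WKT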
Code example #10
File: r.py Project: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with RaggedTimeseriesProfile(output, 'w') as nc:

            station_groups = df.groupby(axes.station)
            unique_stations = list(station_groups.groups.keys())
            num_stations = len(unique_stations)

            # Calculate the max number of profiles
            profile_groups = df.groupby(axes.profile)
            unique_profiles = list(profile_groups.groups.keys())
            num_profiles = len(unique_profiles)
            nc.createDimension(daxes.profile, num_profiles)

            if reduce_dims is True and num_stations == 1:
                # If a singular station, remove the dimension
                station_dimensions = ()
                s_ind = None
            else:
                station_dimensions = (daxes.station,)
                nc.createDimension(daxes.station, num_stations)
                # The station this profile belongs to
                s_ind = nc.createVariable('stationIndex', 'i4', (daxes.profile,))

            station = nc.createVariable(axes.station, get_dtype(unique_stations), station_dimensions)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

            # The sample dimension holds one entry per observation row (or is unlimited)
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            data_columns += [axes.t, axes.z]  # time isn't really special, it's dimensioned by obs
            attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

            for i, (sname, srg) in enumerate(station_groups):
                station[i] = sname
                latitude[i] = df[axes.y][df[axes.station] == sname].dropna().iloc[0]
                longitude[i] = df[axes.x][df[axes.station] == sname].dropna().iloc[0]

            for j, (pname, pfg) in enumerate(profile_groups):
                profile[j] = pname
                row_size[j] = len(pfg)
                if s_ind is not None:
                    s_ind[j] = np.argwhere(station[:] == pfg[axes.station].dropna().iloc[0]).item()

            # Add back in the z axis that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            skips = ['stationIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (daxes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return RaggedTimeseriesProfile(output, **kwargs)
Code example #11
File: im.py Project: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            trajectory_group = df.groupby(axes.trajectory)

            if unlimited is True:
                max_obs = None
            else:
                max_obs = trajectory_group.size().max()
            nc.createDimension(axes.sample, max_obs)

            num_trajectories = len(trajectory_group)
            if reduce_dims is True and num_trajectories == 1:
                # If a singular trajectory, we can reduce that dimension if it is of size 1
                def ts(t_index, size):
                    return np.s_[0:size]

                default_dimensions = (axes.sample, )
                trajectory = nc.createVariable(axes.trajectory,
                                               get_dtype(df[axes.trajectory]))
            else:

                def ts(t_index, size):
                    return np.s_[t_index, 0:size]

                default_dimensions = (axes.trajectory, axes.sample)
                nc.createDimension(axes.trajectory, num_trajectories)
                trajectory = nc.createVariable(axes.trajectory,
                                               get_dtype(df[axes.trajectory]),
                                               (axes.trajectory, ))

            # Create all of the variables
            time = nc.createVariable(axes.t,
                                     'f8',
                                     default_dimensions,
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            z = nc.createVariable(axes.z,
                                  get_dtype(df[axes.z]),
                                  default_dimensions,
                                  fill_value=df[axes.z].dtype.type(
                                      cls.default_fill_value))
            latitude = nc.createVariable(axes.y,
                                         get_dtype(df[axes.y]),
                                         default_dimensions,
                                         fill_value=df[axes.y].dtype.type(
                                             cls.default_fill_value))
            longitude = nc.createVariable(axes.x,
                                          get_dtype(df[axes.x]),
                                          default_dimensions,
                                          fill_value=df[axes.x].dtype.type(
                                              cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            for i, (uid, gdf) in enumerate(trajectory_group):
                trajectory[i] = uid

                # tolist() converts to timezone-naive python datetime objects, preserving NaTs
                g = gdf[axes.t].tolist()
                # date2num converts NaTs to np.nan
                gg = nc4.date2num(g, units=cls.default_time_unit)
                # masked_invalid masks the np.nan values
                time[ts(i, gg.size)] = np.ma.masked_invalid(gg)

                lats = gdf[axes.y].fillna(get_fill_value(latitude)).values
                latitude[ts(i, lats.size)] = lats

                lons = gdf[axes.x].fillna(get_fill_value(longitude)).values
                longitude[ts(i, lons.size)] = lons

                zs = gdf[axes.z].fillna(get_fill_value(z)).values
                z[ts(i, zs.size)] = zs

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(nc,
                                                     var_name,
                                                     default_dimensions,
                                                     gdf[c],
                                                     zlib=True,
                                                     complevel=1)
                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates':
                                '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                     axes.y)
                            })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(gdf[c], v)
                    v[ts(i, vvalues.size)] = vvalues

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
Code example #12
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        # There should never be a CR file with only one trajectory, so the "reduce_dims" option is ignored
        _ = kwargs.pop('reduce_dims', False)  # noqa
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectory(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))

            # The sample dimension holds one entry per observation row (or is unlimited)
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each trajectory
            row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory, ))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the trajectory axis
            traj_vars = kwargs.pop('traj_vars', [])
            traj_columns = [p for p in traj_vars if p in df.columns]
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.trajectory, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (trajid, trg) in enumerate(trajectory_groups):
                trajectory[i] = trajid
                row_size[i] = len(trg)

                # Save any trajectory variables using the first value found
                # in the column.
                for c in traj_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(trg[c], v)[0]
                    try:
                        v[i] = vvalues
                    except BaseException:
                        L.exception('Failed to add {}'.format(c))
                        continue

            # Add all of the columns based on the sample dimension. Take all columns and remove the
            # trajectory, rowSize and other trajectory based columns.
            sample_columns = [
                f for f in df.columns
                if f not in traj_columns + ['rowSize', axes.trajectory]
            ]
            for c in sample_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectory(output, **kwargs)
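
A sketch of the traj_vars keyword (the 'platform' column and output path are hypothetical); columns listed there are written once per trajectory on the trajectory dimension instead of once per sample:

ContiguousRaggedTrajectory.from_dataframe(
    df, 'crt.nc',            # df as in the earlier trajectory examples
    traj_vars=['platform'],  # hypothetical per-trajectory column
)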
Code example #13
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        o_index_var = self.filter_by_attrs(
            sample_dimension=lambda x: x is not None)
        if not o_index_var:
            raise ValueError(
                'Could not find the "sample_dimension" attribute on any variables, '
                'is this a valid {}?'.format(self.__class__.__name__))
        else:
            o_index_var = o_index_var[0]
            o_dim = self.dimensions[
                o_index_var.sample_dimension]  # Sample dimension
            t_dim = o_index_var.dimensions

        # Trajectory
        row_sizes = o_index_var[:]
        traj_data = normalize_countable_array(axv.trajectory)
        traj_data = np.repeat(traj_data, row_sizes)

        # time
        time_data = get_masked_datetime_array(axv.t[:], axv.t).flatten()

        df_data = OrderedDict([(axes.t, time_data),
                               (axes.trajectory, traj_data)])

        building_index_to_drop = np.ones(o_dim.size, dtype=bool)

        extract_vars = copy(self.variables)
        # Skip the time and row index variables
        del extract_vars[o_index_var.name]
        del extract_vars[axes.t]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):

            # Trajectory dimensions
            if dvar.dimensions == t_dim:
                vdata = np.repeat(
                    generic_masked(dvar[:], attrs=self.vatts(dnam)), row_sizes)

            # Sample dimensions
            elif dvar.dimensions == (o_dim.name, ):
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))

            else:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                # Carry through size 1 variables
                if vdata.size == 1:
                    if vdata[0] is np.ma.masked:
                        L.warning(
                            "Skipping variable {} that is completely masked".
                            format(dnam))
                        continue
                else:
                    L.warning(
                        "Skipping variable {} since it didn't match any dimension sizes"
                        .format(dnam))
                    continue

            # Mark rows with data so we don't remove them with clean_rows
            if vdata.size == building_index_to_drop.size:
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Handle scalars here at the end
            if vdata.size == 1:
                vdata = vdata[0]

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
Code example #14
File: om.py Project: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        # Make a new index that is the Cartesian product of all of the values from all of the
        # levels of the old index. This is so we don't have to iterate over anything. The full
        # column of data can then be reshaped to the sizes of the final unique dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # If there is a single station, we can reduce that dimension if it is of size 1
                default_dimensions = (daxes.t, daxes.z)
                station_dimensions = ()
            else:
                default_dimensions = (daxes.t, daxes.z, daxes.station)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign over loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(daxes.t, None)
            else:
                nc.createDimension(daxes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            time[:] = date2num(unique_t,
                               units=cls.default_time_unit).astype('f8')

            nc.createDimension(daxes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the time axis and not the depth axis
            detach_z_vars = kwargs.pop('detach_z', [])
            detach_z_columns = [p for p in detach_z_vars if p in data_columns]
            for c in detach_z_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions[
                            0::2],  # this removes the second dimension (z)
                        df[c],
                        zlib=True,
                        complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {}'.format(axes.t, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                # Because we need access to the fillvalues here, we ask not to return
                # the values with them already filled.
                vvalues = get_ncdata_from_series(df[c], v, fillna=False)
                # Reshape to the full array, with Z
                vvalues = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)
                # The Z axis is always the second axis, take the mean over that axis
                vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()
                # Now reshape to the array without Z
                vvalues = vvalues.reshape(len(unique_t), unique_s.size)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            full_columns = [
                f for f in data_columns if f not in detach_z_columns
            ]
            for c in full_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                v[:] = vvalues.reshape(v.shape)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
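
A sketch of the detach_z keyword (the 'air_pressure' column and output path are hypothetical); such columns are averaged over the z axis and stored on the (t, station) dimensions only:

OrthogonalMultidimensionalTimeseriesProfile.from_dataframe(
    df, 'omtp.nc',              # df as in the earlier timeseries-profile examples
    detach_z=['air_pressure'],  # hypothetical z-independent column
)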
Code example #15
File: om.py Project: lucmehl/pocean-core
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        zvar = axv.z
        zs = len(self.dimensions[zvar.dimensions[0]])

        # Profiles
        pvar = axv.profile
        p = normalize_countable_array(pvar)
        ps = p.size
        p = p.repeat(zs)

        # Z
        z = generic_masked(zvar[:], attrs=self.vatts(zvar.name))
        try:
            z = np.tile(z, ps)
        except ValueError:
            z = z.flatten()

        # T
        tvar = axv.t
        t = tvar[:].repeat(zs)
        nt = get_masked_datetime_array(t, tvar).flatten()

        # X
        xvar = axv.x
        x = generic_masked(xvar[:].repeat(zs), attrs=self.vatts(xvar.name))

        # Y
        yvar = axv.y
        y = generic_masked(yvar[:].repeat(zs), attrs=self.vatts(yvar.name))

        df_data = OrderedDict([(axes.t, nt), (axes.x, x), (axes.y, y),
                               (axes.z, z), (axes.profile, p)])

        building_index_to_drop = np.ones(t.size, dtype=bool)

        # Axes variables are already processed so skip them
        extract_vars = copy(self.variables)
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):

            # Profile dimension
            if dvar.dimensions == pvar.dimensions:
                vdata = generic_masked(dvar[:].repeat(zs).astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Z dimension
            elif dvar.dimensions == zvar.dimensions:
                vdata = generic_masked(np.tile(dvar[:], ps).flatten().astype(
                    dvar.dtype),
                                       attrs=self.vatts(dnam))
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Profile, z dimension
            elif dvar.dimensions == pvar.dimensions + zvar.dimensions:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            else:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                # Carry through size 1 variables
                if vdata.size == 1:
                    if vdata[0] is np.ma.masked:
                        L.warning(
                            "Skipping variable {} that is completely masked".
                            format(dnam))
                        continue
                    vdata = vdata[0]
                else:
                    L.warning(
                        "Skipping variable {} since it didn't match any dimension sizes"
                        .format(dnam))
                    continue

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
Code example #16
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [ d for d in df.columns if d not in axes ]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            trajectory_group = df.groupby(axes.trajectory)

            if unlimited is True:
                max_obs = None
            else:
                max_obs = trajectory_group.size().max()
            nc.createDimension(daxes.sample, max_obs)

            num_trajectories = len(trajectory_group)
            if reduce_dims is True and num_trajectories == 1:
                # If a singular trajectory, we can reduce that dimension if it is of size 1
                def ts(t_index, size):
                    return np.s_[0:size]
                default_dimensions = (daxes.sample,)
                trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
            else:
                def ts(t_index, size):
                    return np.s_[t_index, 0:size]
                default_dimensions = (daxes.trajectory, daxes.sample)
                nc.createDimension(daxes.trajectory, num_trajectories)
                trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(cls.default_fill_value))
            z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(cls.default_fill_value))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions,
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(
                            axes.t, axes.z, axes.x, axes.y
                        )
                    })

            for i, (uid, gdf) in enumerate(trajectory_group):
                trajectory[i] = uid

                times = get_ncdata_from_series(gdf[axes.t], time)
                time[ts(i, times.size)] = times

                lats = get_ncdata_from_series(gdf[axes.y], latitude)
                latitude[ts(i, lats.size)] = lats

                lons = get_ncdata_from_series(gdf[axes.x], longitude)
                longitude[ts(i, lons.size)] = lons

                zs = gdf[axes.z].fillna(get_fill_value(z)).values
                z[ts(i, zs.size)] = zs

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(gdf[c], v)
                    slicer = ts(i, vvalues.size)
                    v[slicer] = vvalues

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
Code example #17
    def test_get_default_axes(self):
        assert get_default_axes() == (
            'trajectory',
            'station',
            'profile',
            'obs',
            't',
            'x',
            'y',
            'z',
        )

        new_defaults = {
            'trajectory': 'a',
            'station':    'b',
            'profile':    'c',
            'sample':     'h',
            't':          'd',
            'x':          'e',
            'y':          'f',
            'z':          'g',
        }
        assert get_default_axes(new_defaults) == (
            'a',
            'b',
            'c',
            'h',
            'd',
            'e',
            'f',
            'g',
        )

        new_defaults = {
            'trajectory': 'a',
            'station':    'b',
            'profile':    'c'
        }
        assert get_default_axes(new_defaults) == (
            'a',
            'b',
            'c',
            'obs',
            't',
            'x',
            'y',
            'z',
        )

        # Time is not a valid axis key
        bad_defaults = {
            'time': 'a'
        }
        with self.assertRaises(TypeError):
            get_default_axes(bad_defaults)

        # Can't have duplicate values
        bad_defaults = {
            'x': 'a',
            'y': 'a'
        }
        with self.assertRaises(ValueError):
            get_default_axes(bad_defaults)

        # but you can with the sample dimension
        bad_defaults = {
            't': 'time',
            'sample': 'time'
        }
        assert get_default_axes(bad_defaults) == (
            'trajectory',
            'station',
            'profile',
            'time',
            'time',
            'x',
            'y',
            'z',
        )
Code example #18
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            profile_group = df.groupby(axes.profile)

            if unlimited is True:
                max_profiles = None
            else:
                max_profiles = df[axes.profile].unique().size
            nc.createDimension(daxes.profile, max_profiles)

            max_zs = profile_group.size().max()
            nc.createDimension(daxes.z, max_zs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', (daxes.profile, ))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         (daxes.profile, ))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          (daxes.profile, ))
            z = nc.createVariable(
                axes.z,
                get_dtype(df[axes.z]), (daxes.profile, daxes.z),
                fill_value=df[axes.z].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 (daxes.profile, daxes.z),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })

            # Write values for each profile within profile_group
            for i, (uid, pdf) in enumerate(profile_group):
                profile[i] = uid

                time[i] = date2num(pdf[axes.t].iloc[0],
                                   units=cls.default_time_unit)
                latitude[i] = pdf[axes.y].iloc[0]
                longitude[i] = pdf[axes.x].iloc[0]

                zvalues = pdf[axes.z].fillna(z._FillValue).values
                sl = slice(0, zvalues.size)
                z[i, sl] = zvalues

                for c in data_columns:
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(pdf[c], v)

                    sl = slice(0, vvalues.size)
                    v[i, sl] = vvalues

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)
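
A sketch of driving the profile writer above, under the same assumed pocean.dsg import path. Note that t, x, and y must be constant within each profile, because the writer stores only .iloc[0] of each group:

import pandas as pd

from pocean.dsg import IncompleteMultidimensionalProfile  # assumed export path

# Two profiles with two depth levels each
df = pd.DataFrame({
    'profile': [1, 1, 2, 2],
    't': pd.to_datetime(['2020-01-01'] * 2 + ['2020-01-02'] * 2),
    'x': [-70.0, -70.0, -69.9, -69.9],
    'y': [40.0, 40.0, 40.1, 40.1],
    'z': [0.0, 10.0, 0.0, 10.0],
    'temperature': [15.0, 14.2, 15.1, 14.0],
})
with IncompleteMultidimensionalProfile.from_dataframe(df, 'profiles.nc') as ncd:
    print(ncd.variables.keys())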
Code example #19
File: r.py Project: pyoceans/pocean-core
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        axv = get_mapped_axes_variables(self, axes)

        # Profile dimension
        p_var = self.filter_by_attrs(cf_role='profile_id')[0]
        p_dim = self.dimensions[p_var.dimensions[0]]

        # Station dimension
        s_var = self.filter_by_attrs(cf_role='timeseries_id')[0]
        if s_var.ndim == 1:
            s_dim = self.dimensions[s_var.dimensions[0]]
        elif s_var.ndim == 0:
            s_dim = None
        else:
            raise ValueError('The station (timeseries_id) variable must have 0 or 1 dimensions')

        # Station index
        r_index_var = self.filter_by_attrs(instance_dimension=lambda x: x is not None)
        if not r_index_var:
            # A reduced netCDF file, set station to 0 so it pulls the first value
            # of the variable that identifies the stations
            r_index_var = [0]
        else:
            r_index_var = r_index_var[0]

        # Sample (obs) dimension
        o_index_var = self.filter_by_attrs(sample_dimension=lambda x: x is not None)
        if not o_index_var:
            raise ValueError(
                'Could not find the "sample_dimension" attribute on any variables, '
                'is this a valid {}?'.format(self.__class__.__name__)
            )
        else:
            o_index_var = o_index_var[0]

        # Sample dimension
        # Since this is a flat dataframe, everything is the length of the obs dimension
        row_sizes = o_index_var[:]
        o_dim = self.dimensions[o_index_var.sample_dimension]

        profile_indexes = normalize_countable_array(p_var, count_if_none=p_dim.size)
        p = np.repeat(profile_indexes, row_sizes)

        s_count = s_dim.size if s_dim is not None else 1
        stat_indexes = normalize_countable_array(s_var, count_if_none=s_count)
        r = np.ma.masked_all(o_dim.size, dtype=stat_indexes.dtype)

        # Lat and Lon are on the station dimension
        xvar = axv.x
        x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)
        yvar = axv.y
        y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)
        si = 0
        for i in np.arange(stat_indexes.size):
            ei = si + o_index_var[i]
            r[si:ei] = np.array(stat_indexes[r_index_var[i]])
            x[si:ei] = xvar[i]
            y[si:ei] = yvar[i]
            si = ei
        x = generic_masked(x, minv=-180, maxv=180)
        y = generic_masked(y, minv=-90, maxv=90)

        # Time and Z are on the sample (obs) dimension
        tvar = axv.t
        t = get_masked_datetime_array(
            generic_masked(tvar[:].flatten(), attrs=self.vatts(tvar.name)),
            tvar
        )
        z = generic_masked(axv.z[:].flatten(), attrs=self.vatts(axv.z.name))

        df_data = OrderedDict([
            (axes.t, t),
            (axes.x, x),
            (axes.y, y),
            (axes.z, z),
            (axes.station, r),
            (axes.profile, p)
        ])

        building_index_to_drop = np.ones(o_dim.size, dtype=bool)

        extract_vars = copy(self.variables)
        # Skip the station and row index variables
        del extract_vars[o_index_var.name]
        del extract_vars[r_index_var.name]
        # Axes variables are already processed so skip them
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):

            # Profile dimensions
            if dvar.dimensions == (p_dim.name,):
                vdata = generic_masked(
                    np.repeat(
                        dvar[:].flatten().astype(dvar.dtype),
                        row_sizes
                    ),
                    attrs=self.vatts(dnam)
                )

            # Sample dimensions
            elif dvar.dimensions == (o_dim.name,):
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))

            else:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype), attrs=self.vatts(dnam))
                # Carry through size 1 variables
                if vdata.size == 1:
                    if vdata[0] is np.ma.masked:
                        L.warning("Skipping variable {} that is completely masked".format(dnam))
                        continue
                else:
                    L.warning("Skipping variable {} since it didn't match any dimension sizes".format(dnam))
                    continue

            # Mark rows with data so we don't remove them with clean_rows
            if vdata.size == building_index_to_drop.size:
                building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa

            # Handle scalars here at the end
            if vdata.size == 1:
                vdata = vdata[0]

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
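
The si/ei bookkeeping in the reader above is a windowed walk over the row_size counts; a self-contained numpy cross-check of that mechanic, with illustrative values:

import numpy as np

row_sizes = np.array([3, 1, 2])          # obs per profile (the count variable)
profile_ids = np.array([101, 102, 103])

# np.repeat builds the same flat column the si/ei loop builds for `p`
assert np.repeat(profile_ids, row_sizes).tolist() == [101, 101, 101, 102, 103, 103]

# The window boundaries fall out of a cumulative sum
ends = np.cumsum(row_sizes)
starts = ends - row_sizes
assert list(zip(starts, ends)) == [(0, 3), (3, 4), (4, 6)]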
Code example #20
File: cr.py Project: pyoceans/pocean-core
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        # The index variable (trajectory_index) is identified by having an
        # attribute with name of instance_dimension whose value is the instance
        # dimension name (trajectory in this example). The index variable must
        # have the profile dimension as its sole dimension, and must be type
        # integer. Each value in the index variable is the zero-based trajectory
        # index that the profile belongs to i.e. profile p belongs to trajectory
        # i=trajectory_index(p), as in section H.2.5.
        r_index_var = self.filter_by_attrs(
            instance_dimension=lambda x: x is not None)
        if not r_index_var:
            raise ValueError(
                'Could not find the "instance_dimension" attribute on any variables, '
                'is this a valid {}?'.format(self.__class__.__name__))
        else:
            r_index_var = r_index_var[0]
        p_dim = self.dimensions[r_index_var.dimensions[0]]  # Profile dimension

        # We should probably use this below to test for dimensionality of variables?
        # r_dim = self.dimensions[r_index_var.instance_dimension]  # Trajectory dimension

        # The count variable (row_size) contains the number of elements for
        # each profile, which must be written contiguously. The count variable
        # is identified by having an attribute with name sample_dimension whose
        # value is the sample dimension (obs in this example) being counted. It
        # must have the profile dimension as its sole dimension, and must be
        # type integer
        o_index_var = self.filter_by_attrs(
            sample_dimension=lambda x: x is not None)
        if not o_index_var:
            raise ValueError(
                'Could not find the "sample_dimension" attribute on any variables, '
                'is this a valid {}?'.format(self.__class__.__name__))
        else:
            o_index_var = o_index_var[0]
        o_dim = self.dimensions[
            o_index_var.sample_dimension]  # Sample dimension

        profile_indexes = normalize_countable_array(axv.profile,
                                                    count_if_none=p_dim.size)
        p = np.ma.masked_all(o_dim.size, dtype=profile_indexes.dtype)

        traj_indexes = normalize_countable_array(axv.trajectory)
        r = np.ma.masked_all(o_dim.size, dtype=traj_indexes.dtype)

        tvar = axv.t
        t = np.ma.masked_all(o_dim.size, dtype=tvar.dtype)

        xvar = axv.x
        x = np.ma.masked_all(o_dim.size, dtype=xvar.dtype)

        yvar = axv.y
        y = np.ma.masked_all(o_dim.size, dtype=yvar.dtype)
        si = 0

        # Sample (obs) dimension
        zvar = axv.z
        z = generic_masked(zvar[:].flatten(), attrs=self.vatts(zvar.name))

        for i in np.arange(profile_indexes.size):
            ei = si + o_index_var[i]
            p[si:ei] = profile_indexes[i]
            r[si:ei] = np.array(traj_indexes[r_index_var[i]])
            t[si:ei] = tvar[i]
            x[si:ei] = xvar[i]
            y[si:ei] = yvar[i]
            si = ei

        # T
        nt = get_masked_datetime_array(t, tvar).flatten()

        # X and Y
        x = generic_masked(x, minv=-180, maxv=180)
        y = generic_masked(y, minv=-90, maxv=90)

        df_data = OrderedDict([(axes.t, nt), (axes.x, x), (axes.y, y),
                               (axes.z, z), (axes.trajectory, r),
                               (axes.profile, p)])

        building_index_to_drop = np.ones(o_dim.size, dtype=bool)

        extract_vars = copy(self.variables)
        # Skip the traj and row index variables
        del extract_vars[o_index_var.name]
        del extract_vars[r_index_var.name]
        # Axes variables are already processed so skip them
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):

            # Profile dimensions
            if dvar.dimensions == (p_dim.name, ):
                vdata = np.ma.masked_all(o_dim.size, dtype=dvar.dtype)
                si = 0
                for j in np.arange(profile_indexes.size):
                    ei = si + o_index_var[j]
                    vdata[si:ei] = dvar[j]
                    si = ei

            # Sample dimensions
            elif dvar.dimensions == (o_dim.name, ):
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))

            else:
                vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                       attrs=self.vatts(dnam))
                # Carry through size 1 variables
                if vdata.size == 1:
                    if vdata[0] is np.ma.masked:
                        L.warning(
                            "Skipping variable {} that is completely masked".
                            format(dnam))
                        continue
                else:
                    L.warning(
                        "Skipping variable {} since it didn't match any dimension sizes"
                        .format(dnam))
                    continue

            # Mark rows with data so we don't remove them with clean_rows
            if vdata.size == building_index_to_drop.size:
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Handle scalars here at the end
            if vdata.size == 1:
                vdata = vdata[0]

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
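
Round-tripping the reader above is a one-liner; the filename is a hypothetical stand-in for any file written by a matching from_dataframe, and the import path is assumed as elsewhere:

from pocean.dsg import ContiguousRaggedTrajectoryProfile  # assumed export path

with ContiguousRaggedTrajectoryProfile('crtp.nc') as ncd:  # hypothetical file
    df = ncd.to_dataframe(clean_rows=True)
print(df.head())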
Code example #21
File: im.py Project: lucmehl/pocean-core
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self,
                                        axes,
                                        skip=[axes.profile, axes.station])

        # T
        t = get_masked_datetime_array(axv.t[:], axv.t).flatten()

        # X
        x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name)).flatten()

        # Y
        y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name)).flatten()

        # Z
        z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name)).flatten()

        # Trajectories
        rvar = axv.trajectory
        p = normalize_countable_array(rvar)

        # The dimension that the trajectory id variable lacks is the one the
        # trajectory ids need to be repeated along
        dim_diff = self.dimensions[list(
            set(axv.t.dimensions).difference(set(rvar.dimensions)))[0]]
        if dim_diff:
            p = p.repeat(dim_diff.size)

        df_data = OrderedDict([(axes.t, t), (axes.x, x), (axes.y, y),
                               (axes.z, z), (axes.trajectory, p)])

        building_index_to_drop = np.ones(t.size, dtype=bool)

        # Axes variables are already processed so skip them
        extract_vars = copy(self.variables)
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                   attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".
                              format(dnam))
                    continue
                vdata = vdata[0]
            else:
                if dvar[:].flatten().size != t.size:
                    L.warning("Variable {} is not the correct size, skipping.".
                              format(dnam))
                    continue

                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
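
The dim_diff trick above repeats each trajectory id along whichever dimension the id variable lacks; a standalone numpy sketch of that expansion, with illustrative sizes:

import numpy as np

# An id variable dimensioned (trajectory,) against data dimensioned
# (trajectory, obs): each id repeats along the missing obs axis
traj_ids = np.array(['gl-001', 'gl-002'])
obs_per_trajectory = 3  # illustrative size of the obs dimension
flat_ids = traj_ids.repeat(obs_per_trajectory)
assert flat_ids.tolist() == ['gl-001'] * 3 + ['gl-002'] * 3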
Code example #22
    def to_dataframe(self, clean_cols=True, clean_rows=True, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        svar = axv.station
        s = normalize_countable_array(svar)

        # T
        t = get_masked_datetime_array(axv.t[:], axv.t)
        n_times = t.size

        # X
        x = generic_masked(axv.x[:], attrs=self.vatts(axv.x.name))

        # Y
        y = generic_masked(axv.y[:], attrs=self.vatts(axv.y.name))

        # Z
        z = generic_masked(axv.z[:], attrs=self.vatts(axv.z.name))
        n_z = z.size

        # denormalize table structure
        t = np.repeat(t, s.size * n_z)
        z = np.tile(np.repeat(z, s.size), n_times)
        s = np.tile(s, n_z * n_times)
        y = np.tile(y, n_times * n_z)
        x = np.tile(x, n_times * n_z)

        df_data = OrderedDict([
            (axes.t, t),
            (axes.x, x),
            (axes.y, y),
            (axes.z, z),
            (axes.station, s),
        ])

        building_index_to_drop = np.ones(t.size, dtype=bool)

        # Axes variables are already processed so skip them
        extract_vars = copy(self.variables)
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                   attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".
                              format(dnam))
                    continue
                vdata = vdata[0]
            else:
                if dvar[:].flatten().size != t.size:
                    L.warning("Variable {} is not the correct size, skipping.".
                              format(dnam))
                    continue

                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
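
The repeat/tile denormalization above is equivalent to a C-order flattening of the (time, z, station) cube; a numpy cross-check with toy sizes:

import numpy as np

t = np.array([0, 1])            # 2 time steps
z = np.array([0.0, 10.0])       # 2 depths
s = np.array(['a', 'b', 'c'])   # 3 stations

tt = np.repeat(t, s.size * z.size)
zz = np.tile(np.repeat(z, s.size), t.size)
ss = np.tile(s, z.size * t.size)

# Identical to nested C-order iteration over (time, z, station)
expected = [(ti, zi, si) for ti in t for zi in z for si in s]
assert list(zip(tt, zz, ss)) == expected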
Code example #23
File: cr.py Project: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(daxes.profile, num_profiles)
            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))
            profile[:] = np.array(unique_profiles)

            # The sample dimension holds every observation row; each profile's rows are written contiguously
            num_obs = len(df)
            nc.createDimension(daxes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4',
                                      (daxes.profile, ))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile, ))

            # Create all of the axis variables
            time = nc.createVariable(axes.t,
                                     'f8', (daxes.profile, ),
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            latitude = nc.createVariable(
                axes.y,
                get_dtype(df[axes.y]), (daxes.profile, ),
                fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(
                axes.x,
                get_dtype(df[axes.x]), (daxes.profile, ),
                fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [d for d in df.columns if d not in axes]
            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the profile axis
            profile_vars = kwargs.pop('profile_vars', [])
            profile_columns = [p for p in profile_vars if p in data_columns]
            for c in profile_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.profile, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t],
                                                     time).astype('f8')[0]
                    latitude[j] = get_ncdata_from_series(
                        pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(
                        pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

                    # Save any profile variables on the "profile" index using the first value found
                    # in the column.
                    for c in profile_columns:
                        var_name = cf_safe_name(c)
                        if var_name not in nc.variables:
                            continue
                        v = nc.variables[var_name]
                        vvalues = get_ncdata_from_series(pfg[c], v)[0]
                        try:
                            v[j] = vvalues
                        except BaseException:
                            L.exception('Failed to add {}'.format(c))
                            continue

            # Add back in the z axis that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            sample_columns = [
                f for f in data_columns + [axes.z] if f not in profile_columns
            ]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [d for d in sample_columns if d not in skips]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
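
A sketch of calling the writer above with its profile_vars keyword, so a column that is constant within each profile is stored once per profile instead of once per sample (import path assumed as before; data is illustrative):

import pandas as pd

from pocean.dsg import ContiguousRaggedTrajectoryProfile  # assumed export path

df = pd.DataFrame({
    'trajectory': ['tr1'] * 4,
    'profile': [1, 1, 2, 2],
    't': pd.to_datetime(['2020-01-01'] * 2 + ['2020-01-02'] * 2),
    'x': [-70.0, -70.0, -69.9, -69.9],
    'y': [40.0, 40.0, 40.1, 40.1],
    'z': [0.0, 5.0, 0.0, 5.0],
    'platform': ['glider'] * 4,           # constant within each profile
    'salinity': [35.1, 35.2, 35.0, 35.3],
})
ContiguousRaggedTrajectoryProfile.from_dataframe(df, 'crtp.nc', profile_vars=['platform'])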
Code example #24
    def to_dataframe(self, clean_cols=False, clean_rows=False, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        axv = get_mapped_axes_variables(self, axes)

        # T
        t = get_masked_datetime_array(axv.t[:], axv.t)

        # X
        x = generic_masked(axv.x[:].repeat(t.size),
                           attrs=self.vatts(axv.x.name))

        # Y
        y = generic_masked(axv.y[:].repeat(t.size),
                           attrs=self.vatts(axv.y.name))

        # Z
        if axv.z is not None:
            z = generic_masked(axv.z[:].repeat(t.size),
                               attrs=self.vatts(axv.z.name))
        else:
            z = None

        svar = axv.station
        s = normalize_countable_array(svar)
        s = np.repeat(s, t.size)

        # Now repeat t per station. A multi-station file is detected by the
        # x variable carrying a station dimension (ndim == 1)
        if axv.x.ndim == 1:
            t = np.repeat(t, len(svar))

        df_data = OrderedDict([
            (axes.t, t),
            (axes.x, x),
            (axes.y, y),
            (axes.z, z),
            (axes.station, s),
        ])

        building_index_to_drop = np.ma.zeros(t.size, dtype=bool)

        # Axes variables are already processed so skip them
        extract_vars = copy(self.variables)
        for ncvar in axv._asdict().values():
            if ncvar is not None and ncvar.name in extract_vars:
                del extract_vars[ncvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):
            vdata = generic_masked(dvar[:].flatten().astype(dvar.dtype),
                                   attrs=self.vatts(dnam))

            # Carry through size 1 variables
            if vdata.size == 1:
                if vdata[0] is np.ma.masked:
                    L.warning("Skipping variable {} that is completely masked".
                              format(dnam))
                    continue
            else:
                if dvar[:].flatten().size != t.size:
                    L.warning("Variable {} is not the correct size, skipping.".
                              format(dnam))
                    continue

            # Mark rows with data so we don't remove them with clean_rows
            if vdata.size == building_index_to_drop.size:
                building_index_to_drop = (building_index_to_drop == True) & (
                    vdata.mask == True)  # noqa

            # Handle scalars here at the end
            if vdata.size == 1:
                vdata = vdata[0]

            df_data[dnam] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
Code example #25
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        # Downcast anything from int64 to int32
        df = downcast_dataframe(df)

        # Make a new index that is the Cartesian product of all of the unique
        # values of the old index. This is so we don't have to iterate over
        # anything: each full data column can be reshaped directly to the
        # sizes of the final unique dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # With a single station, that dimension can be dropped since it is of size 1
                def ts():
                    return np.s_[:, :]

                default_dimensions = (axes.t, axes.z)
                station_dimensions = ()
            else:

                def ts():
                    return np.s_[:, :, :]

                default_dimensions = (axes.t, axes.z, axes.station)
                station_dimensions = (axes.station, )
                nc.createDimension(axes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign in a loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(axes.t, None)
            else:
                nc.createDimension(axes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (axes.t, ))
            time[:] = nc4.date2num(unique_t, units=cls.default_time_unit)

            nc.createDimension(axes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (axes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                v[ts()] = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
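
The reindex-to-Cartesian-product step above is worth seeing in isolation: missing (t, z, station) combinations become NaN rows, which is what lets every data column reshape cleanly. A pandas-only sketch with illustrative values:

import pandas as pd

df = pd.DataFrame({
    't': [0, 0, 1],
    'z': [0, 5, 0],
    'station': ['a', 'a', 'a'],
    'temp': [10.0, 9.5, 10.2],
}).set_index(['t', 'z', 'station'])

full = df.reindex(pd.MultiIndex.from_product(df.index.levels,
                                             names=['t', 'z', 'station']))
# (t=1, z=5, station='a') was absent from the input, so it appears as NaN
assert len(full) == 4
assert full['temp'].isna().sum() == 1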
Code example #26
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)
            has_z = axes.z is not None

            if reduce_dims is True and num_stations == 1:
                # With a single station, we can drop that dimension since it is of size 1
                def ts(i):
                    return np.s_[:]

                default_dimensions = (daxes.t, )
                station_dimensions = ()
            else:

                def ts(i):
                    return np.s_[i, :]

                default_dimensions = (daxes.station, daxes.t)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, num_stations)

            # Set the coordinates attribute correctly
            coordinates = [axes.t, axes.x, axes.y]
            if has_z is True:
                coordinates.insert(1, axes.z)
            coordinates = ' '.join(coordinates)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(daxes.t, t.size)
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            station = nc.createVariable(axes.station,
                                        get_dtype(df[axes.station]),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            if has_z is True:
                z = nc.createVariable(axes.z,
                                      get_dtype(df[axes.z]),
                                      station_dimensions,
                                      fill_value=df[axes.z].dtype.type(
                                          cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            time[:] = get_ncdata_from_series(t, time)

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}),
                        {'coordinates': coordinates})

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                if has_z is True:
                    # TODO: write a test for a Z with a _FillValue
                    z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[ts(i)] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
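
Finally, a hedged sketch of the timeseries writer above; reduce_dims=True drops the size-1 station dimension for a single-station table (import path, filename, and data values are assumptions):

import pandas as pd

from pocean.dsg import OrthogonalMultidimensionalTimeseries  # assumed export path

df = pd.DataFrame({
    'station': ['buoy-1'] * 3,
    't': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
    'x': [-70.0] * 3,
    'y': [40.0] * 3,
    'z': [0.0] * 3,
    'wind_speed': [4.2, 5.0, 3.8],
})
OrthogonalMultidimensionalTimeseries.from_dataframe(df, 'timeseries.nc', reduce_dims=True)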