コード例 #1
0
 def nc_attributes(self, axes):
     """Merge the inherited attributes with timeSeriesProfile metadata.

     :param axes: namedtuple mapping axis roles (station, x, y, z, t) to the
         actual variable names used in the dataset
     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(OrthogonalMultidimensionalTimeseriesProfile,
                       self).nc_attributes()
     specific = {
         'global': {
             'featureType': 'timeSeriesProfile',
             'cdm_data_type': 'TimeseriesProfile'
         },
         axes.station: {
             'cf_role': 'timeseries_id',
             'long_name': 'station identifier'
         },
         axes.x: {'axis': 'X'},
         axes.y: {'axis': 'Y'},
         axes.z: {'axis': 'Z'},
         axes.t: {
             'units': self.default_time_unit,
             'standard_name': 'time',
             'axis': 'T'
         },
     }
     return dict_update(inherited, specific)
コード例 #2
0
File: nc.py — Project: kerfoot/GUTILS
def create_dataset(file, reader_class, deployments_path, subset, template,
                   profile_id_type, **filter_args):
    """Process a single source data file into a netCDF dataset.

    :param file: path to the raw data file, expected to live somewhere under
        a per-deployment directory inside ``deployments_path``
    :param reader_class: reader used by ``process_dataset`` to parse ``file``
    :param deployments_path: root directory containing deployment directories
    :param subset: passed through to ``create_netcdf``
    :param template: attribute template name or path for ``read_attrs``
    :param profile_id_type: passed through to ``create_netcdf``
    :param filter_args: filter overrides; ``None`` values are ignored
    :return: ``create_netcdf``'s return value, or 1 if processing failed
    :raises ValueError: if no per-deployment directory can be determined
    """
    # Remove None filters from the arguments
    filter_args = {k: v for k, v in filter_args.items() if v is not None}

    # Figure out the netCDF output path based on the file and the deployments_path.
    # Walk up from the file; the last parent seen *before* reaching
    # deployments_path is the individual deployment directory.
    dep_path = Path(deployments_path)
    file_path = Path(file)
    individual_dep_path = None
    for pp in file_path.parents:
        if dep_path == pp:
            break
        individual_dep_path = pp

    # A file sitting directly inside deployments_path leaves no intermediate
    # directory; fail loudly instead of raising TypeError on `None / 'config'`.
    if individual_dep_path is None:
        raise ValueError(
            'Could not determine a deployment directory for {} under {}'.format(
                file, deployments_path
            )
        )

    config_path = individual_dep_path / 'config'

    # Extract the filters from the config and override with passed in filters that are not None
    attrs = read_attrs(config_path, template=template)
    file_filters = attrs.pop('filters', {})
    filters = dict_update(file_filters, filter_args)

    processed_df, mode = process_dataset(file, reader_class, **filters)

    if processed_df is None:
        return 1

    output_path = individual_dep_path / mode / 'netcdf'
    return create_netcdf(attrs,
                         processed_df,
                         output_path,
                         mode,
                         profile_id_type,
                         subset=subset)
コード例 #3
0
 def nc_attributes(self):
     """Merge the inherited attributes with timeseries metadata.

     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(OrthogonalMultidimensionalTimeseries, self).nc_attributes()
     ts_atts = {
         'global': {
             'featureType': 'timeseries',
             'cdm_data_type': 'Timeseries'
         },
         'station': {
             'cf_role': 'timeseries_id',
             'long_name': 'station identifier'
         },
         'time': {
             'units': self.default_time_unit,
             'standard_name': 'time',
             'axis': 'T'
         },
         'latitude': {'axis': 'Y'},
         'longitude': {'axis': 'X'},
         'z': {'axis': 'Z'},
     }
     return dict_update(inherited, ts_atts)
コード例 #4
0
 def nc_attributes(self, axes, daxes):
     """Merge the inherited attributes with profile metadata.

     :param axes: namedtuple of coordinate variable names (profile, x, y, z, t)
     :param daxes: namedtuple of dimension names (unused here; kept for a
         uniform signature across the writer classes)
     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(IncompleteMultidimensionalProfile, self).nc_attributes()
     profile_atts = {
         'global': {
             'featureType': 'profile',
             'cdm_data_type': 'Profile'
         },
         axes.profile: {
             'cf_role': 'profile_id',
             'long_name': 'profile identifier'
         },
         axes.x: {'axis': 'X'},
         axes.y: {'axis': 'Y'},
         axes.z: {'axis': 'Z'},
         axes.t: {
             'units': self.default_time_unit,
             'standard_name': 'time',
             'axis': 'T'
         },
     }
     return dict_update(inherited, profile_atts)
コード例 #5
0
 def nc_attributes(self, axes, daxes):
     """Merge the inherited attributes with trajectory metadata.

     :param axes: namedtuple of coordinate variable names (trajectory, x, y, z, t)
     :param daxes: namedtuple of dimension names; ``daxes.sample`` names the
         ragged sample dimension referenced by ``rowSize``
     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(ContiguousRaggedTrajectory, self).nc_attributes()
     traj_atts = {
         'global': {
             'featureType': 'trajectory',
             'cdm_data_type': 'Trajectory'
         },
         axes.trajectory: {
             'cf_role': 'trajectory_id',
             'long_name': 'trajectory identifier',
             'ioos_category': 'identifier'
         },
         axes.x: {'axis': 'X'},
         axes.y: {'axis': 'Y'},
         axes.z: {'axis': 'Z'},
         axes.t: {
             'units': self.default_time_unit,
             'standard_name': 'time',
             'axis': 'T'
         },
         # CF ragged-array count variable
         'rowSize': {
             'sample_dimension': daxes.sample
         },
     }
     return dict_update(inherited, traj_atts)
コード例 #6
0
def get_calculated_attributes(df, axes=None, history=None):
    """Automate netCDF attribute generation from the data itself.

    This is a wrapper for the other four functions (geographic, vertical,
    temporal and creation attributes), which could be called separately.

    :param df: data (Pandas DataFrame)
    :param axes: keys (x,y,z,t) are associated with actual column names (dictionary)
    :param history: text initializing audit trail for modifications to the
        original data (optional, string)
    :return: dictionary of global attributes
    """

    axes = get_default_axes(axes)
    attrs = get_geographic_attributes(df, axes)
    attrs = dict_update(attrs, get_vertical_attributes(df, axes))
    attrs = dict_update(attrs, get_temporal_attributes(df, axes))
    attrs = dict_update(attrs, get_creation_attributes(history))

    return attrs
コード例 #7
0
File: nc.py — Project: zeroYXX/GUTILS
def read_attrs(config_path=None, template=None):
    """Build the attribute dictionary for a deployment.

    Merges, in increasing order of precedence: the bundled (or explicitly
    given) template JSON, the deployment's instruments.json, and its
    deployment.json.

    :param config_path: directory holding instruments.json / deployment.json
        (optional; when falsy only the template is used)
    :param template: template name under the bundled templates directory, or
        a direct path to a JSON file; defaults to 'trajectory'
    :return: merged attribute dictionary
    """
    template = template or 'trajectory'

    # A template may be a direct file path; otherwise resolve it by name
    # against the bundled templates directory, falling back to 'trajectory'.
    if os.path.isfile(template):
        default_attrs_path = template
    else:
        template_dir = os.path.join(os.path.dirname(__file__), 'templates')
        default_attrs_path = os.path.join(template_dir, '{}.json'.format(template))
        if not os.path.isfile(default_attrs_path):
            L.error("Template path {} not found, using defaults.".format(default_attrs_path))
            default_attrs_path = os.path.join(template_dir, 'trajectory.json')

    # Load in template defaults
    defaults = dict(MetaInterface.from_jsonfile(default_attrs_path))

    def load_config(name):
        # Read an optional JSON config file; a missing file yields {}
        path = os.path.join(config_path, name)
        if os.path.isfile(path):
            return dict(MetaInterface.from_jsonfile(path))
        return {}

    # Instrument and deployment attributes (including some global attributes)
    ins = load_config("instruments.json") if config_path else {}
    deps = load_config("deployment.json") if config_path else {}

    # Update, highest precedence updates last
    return dict_update(dict_update(defaults, ins), deps)
コード例 #8
0
 def nc_attributes(self, axes, daxes):
     """Merge the inherited attributes with ragged timeSeriesProfile metadata.

     :param axes: namedtuple of coordinate variable names
         (station, profile, x, y, z, t)
     :param daxes: namedtuple of dimension names; ``daxes.station`` and
         ``daxes.sample`` are referenced by the ragged index/count variables
     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(RaggedTimeseriesProfile, self).nc_attributes()
     specific = {
         'global': {
             'featureType': 'timeSeriesProfile',
             'cdm_data_type': 'TimeseriesProfile',
             'cdm_timeseries_variables': axes.station,
             'cdm_profile_variables': axes.profile,
             'subsetVariables': '{x},{y},{t},{station}'.format(**axes._asdict())
         },
         axes.station: {
             'cf_role': 'timeseries_id',
             'long_name': 'station identifier',
             'ioos_category': 'identifier'
         },
         axes.profile: {
             'cf_role': 'profile_id',
             'long_name': 'profile identifier',
             'ioos_category': 'identifier'
         },
         axes.x: {'axis': 'X'},
         axes.y: {'axis': 'Y'},
         axes.z: {'axis': 'Z'},
         axes.t: {
             'units': self.default_time_unit,
             'standard_name': 'time',
             'axis': 'T'
         },
         # CF ragged-array index/count variables
         'stationIndex': {
             'long_name': 'which station this profile belongs to',
             'instance_dimension': daxes.station
         },
         'rowSize': {
             'long_name': 'number of obs in this profile',
             'sample_dimension': daxes.sample
         },
     }
     return dict_update(inherited, specific)
コード例 #9
0
 def nc_attributes(self):
     """Merge the inherited attributes with trajectory metadata.

     :return: dict of global and per-variable netCDF attributes
     """
     inherited = super(IncompleteMultidimensionalTrajectory, self).nc_attributes()
     traj_atts = {
         'global': {
             'featureType': 'trajectory',
             'cdm_data_type': 'Trajectory'
         },
         'trajectory': {
             'cf_role': 'trajectory_id',
             'long_name': 'trajectory identifier'
         },
         'distance': {
             'long_name': 'Great circle distance between trajectory points',
             'standard_name': 'distance_between_trajectory_points',
             'units': 'm'
         },
     }
     return dict_update(inherited, traj_atts)
コード例 #10
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF incomplete multidimensional profile.

        :param df: one row per observation; must contain the axis columns
            (profile, x, y, z, t) named by ``kwargs['axes']``
        :param output: path of the netCDF file to create
        :param kwargs: supports ``axes``, ``unlimited`` (unlimited profile
            dimension), ``unique_dims`` (suffix dimension names with ``_dim``)
            and ``attributes``; leftovers go to the constructor on return
        :return: an open IncompleteMultidimensionalProfile instance
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Dimension names default to the coordinate variable names
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not support in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            profile_group = df.groupby(axes.profile)

            if unlimited is True:
                max_profiles = None
            else:
                max_profiles = df[axes.profile].unique().size
            nc.createDimension(daxes.profile, max_profiles)

            # The z dimension is sized to the longest profile; shorter
            # profiles are padded with the fill value below
            max_zs = profile_group.size().max()
            nc.createDimension(daxes.z, max_zs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', (daxes.profile, ))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         (daxes.profile, ))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          (daxes.profile, ))
            z = nc.createVariable(
                axes.z,
                get_dtype(df[axes.z]), (daxes.profile, daxes.z),
                fill_value=df[axes.z].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 (daxes.profile, daxes.z),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })

            # Write values for each profile within profile_group
            for i, (uid, pdf) in enumerate(profile_group):
                profile[i] = uid

                # Profile-level coordinates come from the group's first row
                time[i] = date2num(pdf[axes.t].iloc[0],
                                   units=cls.default_time_unit)
                latitude[i] = pdf[axes.y].iloc[0]
                longitude[i] = pdf[axes.x].iloc[0]

                zvalues = pdf[axes.z].fillna(z._FillValue).values
                sl = slice(0, zvalues.size)
                z[i, sl] = zvalues

                for c in data_columns:
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(pdf[c], v)

                    sl = slice(0, vvalues.size)
                    v[i, sl] = vvalues

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)
コード例 #11
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as an orthogonal multidimensional timeseries.

        Assumes every station group has the same number of rows and identical
        times (orthogonal time axis).

        :param df: DataFrame with reserved columns station/t/x/y/z plus data
        :param output: path of the netCDF file to create
        :param kwargs: ``attributes`` dict is merged into the generated
            attributes; everything else is passed to the constructor on return
        :return: an open OrthogonalMultidimensionalTimeseries instance
        """
        reserved_columns = ['station', 't', 'x', 'y', 'z']
        data_columns = [ d for d in df.columns if d not in reserved_columns ]

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby('station')
            num_stations = len(station_group)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf.t

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension('time', t.size)
            nc.createDimension('station', num_stations)
            station = nc.createVariable('station', get_dtype(df.station), ('station',))

            time = nc.createVariable('time', 'f8', ('time',))
            latitude = nc.createVariable('latitude', get_dtype(df.y), ('station',))
            longitude = nc.createVariable('longitude', get_dtype(df.x), ('station',))
            z = nc.createVariable('z', get_dtype(df.z), ('station',), fill_value=df.z.dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(), kwargs.pop('attributes', {}))

            logger.info(df.t.values.dtype)
            time[:] = nc4.date2num(t.tolist(), units=cls.default_time_unit)

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf.y.iloc[0]
                longitude[i] = sdf.x.iloc[0]

                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf.z.iloc[0]

                for c in data_columns:

                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        if var_name not in attributes:
                            attributes[var_name] = {}
                        if sdf[c].dtype == np.dtype('datetime64[ns]'):
                            fv = np.dtype('f8').type(cls.default_fill_value)
                            v = nc.createVariable(var_name, 'f8', ('station', 'time',), fill_value=fv)
                            attributes[var_name] = dict_update(attributes[var_name], {
                                'units': cls.default_time_unit
                            })
                        elif np.issubdtype(sdf[c].dtype, 'S') or sdf[c].dtype == object:
                            # AttributeError: cannot set _FillValue attribute for VLEN or compound variable
                            v = nc.createVariable(var_name, get_dtype(sdf[c]), ('station', 'time',))
                        else:
                            v = nc.createVariable(var_name, get_dtype(sdf[c]), ('station', 'time',), fill_value=sdf[c].dtype.type(cls.default_fill_value))

                        attributes[var_name] = dict_update(attributes[var_name], {
                            'coordinates' : 'time latitude longitude z',
                        })
                    else:
                        v = nc.variables[var_name]

                    if sdf[c].dtype == np.dtype('datetime64[ns]'):
                        # BUG FIX: convert this station's datetime values here.
                        # Previously the converted values were only computed in
                        # the variable-creation branch above, so every station
                        # after the first was written with stale values from
                        # the first station.
                        tvalues = pd.Series(nc4.date2num(sdf[c].tolist(), units=cls.default_time_unit))
                        vvalues = tvalues.fillna(v._FillValue).values
                    elif hasattr(v, '_FillValue'):
                        vvalues = sdf[c].fillna(v._FillValue).values
                    else:
                        # Use an empty string... better than nothing!
                        vvalues = sdf[c].fillna('').values

                    try:
                        v[i, :] = vvalues
                    except BaseException:
                        logger.error('{} NOPE'.format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
コード例 #12
0
File: cr.py — Project: lizferguson5/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF contiguous ragged trajectory profile.

        :param df: one row per observation with trajectory, profile and axis
            columns named by ``kwargs['axes']``
        :param output: path of the netCDF file to create
        :param kwargs: ``axes`` and ``attributes`` are consumed here;
            ``reduce_dims`` and ``unlimited`` are accepted but ignored;
            leftovers go to the constructor on return
        :return: an open ContiguousRaggedTrajectoryProfile instance
        """
        axes = get_default_axes(kwargs.pop('axes', {}))

        # Accepted for interface compatibility but not used by this writer
        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(axes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(axes.profile, num_profiles)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
            profile[:] = np.array(unique_profiles)

            # Get unique obs by grouping on traj and profile and getting the max size
            num_obs = len(df)
            nc.createDimension(axes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4', (axes.profile,))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (axes.profile,))

            # Create all of the axis variables
            time = nc.createVariable(axes.t, 'f8', (axes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

            # NOTE(review): `j` restarts at 0 for every trajectory, so with
            # more than one trajectory the per-profile values written below
            # appear to overwrite earlier trajectories' entries instead of
            # advancing a global profile index — confirm intended behavior.
            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t], time)[0]
                    latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

            # Add back in the z axes that was removed when calculating data_columns
            data_columns = data_columns + [axes.z]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
コード例 #13
0
File: im.py — Project: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF incomplete multidimensional profile.

        :param df: one row per observation; must contain the axis columns
            (profile, x, y, z, t) named by ``kwargs['axes']``
        :param output: path of the netCDF file to create
        :param kwargs: supports ``axes``, ``unlimited`` (unlimited profile
            dimension) and ``attributes``; leftovers go to the constructor
            on return
        :return: an open IncompleteMultidimensionalProfile instance
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [ d for d in df.columns if d not in axes ]

        unlimited = kwargs.pop('unlimited', False)

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            profile_group = df.groupby(axes.profile)

            if unlimited is True:
                max_profiles = None
            else:
                max_profiles = df[axes.profile].unique().size
            nc.createDimension(axes.profile, max_profiles)

            # The z dimension is sized to the longest profile; shorter
            # profiles are padded with the fill value below
            max_zs = profile_group.size().max()
            nc.createDimension(axes.z, max_zs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', (axes.profile,))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,))
            z = nc.createVariable(axes.z, get_dtype(df[axes.z]), (axes.profile, axes.z), fill_value=df[axes.z].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

            for i, (uid, pdf) in enumerate(profile_group):
                profile[i] = uid

                # Profile-level coordinates come from the group's first row
                time[i] = nc4.date2num(pdf[axes.t].iloc[0], units=cls.default_time_unit)
                latitude[i] = pdf[axes.y].iloc[0]
                longitude[i] = pdf[axes.x].iloc[0]

                zvalues = pdf[axes.z].fillna(z._FillValue).values
                sl = slice(0, zvalues.size)
                z[i, sl] = zvalues
                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(
                            nc,
                            var_name,
                            (axes.profile, axes.z),
                            pdf[c],
                            zlib=True,
                            complevel=1
                        )
                        attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                            'coordinates' : '{} {} {} {}'.format(
                                axes.t, axes.z, axes.x, axes.y
                            )
                        })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(pdf[c], v)

                    sl = slice(0, vvalues.size)
                    v[i, sl] = vvalues

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)
コード例 #14
0
File: om.py — Project: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as an orthogonal multidimensional timeseries.

        Assumes all station groups are the same size and share one identical
        time axis.

        :param df: data with station/t/x/y/z columns named by ``kwargs['axes']``
        :param output: path of the netCDF file to create
        :param kwargs: ``axes`` and ``attributes`` are consumed; leftovers go
            to the constructor on return
        :return: an open OrthogonalMultidimensionalTimeseries instance
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(axes.t, t.size)
            nc.createDimension(axes.station, num_stations)
            # NOTE(review): uses `df.station` (attribute access) rather than
            # `df[axes.station]`, so this only works when the station column
            # is literally named 'station' — confirm intended.
            station = nc.createVariable(axes.station, get_dtype(df.station),
                                        (axes.station, ))

            time = nc.createVariable(axes.t, 'f8', (axes.t, ))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         (axes.station, ))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          (axes.station, ))
            z = nc.createVariable(axes.z,
                                  get_dtype(df[axes.z]), (axes.station, ),
                                  fill_value=df[axes.z].dtype.type(
                                      cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            # tolist() converts to a python datetime object without timezone and has NaTs.
            g = t.tolist()
            # date2num convers NaTs to np.nan
            gg = nc4.date2num(g, units=cls.default_time_unit)
            # masked_invalid moves np.nan to a masked value
            time[:] = np.ma.masked_invalid(gg)

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(nc,
                                                     var_name,
                                                     (axes.station, axes.t),
                                                     sdf[c],
                                                     zlib=True,
                                                     complevel=1)
                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates':
                                '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                     axes.y)
                            })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[i, :] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
コード例 #15
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as an orthogonal multidimensional timeseries.

        Assumes every station group has the same length and identical times.

        :param df: data with station/t/x/y(/z) columns named by ``kwargs['axes']``
        :param output: path of the netCDF file to create
        :param kwargs: supports ``axes``, ``reduce_dims`` (drop the station
            dimension when there is exactly one station), ``unique_dims``
            (suffix dimension names with ``_dim``) and ``attributes``;
            ``unlimited`` is accepted but ignored; leftovers go to the
            constructor on return
        :return: an open OrthogonalMultidimensionalTimeseries instance
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Dimension names default to the coordinate variable names
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not support in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)
            has_z = axes.z is not None

            if reduce_dims is True and num_stations == 1:
                # If a station, we can reduce that dimension if it is of size 1
                # ts(i) selects a row of a data variable for station i; with a
                # reduced station dimension every variable is 1-D over time
                def ts(i):
                    return np.s_[:]

                default_dimensions = (daxes.t, )
                station_dimensions = ()
            else:

                def ts(i):
                    return np.s_[i, :]

                default_dimensions = (daxes.station, daxes.t)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, num_stations)

            # Set the coordinates attribute correctly
            coordinates = [axes.t, axes.x, axes.y]
            if has_z is True:
                coordinates.insert(1, axes.z)
            coordinates = ' '.join(coordinates)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(daxes.t, t.size)
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            station = nc.createVariable(axes.station,
                                        get_dtype(df[axes.station]),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            if has_z is True:
                z = nc.createVariable(axes.z,
                                      get_dtype(df[axes.z]),
                                      station_dimensions,
                                      fill_value=df[axes.z].dtype.type(
                                          cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            time[:] = get_ncdata_from_series(t, time)

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}),
                        {'coordinates': coordinates})

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                if has_z is True:
                    # TODO: write a test for a Z with a _FillValue
                    z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[ts(i)] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
コード例 #16
0
ファイル: im.py プロジェクト: lucmehl/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF IncompleteMultidimensional trajectory file.

        Groups rows by the trajectory axis column and writes each group along a
        (trajectory, sample) grid (or just (sample,) when ``reduce_dims`` is
        True and there is exactly one trajectory). Returns a new
        IncompleteMultidimensionalTrajectory opened on *output*.

        kwargs consumed:
            axes (dict): overrides for the default axis names.
            reduce_dims (bool): drop the trajectory dimension for a single trajectory.
            unlimited (bool): make the sample dimension unlimited instead of max group size.
            attributes (dict): extra attributes merged over the computed defaults.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        # Everything that is not an axis column becomes a data variable
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            trajectory_group = df.groupby(axes.trajectory)

            # None creates an unlimited netCDF dimension
            if unlimited is True:
                max_obs = None
            else:
                max_obs = trajectory_group.size().max()
            nc.createDimension(axes.sample, max_obs)

            num_trajectories = len(trajectory_group)
            if reduce_dims is True and num_trajectories == 1:
                # If a singlular trajectory, we can reduce that dimension if it is of size 1
                # ts() returns the slice used to write one trajectory's values
                def ts(t_index, size):
                    return np.s_[0:size]

                default_dimensions = (axes.sample, )
                trajectory = nc.createVariable(axes.trajectory,
                                               get_dtype(df[axes.trajectory]))
            else:

                def ts(t_index, size):
                    return np.s_[t_index, 0:size]

                default_dimensions = (axes.trajectory, axes.sample)
                nc.createDimension(axes.trajectory, num_trajectories)
                trajectory = nc.createVariable(axes.trajectory,
                                               get_dtype(df[axes.trajectory]),
                                               (axes.trajectory, ))

            # Create all of the variables
            time = nc.createVariable(axes.t,
                                     'f8',
                                     default_dimensions,
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            z = nc.createVariable(axes.z,
                                  get_dtype(df[axes.z]),
                                  default_dimensions,
                                  fill_value=df[axes.z].dtype.type(
                                      cls.default_fill_value))
            latitude = nc.createVariable(axes.y,
                                         get_dtype(df[axes.y]),
                                         default_dimensions,
                                         fill_value=df[axes.y].dtype.type(
                                             cls.default_fill_value))
            longitude = nc.createVariable(axes.x,
                                          get_dtype(df[axes.x]),
                                          default_dimensions,
                                          fill_value=df[axes.x].dtype.type(
                                              cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            for i, (uid, gdf) in enumerate(trajectory_group):
                trajectory[i] = uid

                # tolist() converts to a python datetime object without timezone and has NaTs.
                g = gdf[axes.t].tolist()
                # date2num convers NaTs to np.nan
                gg = nc4.date2num(g, units=cls.default_time_unit)
                # masked_invalid moves np.nan to a masked value
                time[ts(i, gg.size)] = np.ma.masked_invalid(gg)

                lats = gdf[axes.y].fillna(get_fill_value(latitude)).values
                latitude[ts(i, lats.size)] = lats

                lons = gdf[axes.x].fillna(get_fill_value(longitude)).values
                longitude[ts(i, lons.size)] = lons

                zs = gdf[axes.z].fillna(get_fill_value(z)).values
                z[ts(i, zs.size)] = zs

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(nc,
                                                     var_name,
                                                     default_dimensions,
                                                     gdf[c],
                                                     zlib=True,
                                                     complevel=1)
                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates':
                                '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                     axes.y)
                            })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(gdf[c], v)
                    v[ts(i, vvalues.size)] = vvalues

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
コード例 #17
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF ContiguousRagged trajectory file.

        Each trajectory's observations are stored contiguously along a single
        sample dimension, with a ``rowSize`` variable recording how many
        observations belong to each trajectory. Returns a new
        ContiguousRaggedTrajectory opened on *output*.

        kwargs consumed:
            axes (dict): overrides for the default axis names.
            reduce_dims (bool): accepted but ignored for this file type.
            unlimited (bool): make the sample dimension unlimited.
            unique_dims (bool): suffix dimension names with ``_dim``.
            traj_vars (list): column names dimensioned by trajectory, not sample.
            attributes (dict): extra attributes merged over the computed defaults.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        # Should never be a CR file with one trajectory so we ignore the "reduce_dims" attribute
        _ = kwargs.pop('reduce_dims', False)  # noqa
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectory(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))

            # Get unique obs by grouping on traj getting the max size
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each trajectory
            row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory, ))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the trajectory axis
            traj_vars = kwargs.pop('traj_vars', [])
            traj_columns = [p for p in traj_vars if p in df.columns]
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.trajectory, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (trajid, trg) in enumerate(trajectory_groups):
                trajectory[i] = trajid
                row_size[i] = len(trg)

                # Save any trajectory variables using the first value found
                # in the column.
                for c in traj_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(trg[c], v)[0]
                    try:
                        v[i] = vvalues
                    except BaseException:
                        L.exception('Failed to add {}'.format(c))
                        continue

            # Add all of the columns based on the sample dimension. Take all columns and remove the
            # trajectory, rowSize and other trajectory based columns.
            sample_columns = [
                f for f in df.columns
                if f not in traj_columns + ['rowSize', axes.trajectory]
            ]
            for c in sample_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        # reshape is a no-op size check against the fixed dimension
                        v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectory(output, **kwargs)
コード例 #18
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as an IncompleteMultidimensional trajectory file.

        Legacy variant with fixed column names ('trajectory', 't', 'x', 'y',
        'z', 'distance'). Rows are grouped by 'trajectory' and written along a
        (trajectory, obs) grid padded to the largest group. Returns a new
        IncompleteMultidimensionalTrajectory opened on *output*.

        kwargs consumed:
            attributes (dict): extra attributes merged over the computed defaults.
        """
        reserved_columns = ['trajectory', 't', 'x', 'y', 'z', 'distance']
        data_columns = [ d for d in df.columns if d not in reserved_columns ]

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            trajectory_group = df.groupby('trajectory')
            max_obs = trajectory_group.size().max()

            unique_trajectories = df.trajectory.unique()
            nc.createDimension('trajectory', unique_trajectories.size)
            nc.createDimension('obs', max_obs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            trajectory = nc.createVariable('trajectory', get_dtype(df.trajectory), ('trajectory',))

            # Create all of the variables
            time = nc.createVariable('time', 'i4', ('trajectory', 'obs'), fill_value=int(cls.default_fill_value))
            z = nc.createVariable('z', get_dtype(df.z), ('trajectory', 'obs'), fill_value=df.z.dtype.type(cls.default_fill_value))
            latitude = nc.createVariable('latitude', get_dtype(df.y), ('trajectory', 'obs'), fill_value=df.y.dtype.type(cls.default_fill_value))
            longitude = nc.createVariable('longitude', get_dtype(df.x), ('trajectory', 'obs'), fill_value=df.x.dtype.type(cls.default_fill_value))
            if 'distance' in df:
                distance = nc.createVariable('distance', get_dtype(df.distance), ('trajectory', 'obs'), fill_value=df.distance.dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(), kwargs.pop('attributes', {}))

            for i, (uid, gdf) in enumerate(trajectory_group):
                trajectory[i] = uid

                # tolist() converts to a python datetime object without timezone
                g = gdf.t.fillna(999999).tolist()   # 999999 is a dummy value
                NaTs = gdf.t.isnull()
                timenums = np.ma.MaskedArray(nc4.date2num(g, units=cls.default_time_unit))
                timenums.mask = NaTs
                time[i, :] = timenums

                latitude[i, :] = gdf.y.fillna(latitude._FillValue).values
                longitude[i, :] = gdf.x.fillna(longitude._FillValue).values
                z[i, :] = gdf.z.fillna(z._FillValue).values
                if 'distance' in gdf:
                    distance[i, :] = gdf.distance.fillna(distance._FillValue).values

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        # Passing a string abstract type ('S') to np.issubdtype was
                        # deprecated (NumPy 1.14) and now raises; use dtype.kind instead.
                        if gdf[c].dtype.kind == 'S' or gdf[c].dtype == object:
                            # AttributeError: cannot set _FillValue attribute for VLEN or compound variable
                            v = nc.createVariable(var_name, get_dtype(gdf[c]), ('trajectory', 'obs'))
                        else:
                            v = nc.createVariable(var_name, get_dtype(gdf[c]), ('trajectory', 'obs'), fill_value=gdf[c].dtype.type(cls.default_fill_value))

                        attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                            'coordinates' : 'time latitude longitude z',
                        })
                    else:
                        v = nc.variables[var_name]

                    if hasattr(v, '_FillValue'):
                        vvalues = gdf[c].fillna(v._FillValue).values
                    else:
                        # Use an empty string... better than nothing!
                        vvalues = gdf[c].fillna('').values

                    sl = slice(0, vvalues.size)
                    v[i, sl] = vvalues

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
コード例 #19
0
ファイル: r.py プロジェクト: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF Ragged timeseries-profile file.

        Stations and profiles get their own dimensions; all observation data
        shares a single sample dimension, with ``rowSize`` recording the number
        of observations per profile and (unless reduced) ``stationIndex``
        mapping each profile back to its station. Returns a new
        RaggedTimeseriesProfile opened on *output*.

        kwargs consumed:
            axes (dict): overrides for the default axis names.
            reduce_dims (bool): drop the station dimension for a single station.
            unlimited (bool): make the sample dimension unlimited.
            unique_dims (bool): suffix dimension names with ``_dim``.
            attributes (dict): extra attributes merged over the computed defaults.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with RaggedTimeseriesProfile(output, 'w') as nc:

            station_groups = df.groupby(axes.station)
            unique_stations = list(station_groups.groups.keys())
            num_stations = len(unique_stations)

            # Calculate the max number of profiles
            profile_groups = df.groupby(axes.profile)
            unique_profiles = list(profile_groups.groups.keys())
            num_profiles = len(unique_profiles)
            nc.createDimension(daxes.profile, num_profiles)

            if reduce_dims is True and num_stations == 1:
                # If a singular station, remove the dimension
                station_dimensions = ()
                s_ind = None
            else:
                station_dimensions = (daxes.station,)
                nc.createDimension(daxes.station, num_stations)
                # The station this profile belongs to
                s_ind = nc.createVariable('stationIndex', 'i4', (daxes.profile,))

            station = nc.createVariable(axes.station, get_dtype(unique_stations), station_dimensions)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (daxes.profile,))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), station_dimensions)

            # Get unique obs by grouping on traj and profile and getting the max size
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile,))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            data_columns += [axes.t, axes.z]  # time isn't really special, its dimensioned by obs
            attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

            for i, (sname, srg) in enumerate(station_groups):
                station[i] = sname
                latitude[i] = df[axes.y][df[axes.station] == sname].dropna().iloc[0]
                longitude[i] = df[axes.x][df[axes.station] == sname].dropna().iloc[0]

            for j, (pname, pfg) in enumerate(profile_groups):
                profile[j] = pname
                row_size[j] = len(pfg)
                if s_ind is not None:
                    # np.asscalar was removed in NumPy 1.23; ndarray.item() is the
                    # documented replacement for extracting the single matching index.
                    s_ind[j] = np.argwhere(station[:] == pfg[axes.station].dropna().iloc[0]).item()

            # Add back in the z axes that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            skips = ['stationIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (daxes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return RaggedTimeseriesProfile(output, **kwargs)
コード例 #20
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write *df* to *output* as a CF IncompleteMultidimensional trajectory file.

        Modern variant: supports renamed dimensions (``unique_dims``), downcasts
        int64 columns and nativizes timezone-aware datetimes before writing.
        Rows are grouped by the trajectory axis and written along a
        (trajectory, sample) grid, or (sample,) when ``reduce_dims`` is True and
        there is exactly one trajectory. Returns a new
        IncompleteMultidimensionalTrajectory opened on *output*.

        kwargs consumed:
            axes (dict): overrides for the default axis names.
            reduce_dims (bool): drop the trajectory dimension for a single trajectory.
            unlimited (bool): make the sample dimension unlimited.
            unique_dims (bool): suffix dimension names with ``_dim``.
            attributes (dict): extra attributes merged over the computed defaults.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        # Everything that is not an axis column becomes a data variable
        data_columns = [ d for d in df.columns if d not in axes ]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = { k: '{}_dim'.format(v) for k, v in axes._asdict().items() }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with IncompleteMultidimensionalTrajectory(output, 'w') as nc:

            trajectory_group = df.groupby(axes.trajectory)

            # None creates an unlimited netCDF dimension
            if unlimited is True:
                max_obs = None
            else:
                max_obs = trajectory_group.size().max()
            nc.createDimension(daxes.sample, max_obs)

            num_trajectories = len(trajectory_group)
            if reduce_dims is True and num_trajectories == 1:
                # If a singlular trajectory, we can reduce that dimension if it is of size 1
                # ts() returns the slice used to write one trajectory's values
                def ts(t_index, size):
                    return np.s_[0:size]
                default_dimensions = (daxes.sample,)
                trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]))
            else:
                def ts(t_index, size):
                    return np.s_[t_index, 0:size]
                default_dimensions = (daxes.trajectory, daxes.sample)
                nc.createDimension(daxes.trajectory, num_trajectories)
                trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (daxes.trajectory,))

            # Create all of the variables
            time = nc.createVariable(axes.t, 'f8', default_dimensions, fill_value=np.dtype('f8').type(cls.default_fill_value))
            z = nc.createVariable(axes.z, get_dtype(df[axes.z]), default_dimensions, fill_value=df[axes.z].dtype.type(cls.default_fill_value))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), default_dimensions, fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), default_dimensions, fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes), kwargs.pop('attributes', {}))

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions,
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                    attributes[var_name] = dict_update(attributes.get(var_name, {}), {
                        'coordinates': '{} {} {} {}'.format(
                            axes.t, axes.z, axes.x, axes.y
                        )
                    })

            for i, (uid, gdf) in enumerate(trajectory_group):
                trajectory[i] = uid

                times = get_ncdata_from_series(gdf[axes.t], time)
                time[ts(i, times.size)] = times

                lats = get_ncdata_from_series(gdf[axes.y], latitude)
                latitude[ts(i, lats.size)] = lats

                lons = get_ncdata_from_series(gdf[axes.x], longitude)
                longitude[ts(i, lons.size)] = lons

                zs = gdf[axes.z].fillna(get_fill_value(z)).values
                z[ts(i, zs.size)] = zs

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(gdf[c], v)
                    slicer = ts(i, vvalues.size)
                    v[slicer] = vvalues

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalTrajectory(output, **kwargs)
コード例 #21
0
ファイル: nc.py プロジェクト: zeroYXX/GUTILS
def create_profile_netcdf(attrs, profile, output_path, mode, profile_id_type=ProfileIdTypes.EPOCH):
    """Write a single glider profile DataFrame to a netCDF file under *output_path*.

    The file is first written to a temp file, then moved into place so readers
    never see a partially-written file. Returns the final output file path.

    Args:
        attrs: attribute map; must contain 'glider', 'trajectory_date' and a
            'variables' mapping (only 'glider'/'trajectory_date'/'variables'
            usage is visible here).
        profile: pandas DataFrame with at least columns t, x, y, z and profile.
        output_path: directory for the final netCDF file.
        mode: mode string embedded in the filename (e.g. 'rt' / 'delayed' --
            exact values come from the caller).
        profile_id_type: how to derive the numeric profile id (EPOCH, COUNT
            or FRAME); anything else raises ValueError.
    """
    try:
        # Path to hold file while we create it
        tmp_handle, tmp_path = tempfile.mkstemp(suffix='.nc', prefix='gutils_glider_netcdf_')

        # First non-NaT timestamp identifies the profile
        profile_time = profile.t.dropna().iloc[0]

        if profile_id_type == ProfileIdTypes.EPOCH:
            # We are using the epoch as the profile_index!
            profile_index = calendar.timegm(profile_time.utctimetuple())
        # Figure out which profile index to use (epoch or integer)
        elif profile_id_type == ProfileIdTypes.COUNT:
            # Get all existing netCDF outputs and find out the index of this netCDF file. That
            # will be the profile_id of this file. This is effectively keeping a tally of netCDF
            # files that have been created and only works if NETCDF FILES ARE WRITTEN IN
            # ASCENDING ORDER.
            # There is a race condition here if files are being in parallel and one should be
            # sure that when this function is being run there can be no more files writtten.
            # This file being written is the last profile available.
            netcdf_files_same_mode = list(glob(
                os.path.join(
                    output_path,
                    '*_{}.nc'.format(mode)
                )
            ))
            profile_index = len(netcdf_files_same_mode)
        elif profile_id_type == ProfileIdTypes.FRAME:
            profile_index = profile.profile.iloc[0]
        else:
            raise ValueError('{} is not a valid profile type'.format(profile_id_type))

        # Create final filename
        filename = "{0}_{1:010d}_{2:%Y%m%dT%H%M%S}Z_{3}.nc".format(
            attrs['glider'],
            profile_index,
            profile_time,
            mode
        )
        output_file = os.path.join(output_path, filename)

        # Add in the trajectory dimension to make pocean happy
        traj_name = '{}-{}'.format(
            attrs['glider'],
            attrs['trajectory_date']
        )
        profile = profile.assign(trajectory=traj_name)

        # We add this back in later
        profile.drop('profile', axis=1, inplace=True)

        # Compute U/V scalar values
        uv_txy = get_uv_data(profile)
        if 'u_orig' in profile.columns and 'v_orig' in profile.columns:
            profile.drop(['u_orig', 'v_orig'], axis=1, inplace=True)

        # Compute profile scalar values
        profile_txy = get_profile_data(profile, method=None)

        # Calculate some geographic global attributes
        attrs = dict_update(attrs, get_geographic_attributes(profile))
        # Calculate some vertical global attributes
        attrs = dict_update(attrs, get_vertical_attributes(profile))
        # Calculate some temporal global attributes
        attrs = dict_update(attrs, get_temporal_attributes(profile))
        # Set the creation dates and history
        attrs = dict_update(attrs, get_creation_attributes(profile))

        # Changing column names here from the default 't z x y'
        axes = {
            't': 'time',
            'z': 'depth',
            'x': 'lon',
            'y': 'lat',
            'sample': 'time'
        }
        profile = profile.rename(columns=axes)

        # Use pocean to create NetCDF file
        with IncompleteMultidimensionalTrajectory.from_dataframe(
                profile,
                tmp_path,
                axes=axes,
                reduce_dims=True,
                mode='a') as ncd:

            # We only want to apply metadata from the `attrs` map if the variable is already in
            # the netCDF file or it is a scalar variable (no shape defined). This avoids
            # creating measured variables that were not measured in this profile.
            prof_attrs = attrs.copy()

            vars_to_update = OrderedDict()
            for vname, vobj in prof_attrs['variables'].items():
                if vname in ncd.variables or ('shape' not in vobj and 'type' in vobj):
                    if 'shape' in vobj:
                        # Assign coordinates
                        vobj['attributes']['coordinates'] = '{} {} {} {}'.format(
                            axes.get('t'),
                            axes.get('z'),
                            axes.get('x'),
                            axes.get('y'),
                        )
                    vars_to_update[vname] = vobj
                else:
                    # L.debug("Skipping missing variable: {}".format(vname))
                    pass

            prof_attrs['variables'] = vars_to_update
            ncd.apply_meta(prof_attrs)

            # Set trajectory value
            ncd.id = traj_name
            ncd.variables['trajectory'][0] = traj_name

            # Set profile_* data
            set_profile_data(ncd, profile_txy, profile_index)

            # Set *_uv data
            set_uv_data(ncd, uv_txy)

        # Move to final destination
        safe_makedirs(os.path.dirname(output_file))
        os.chmod(tmp_path, 0o664)
        shutil.move(tmp_path, output_file)
        L.info('Created: {}'.format(output_file))
        return output_file
    # NOTE(review): this except clause is a no-op passthrough kept only so the
    # finally block below always cleans up the temp file on failure.
    except BaseException:
        raise
    finally:
        os.close(tmp_handle)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
コード例 #22
0
ファイル: cr.py プロジェクト: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not support in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(daxes.profile, num_profiles)
            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))
            profile[:] = np.array(unique_profiles)

            # Get unique obs by grouping on traj and profile and getting the max size
            num_obs = len(df)
            nc.createDimension(daxes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4',
                                      (daxes.profile, ))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile, ))

            # Create all of the axis variables
            time = nc.createVariable(axes.t,
                                     'f8', (daxes.profile, ),
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            latitude = nc.createVariable(
                axes.y,
                get_dtype(df[axes.y]), (daxes.profile, ),
                fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(
                axes.x,
                get_dtype(df[axes.x]), (daxes.profile, ),
                fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [d for d in df.columns if d not in axes]
            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the profile axis
            profile_vars = kwargs.pop('profile_vars', [])
            profile_columns = [p for p in profile_vars if p in data_columns]
            for c in profile_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.profile, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t],
                                                     time).astype('f8')[0]
                    latitude[j] = get_ncdata_from_series(
                        pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(
                        pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

                    # Save any profile variables on the "profile" index using the first value found
                    # in the column.
                    for c in profile_columns:
                        var_name = cf_safe_name(c)
                        if var_name not in nc.variables:
                            continue
                        v = nc.variables[var_name]
                        vvalues = get_ncdata_from_series(pfg[c], v)[0]
                        try:
                            v[j] = vvalues
                        except BaseException:
                            L.exception('Failed to add {}'.format(c))
                            continue

            # Add back in the z axes that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            sample_columns = [
                f for f in data_columns + [axes.z] if f not in profile_columns
            ]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [d for d in sample_columns if d not in skips]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
コード例 #23
0
ファイル: om.py プロジェクト: pyoceans/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to ``output`` as an OrthogonalMultidimensionalTimeseriesProfile.

        The frame is re-indexed onto the Cartesian product of its unique
        (time, z, station) index values so each data column can be written
        with a single reshape instead of per-group iteration.

        kwargs consumed here: ``axes``, ``reduce_dims``, ``unlimited``,
        ``unique_dims``, ``attributes``, ``detach_z``; everything left over
        is forwarded to the returned class constructor.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes  # dimension names; may diverge from coordinate names below
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        # Make a new index that is the Cartesian product of all of the values from all of the
        # values of the old index. This is so don't have to iterate over anything. The full column
        # of data will be able to be shaped to the size of the final unique sized dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # If a singular station, we can reduce that dimension if it is of size 1
                default_dimensions = (daxes.t, daxes.z)
                station_dimensions = ()
            else:
                default_dimensions = (daxes.t, daxes.z, daxes.station)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign over loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(daxes.t, None)
            else:
                nc.createDimension(daxes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            time[:] = date2num(unique_t,
                               units=cls.default_time_unit).astype('f8')

            nc.createDimension(daxes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the time axis and not the depth axis
            # NOTE(review): 'columnms' is a typo for 'columns' (local name only)
            detach_z_vars = kwargs.pop('detach_z', [])
            detach_z_columnms = [p for p in detach_z_vars if p in data_columns]
            for c in detach_z_columnms:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions[
                            0::2],  # this removes the second dimension (z)
                        df[c],
                        zlib=True,
                        complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {}'.format(axes.t, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                # Because we need access to the fillvalues here, we ask not to return
                # the values with them already filled.
                vvalues = get_ncdata_from_series(df[c], v, fillna=False)
                # Reshape to the full array, with Z
                vvalues = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)
                # The Z axis is always the second axis, take the mean over that axis
                # (nanmean returns NaN and warns for all-NaN profiles)
                vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()
                # Now reshape to the array without Z
                vvalues = vvalues.reshape(len(unique_t), unique_s.size)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Write each remaining data column over the full (t, z, station) grid
            full_columns = [
                f for f in data_columns if f not in detach_z_columnms
            ]
            for c in full_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                v[:] = vvalues.reshape(v.shape)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
コード例 #24
0
    def from_dataframe(cls, df, output, **kwargs):
        """Write ``df`` to ``output`` as an OrthogonalMultidimensionalTimeseriesProfile.

        Older variant of the writer: dimension names are identical to the
        coordinate names (no ``unique_dims`` support), times are not
        nativized first, and there is no ``detach_z`` handling.

        kwargs consumed here: ``axes``, ``reduce_dims``, ``unlimited``,
        ``attributes``; everything left over is forwarded to the returned
        class constructor.
        """
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        # Downcast anything from int64 to int32
        df = downcast_dataframe(df)

        # Make a new index that is the Cartesian product of all of the values from all of the
        # values of the old index. This is so don't have to iterate over anything. The full column
        # of data will be able to be shaped to the size of the final unique sized dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # If a singular station, we can reduce that dimension if it is of size 1.
                # ts() returns the slice tuple matching the dimensionality chosen here.
                def ts():
                    return np.s_[:, :]

                default_dimensions = (axes.t, axes.z)
                station_dimensions = ()
            else:

                def ts():
                    return np.s_[:, :, :]

                default_dimensions = (axes.t, axes.z, axes.station)
                station_dimensions = (axes.station, )
                nc.createDimension(axes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign over loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(axes.t, None)
            else:
                nc.createDimension(axes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (axes.t, ))
            time[:] = nc4.date2num(unique_t, units=cls.default_time_unit)

            nc.createDimension(axes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (axes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            for c in data_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                # NOTE(review): when dims were reduced the reshaped array is 3-D
                # (trailing station axis of size 1) while ts() is a 2-D slice —
                # this relies on the netCDF variable squeezing the extra axis
                # on assignment; confirm with a reduce_dims=True round-trip.
                v[ts()] = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
コード例 #25
0
ファイル: im.py プロジェクト: joefutrelle/pocean-core
    def from_dataframe(cls, df, output, **kwargs):
        """Serialize ``df`` into ``output`` as an IncompleteMultidimensionalProfile.

        Observations are laid out on ('profile', 'z') dimensions, with the z
        dimension sized to the longest profile and shorter profiles padded
        with fill values. Remaining kwargs are forwarded to the returned
        class constructor.
        """
        # Axis/identifier columns are written explicitly, not as data variables
        reserved_columns = [
            'trajectory', 'profile', 't', 'x', 'y', 'z', 'distance'
        ]
        data_columns = [col for col in df.columns if col not in reserved_columns]

        with IncompleteMultidimensionalProfile(output, 'w') as nc:

            profile_group = df.groupby('profile')
            # The z dimension must accommodate the longest profile
            max_zs = profile_group.size().max()

            unique_profiles = df.profile.unique()
            nc.createDimension('profile', unique_profiles.size)
            nc.createDimension('z', max_zs)

            # Metadata variables
            nc.createVariable('crs', 'i4')

            profile = nc.createVariable('profile', get_dtype(df.profile),
                                        ('profile', ))

            # Per-profile axis variables
            time = nc.createVariable('time', 'i4', ('profile', ))
            latitude = nc.createVariable('latitude', get_dtype(df.y),
                                         ('profile', ))
            longitude = nc.createVariable('longitude', get_dtype(df.x),
                                          ('profile', ))
            if 'distance' in df:
                distance = nc.createVariable('distance',
                                             get_dtype(df.distance),
                                             ('profile', ))
            z = nc.createVariable('z',
                                  get_dtype(df.z), ('profile', 'z'),
                                  fill_value=df.z.dtype.type(
                                      cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(),
                                     kwargs.pop('attributes', {}))

            for pidx, (uid, pgroup) in enumerate(profile_group):
                profile[pidx] = uid

                # Scalar per-profile values come from the group's first row
                time[pidx] = nc4.date2num(pgroup.t.iloc[0],
                                          units=cls.default_time_unit)
                latitude[pidx] = pgroup.y.iloc[0]
                longitude[pidx] = pgroup.x.iloc[0]
                if 'distance' in pgroup:
                    distance[pidx] = pgroup.distance.iloc[0]

                zvals = pgroup.z.fillna(z._FillValue).values
                z[pidx, 0:zvals.size] = zvals

                for column in data_columns:
                    var_name = cf_safe_name(column)
                    if var_name in nc.variables:
                        ncvar = nc.variables[var_name]
                    else:
                        col_dtype = pgroup[column].dtype
                        if np.issubdtype(col_dtype, 'S') or col_dtype == object:
                            # Cannot set a _FillValue attribute on VLEN or compound variables
                            ncvar = nc.createVariable(var_name,
                                                      get_dtype(pgroup[column]),
                                                      ('profile', 'z'))
                        else:
                            ncvar = nc.createVariable(
                                var_name,
                                get_dtype(pgroup[column]), ('profile', 'z'),
                                fill_value=col_dtype.type(
                                    cls.default_fill_value))

                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates': 'time latitude longitude z',
                            })

                    if hasattr(ncvar, '_FillValue'):
                        col_values = pgroup[column].fillna(
                            ncvar._FillValue).values
                    else:
                        # Use an empty string... better than nothing!
                        col_values = pgroup[column].fillna('').values

                    ncvar[pidx, 0:col_values.size] = col_values

            # Set global attributes
            nc.update_attributes(attributes)

        return IncompleteMultidimensionalProfile(output, **kwargs)