Exemple #1
0
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not support in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(daxes.profile, num_profiles)
            profile = nc.createVariable(axes.profile,
                                        get_dtype(df[axes.profile]),
                                        (daxes.profile, ))
            profile[:] = np.array(unique_profiles)

            # Get unique obs by grouping on traj and profile and getting the max size
            num_obs = len(df)
            nc.createDimension(daxes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4',
                                      (daxes.profile, ))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (daxes.profile, ))

            # Create all of the axis variables
            time = nc.createVariable(axes.t,
                                     'f8', (daxes.profile, ),
                                     fill_value=np.dtype('f8').type(
                                         cls.default_fill_value))
            latitude = nc.createVariable(
                axes.y,
                get_dtype(df[axes.y]), (daxes.profile, ),
                fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(
                axes.x,
                get_dtype(df[axes.x]), (daxes.profile, ),
                fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [d for d in df.columns if d not in axes]
            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the profile axis
            profile_vars = kwargs.pop('profile_vars', [])
            profile_columns = [p for p in profile_vars if p in data_columns]
            for c in profile_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.profile, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t],
                                                     time).astype('f8')[0]
                    latitude[j] = get_ncdata_from_series(
                        pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(
                        pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

                    # Save any profile variables on the "profile" index using the first value found
                    # in the column.
                    for c in profile_columns:
                        var_name = cf_safe_name(c)
                        if var_name not in nc.variables:
                            continue
                        v = nc.variables[var_name]
                        vvalues = get_ncdata_from_series(pfg[c], v)[0]
                        try:
                            v[j] = vvalues
                        except BaseException:
                            L.exception('Failed to add {}'.format(c))
                            continue

            # Add back in the z axes that was removed when calculating data_columns
            # and ignore variables that were stored in the profile index
            sample_columns = [
                f for f in data_columns + [axes.z] if f not in profile_columns
            ]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [d for d in sample_columns if d not in skips]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
Exemple #2
0
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))

        _ = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        with ContiguousRaggedTrajectoryProfile(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)

            nc.createDimension(axes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory, get_dtype(df[axes.trajectory]), (axes.trajectory,))
            trajectory[:] = np.array(unique_trajectories)

            # Calculate the max number of profiles
            unique_profiles = df[axes.profile].unique()
            num_profiles = len(unique_profiles)

            nc.createDimension(axes.profile, num_profiles)
            profile = nc.createVariable(axes.profile, get_dtype(df[axes.profile]), (axes.profile,))
            profile[:] = np.array(unique_profiles)

            # Get unique obs by grouping on traj and profile and getting the max size
            num_obs = len(df)
            nc.createDimension(axes.sample, num_obs)

            # The trajectory this profile belongs to
            t_ind = nc.createVariable('trajectoryIndex', 'i4', (axes.profile,))
            # Number of observations in each profile
            row_size = nc.createVariable('rowSize', 'i4', (axes.profile,))

            # Create all of the axis variables
            time = nc.createVariable(axes.t, 'f8', (axes.profile,), fill_value=np.dtype('f8').type(cls.default_fill_value))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]), (axes.profile,), fill_value=df[axes.y].dtype.type(cls.default_fill_value))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]), (axes.profile,), fill_value=df[axes.x].dtype.type(cls.default_fill_value))

            # Axes variables are already processed so skip them
            data_columns = [ d for d in df.columns if d not in axes ]
            attributes = dict_update(nc.nc_attributes(axes), kwargs.pop('attributes', {}))

            for i, (_, trg) in enumerate(trajectory_groups):
                for j, (_, pfg) in enumerate(trg.groupby(axes.profile)):
                    time[j] = get_ncdata_from_series(pfg[axes.t], time)[0]
                    latitude[j] = get_ncdata_from_series(pfg[axes.y], latitude)[0]
                    longitude[j] = get_ncdata_from_series(pfg[axes.x], longitude)[0]
                    row_size[j] = len(pfg)
                    t_ind[j] = i

            # Add back in the z axes that was removed when calculating data_columns
            data_columns = data_columns + [axes.z]
            skips = ['trajectoryIndex', 'rowSize']
            for c in [ d for d in data_columns if d not in skips ]:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        (axes.sample,),
                        df[c],
                        zlib=True,
                        complevel=1
                    )
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    v[:] = vvalues
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectoryProfile(output, **kwargs)
Exemple #3
0
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        # Make a new index that is the Cartesian product of all of the values from all of the
        # values of the old index. This is so don't have to iterate over anything. The full column
        # of data will be able to be shaped to the size of the final unique sized dimensions.
        index_order = [axes.t, axes.z, axes.station]
        df = df.set_index(index_order)
        df = df.reindex(
            pd.MultiIndex.from_product(df.index.levels, names=index_order))

        unique_z = df.index.get_level_values(axes.z).unique().values
        unique_t = df.index.get_level_values(
            axes.t).unique().tolist()  # tolist converts to Timestamp
        all_stations = df.index.get_level_values(axes.station)
        unique_s = all_stations.unique()

        with OrthogonalMultidimensionalTimeseriesProfile(output, 'w') as nc:

            if reduce_dims is True and unique_s.size == 1:
                # If a singular trajectory, we can reduce that dimension if it is of size 1
                default_dimensions = (daxes.t, daxes.z)
                station_dimensions = ()
            else:
                default_dimensions = (daxes.t, daxes.z, daxes.station)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, unique_s.size)

            station = nc.createVariable(axes.station, get_dtype(unique_s),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            # Assign over loop because VLEN variables (strings) have to be assigned by integer index
            # and we need to find the lat/lon based on station index
            for si, st in enumerate(unique_s):
                station[si] = st
                latitude[si] = df[axes.y][all_stations == st].dropna().iloc[0]
                longitude[si] = df[axes.x][all_stations == st].dropna().iloc[0]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            if unlimited is True:
                nc.createDimension(daxes.t, None)
            else:
                nc.createDimension(daxes.t, len(unique_t))
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            time[:] = date2num(unique_t,
                               units=cls.default_time_unit).astype('f8')

            nc.createDimension(daxes.z, unique_z.size)
            z = nc.createVariable(axes.z, get_dtype(unique_z), (daxes.z, ))
            z[:] = unique_z

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the time axis and not the depth axis
            detach_z_vars = kwargs.pop('detach_z', [])
            detach_z_columnms = [p for p in detach_z_vars if p in data_columns]
            for c in detach_z_columnms:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(
                        nc,
                        var_name,
                        default_dimensions[
                            0::2],  # this removes the second dimension (z)
                        df[c],
                        zlib=True,
                        complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {}'.format(axes.t, axes.x, axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                # Because we need access to the fillvalues here, we ask not to return
                # the values with them already filled.
                vvalues = get_ncdata_from_series(df[c], v, fillna=False)
                # Reshape to the full array, with Z
                vvalues = vvalues.reshape(len(unique_t), unique_z.size,
                                          unique_s.size)
                # The Z axis is always the second axis, take the mean over that axis
                vvalues = np.apply_along_axis(np.nanmean, 1, vvalues).flatten()
                # Now reshape to the array without Z
                vvalues = vvalues.reshape(len(unique_t), unique_s.size)
                try:
                    v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            full_columns = [
                f for f in data_columns if f not in detach_z_columnms
            ]
            for c in full_columns:
                # Create variable if it doesn't exist
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}), {
                            'coordinates':
                            '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                 axes.y)
                        })
                else:
                    v = nc.variables[var_name]

                vvalues = get_ncdata_from_series(df[c], v)
                v[:] = vvalues.reshape(v.shape)

            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseriesProfile(output, **kwargs)
Exemple #4
0
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes

        # Should never be a CR file with one trajectory so we ignore the "reduce_dims" attribute
        _ = kwargs.pop('reduce_dims', False)  # noqa
        unlimited = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimension to avoid a dimension and coordinate having the same name
            # which is not support in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with ContiguousRaggedTrajectory(output, 'w') as nc:

            trajectory_groups = df.groupby(axes.trajectory)
            unique_trajectories = list(trajectory_groups.groups.keys())
            num_trajectories = len(unique_trajectories)
            nc.createDimension(daxes.trajectory, num_trajectories)
            trajectory = nc.createVariable(axes.trajectory,
                                           get_dtype(df[axes.trajectory]),
                                           (daxes.trajectory, ))

            # Get unique obs by grouping on traj getting the max size
            if unlimited is True:
                nc.createDimension(daxes.sample, None)
            else:
                nc.createDimension(daxes.sample, len(df))

            # Number of observations in each trajectory
            row_size = nc.createVariable('rowSize', 'i4', (daxes.trajectory, ))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            # Variables defined on only the trajectory axis
            traj_vars = kwargs.pop('traj_vars', [])
            traj_columns = [p for p in traj_vars if p in df.columns]
            for c in traj_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    create_ncvar_from_series(nc,
                                             var_name, (daxes.trajectory, ),
                                             df[c],
                                             zlib=True,
                                             complevel=1)

            for i, (trajid, trg) in enumerate(trajectory_groups):
                trajectory[i] = trajid
                row_size[i] = len(trg)

                # Save any trajectory variables using the first value found
                # in the column.
                for c in traj_columns:
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        continue
                    v = nc.variables[var_name]
                    vvalues = get_ncdata_from_series(trg[c], v)[0]
                    try:
                        v[i] = vvalues
                    except BaseException:
                        L.exception('Failed to add {}'.format(c))
                        continue

            # Add all of the columns based on the sample dimension. Take all columns and remove the
            # trajectory, rowSize and other trajectory based columns.
            sample_columns = [
                f for f in df.columns
                if f not in traj_columns + ['rowSize', axes.trajectory]
            ]
            for c in sample_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name, (daxes.sample, ),
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                else:
                    v = nc.variables[var_name]
                vvalues = get_ncdata_from_series(df[c], v)
                try:
                    if unlimited is True:
                        v[:] = vvalues
                    else:
                        v[:] = vvalues.reshape(v.shape)
                except BaseException:
                    L.exception('Failed to add {}'.format(c))
                    continue

            # Metadata variables
            if 'crs' not in nc.variables:
                nc.createVariable('crs', 'i4')

            # Set attributes
            nc.update_attributes(attributes)

        return ContiguousRaggedTrajectory(output, **kwargs)
Exemple #5
0
    def to_dataframe(self, clean_cols=True, clean_rows=True):
        # Z
        zvar = self.z_axes()[0]
        z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
        z = z.flatten().round(5)
        logger.debug(['z data size: ', z.size])

        # T
        tvar = self.t_axes()[0]
        t = np.ma.MaskedArray(nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))).flatten()
        # Patch the time variable back to its original mask, since num2date
        # breaks any missing/fill values
        if hasattr(tvar[0], 'mask'):
            t.mask = tvar[:].mask
        logger.debug(['time data size: ', t.size])

        # X
        xvar = self.x_axes()[0]
        x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
        logger.debug(['x data size: ', x.size])

        # Y
        yvar = self.y_axes()[0]
        y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
        logger.debug(['y data size: ', y.size])

        # Trajectories
        pvar = self.filter_by_attrs(cf_role='trajectory_id')[0]

        try:
            p = normalize_array(pvar)
        except BaseException:
            logger.exception('Could not pull trajectory values from the variable, using indexes.')
            p = np.asarray(list(range(len(pvar))), dtype=np.integer)

        # The Dimension that the trajectory id variable doesn't have is what
        # the trajectory data needs to be repeated by
        dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
        if dim_diff:
            p = p.repeat(dim_diff.size)
        logger.debug(['trajectory data size: ', p.size])

        # Distance
        d = np.append([0], great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance'])
        d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
        logger.debug(['distance data size: ', d.size])

        df_data = {
            't': t,
            'x': x,
            'y': y,
            'z': z,
            'trajectory': p,
            'distance': d
        }

        building_index_to_drop = np.ones(t.size, dtype=bool)
        extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
        for i, dvar in enumerate(extract_vars):
            vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
            df_data[dvar.name] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df