Example #1
def test_is_mine(klass, fp):
    with CFDataset.load(fp) as dsg:
        assert dsg.__class__ == klass

    allsubs = list(all_subclasses(CFDataset))
    subs = [s for s in allsubs if s != klass]
    with CFDataset(fp) as dsg:
        logger.debug('\nTesting {}'.format(klass.__name__))
        assert klass.is_mine(dsg) is True
        for s in subs:
            if hasattr(s, 'is_mine'):
                logger.debug('  * Trying {}...'.format(s.__name__))
                assert s.is_mine(dsg) is False
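This test asserts that exactly one CFDataset subclass claims a given file. A possible implementation of the all_subclasses() helper it relies on is sketched below; this is an assumption, since the original helper is not shown in the snippet.

def all_subclasses(cls):
    # Walk the subclass tree recursively, yielding every direct and
    # indirect subclass of cls (assumed helper, not the original code)
    for sub in cls.__subclasses__():
        yield sub
        yield from all_subclasses(sub)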
Example #2
    @classmethod
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        data_columns = [d for d in df.columns if d not in axes]

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(axes.t, t.size)
            nc.createDimension(axes.station, num_stations)
            station = nc.createVariable(axes.station,
                                        get_dtype(df[axes.station]),
                                        (axes.station, ))

            time = nc.createVariable(axes.t, 'f8', (axes.t, ))
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         (axes.station, ))
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          (axes.station, ))
            z = nc.createVariable(axes.z,
                                  get_dtype(df[axes.z]), (axes.station, ),
                                  fill_value=df[axes.z].dtype.type(
                                      cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes),
                                     kwargs.pop('attributes', {}))

            # tolist() converts to python datetime objects without a
            # timezone, preserving NaTs
            g = t.tolist()
            # date2num converts NaTs to np.nan
            gg = nc4.date2num(g, units=cls.default_time_unit)
            # masked_invalid converts np.nan to masked values
            time[:] = np.ma.masked_invalid(gg)

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                # TODO: write a test for a Z with a _FillValue
                z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    if var_name not in nc.variables:
                        v = create_ncvar_from_series(nc,
                                                     var_name,
                                                     (axes.station, axes.t),
                                                     sdf[c],
                                                     zlib=True,
                                                     complevel=1)
                        attributes[var_name] = dict_update(
                            attributes.get(var_name, {}), {
                                'coordinates':
                                '{} {} {} {}'.format(axes.t, axes.z, axes.x,
                                                     axes.y)
                            })
                    else:
                        v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[i, :] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
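A minimal usage sketch for this writer, assuming the default axis names ('t', 'x', 'y', 'z', 'station') and that the class is importable from a package such as pocean; the values and file name are illustrative only.

import pandas as pd
from pocean.dsg import OrthogonalMultidimensionalTimeseries  # assumed import path

# Two stations sharing an identical time axis, as the writer expects
times = pd.date_range('2020-01-01', periods=3, freq='h')
df = pd.DataFrame({
    't': list(times) * 2,
    'x': [-70.0] * 3 + [-71.0] * 3,
    'y': [40.0] * 3 + [41.0] * 3,
    'z': [0.0] * 6,
    'station': ['s1'] * 3 + ['s2'] * 3,
    'temperature': [10.1, 10.2, 10.3, 9.8, 9.9, 10.0],
})
OrthogonalMultidimensionalTimeseries.from_dataframe(df, 'om_timeseries.nc')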
Example #3
    @classmethod
    def from_dataframe(cls, df, output, **kwargs):
        axes = get_default_axes(kwargs.pop('axes', {}))
        daxes = axes
        data_columns = [d for d in df.columns if d not in axes]

        reduce_dims = kwargs.pop('reduce_dims', False)
        _ = kwargs.pop('unlimited', False)

        unique_dims = kwargs.pop('unique_dims', False)
        if unique_dims is True:
            # Rename the dimensions to avoid a dimension and a coordinate
            # sharing the same name, which is not supported in xarray
            changed_axes = {
                k: '{}_dim'.format(v)
                for k, v in axes._asdict().items()
            }
            daxes = get_default_axes(changed_axes)

        # Downcast anything from int64 to int32
        # Convert any timezone aware datetimes to native UTC times
        df = downcast_dataframe(nativize_times(df))

        with OrthogonalMultidimensionalTimeseries(output, 'w') as nc:

            station_group = df.groupby(axes.station)
            num_stations = len(station_group)
            has_z = axes.z is not None

            if reduce_dims is True and num_stations == 1:
                # With a single station, the station dimension of size 1 can be dropped
                def ts(i):
                    return np.s_[:]

                default_dimensions = (daxes.t, )
                station_dimensions = ()
            else:

                def ts(i):
                    return np.s_[i, :]

                default_dimensions = (daxes.station, daxes.t)
                station_dimensions = (daxes.station, )
                nc.createDimension(daxes.station, num_stations)

            # Set the coordinates attribute correctly
            coordinates = [axes.t, axes.x, axes.y]
            if has_z is True:
                coordinates.insert(1, axes.z)
            coordinates = ' '.join(coordinates)

            # assume all groups are the same size and have identical times
            _, sdf = list(station_group)[0]
            t = sdf[axes.t]

            # Metadata variables
            nc.createVariable('crs', 'i4')

            # Create all of the variables
            nc.createDimension(daxes.t, t.size)
            time = nc.createVariable(axes.t, 'f8', (daxes.t, ))
            station = nc.createVariable(axes.station,
                                        get_dtype(df[axes.station]),
                                        station_dimensions)
            latitude = nc.createVariable(axes.y, get_dtype(df[axes.y]),
                                         station_dimensions)
            longitude = nc.createVariable(axes.x, get_dtype(df[axes.x]),
                                          station_dimensions)
            if has_z is True:
                z = nc.createVariable(axes.z,
                                      get_dtype(df[axes.z]),
                                      station_dimensions,
                                      fill_value=df[axes.z].dtype.type(
                                          cls.default_fill_value))

            attributes = dict_update(nc.nc_attributes(axes, daxes),
                                     kwargs.pop('attributes', {}))

            time[:] = get_ncdata_from_series(t, time)

            # Create vars based on full dataframe (to get all variables)
            for c in data_columns:
                var_name = cf_safe_name(c)
                if var_name not in nc.variables:
                    v = create_ncvar_from_series(nc,
                                                 var_name,
                                                 default_dimensions,
                                                 df[c],
                                                 zlib=True,
                                                 complevel=1)
                    attributes[var_name] = dict_update(
                        attributes.get(var_name, {}),
                        {'coordinates': coordinates})

            for i, (uid, sdf) in enumerate(station_group):
                station[i] = uid
                latitude[i] = sdf[axes.y].iloc[0]
                longitude[i] = sdf[axes.x].iloc[0]

                if has_z is True:
                    # TODO: write a test for a Z with a _FillValue
                    z[i] = sdf[axes.z].iloc[0]

                for c in data_columns:
                    # Create variable if it doesn't exist
                    var_name = cf_safe_name(c)
                    v = nc.variables[var_name]

                    vvalues = get_ncdata_from_series(sdf[c], v)
                    try:
                        v[ts(i)] = vvalues
                    except BaseException:
                        L.debug(
                            '{} was not written. Likely a metadata variable'.
                            format(v.name))

            # Set global attributes
            nc.update_attributes(attributes)

        return OrthogonalMultidimensionalTimeseries(output, **kwargs)
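This revision adds keyword switches that the first version lacks. Below is a brief sketch of how they might be passed; the keyword names are taken from the snippet itself, and the call is otherwise illustrative.

OrthogonalMultidimensionalTimeseries.from_dataframe(
    df, 'om_timeseries.nc',
    reduce_dims=True,   # collapse the station dimension for single-station data
    unique_dims=True,   # rename dimensions with a '_dim' suffix for xarray
)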
Example #4
    def to_dataframe(self, clean_cols=True, clean_rows=True):

        zvar = self.z_axes()[0]
        zs = len(self.dimensions[zvar.dimensions[0]])

        # Profiles
        pvar = self.filter_by_attrs(cf_role='profile_id')[0]
        try:
            p = normalize_array(pvar)
        except ValueError:
            p = np.asarray(list(range(len(pvar))), dtype=int)
        ps = p.size
        p = p.repeat(zs)
        logger.debug(['profile data size: ', p.size])

        # Z
        z = generic_masked(zvar[:], attrs=self.vatts(zvar.name)).round(5)
        try:
            z = np.tile(z, ps)
        except ValueError:
            z = z.flatten()
        logger.debug(['z data size: ', z.size])

        # T
        tvar = self.t_axes()[0]
        t = nc4.num2date(tvar[:], tvar.units,
                         getattr(tvar, 'calendar', 'standard'))
        if isinstance(t, datetime):
            # Size one
            t = np.array([t.isoformat()], dtype='datetime64')
        t = t.repeat(zs)
        logger.debug(['time data size: ', t.size])

        # X
        xvar = self.x_axes()[0]
        x = generic_masked(xvar[:].repeat(zs),
                           attrs=self.vatts(xvar.name)).round(5)
        logger.debug(['x data size: ', x.size])

        # Y
        yvar = self.y_axes()[0]
        y = generic_masked(yvar[:].repeat(zs),
                           attrs=self.vatts(yvar.name)).round(5)
        logger.debug(['y data size: ', y.size])

        # Distance
        d = np.ma.zeros(y.size, dtype=np.float64)
        d[1:] = great_distance(start_latitude=y[0:-1],
                               end_latitude=y[1:],
                               start_longitude=x[0:-1],
                               end_longitude=x[1:])['distance']
        d = generic_masked(np.cumsum(d), minv=0).round(2)
        logger.debug(['distance data size: ', d.size])

        df_data = {'t': t, 'x': x, 'y': y, 'z': z, 'profile': p, 'distance': d}

        building_index_to_drop = np.ones(t.size, dtype=bool)
        extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
        for i, dvar in enumerate(extract_vars):
            vdata = np.ma.fix_invalid(
                np.ma.MaskedArray(dvar[:].round(3).flatten()))
            building_index_to_drop = (building_index_to_drop == True) & (
                vdata.mask == True)  # noqa
            df_data[dvar.name] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
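The cumulative-distance step above depends on great_distance(), an external geodesic helper (pygc provides one such function). Here is a self-contained sketch of the same pattern using a plain haversine formula; it is an approximation for illustration, not the original geodesic calculation.

import numpy as np

def haversine_m(lat1, lat2, lon1, lon2):
    # Great-circle distance in meters on a spherical Earth (assumed helper)
    r = 6371000.0
    p1, p2 = np.radians(lat1), np.radians(lat2)
    dl = np.radians(lon2) - np.radians(lon1)
    a = np.sin((p2 - p1) / 2) ** 2 + np.cos(p1) * np.cos(p2) * np.sin(dl / 2) ** 2
    return 2 * r * np.arcsin(np.sqrt(a))

y = np.array([40.0, 40.1, 40.2])
x = np.array([-70.0, -70.1, -70.2])
d = np.zeros(y.size)
d[1:] = haversine_m(y[:-1], y[1:], x[:-1], x[1:])
d = np.cumsum(d).round(2)  # running distance along the path, as in the snippet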
Example #5
    def to_dataframe(self, clean_cols=True, clean_rows=True):
        # Z
        zvar = self.z_axes()[0]
        z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
        z = z.flatten().round(5)
        logger.debug(['z data size: ', z.size])

        # T
        tvar = self.t_axes()[0]
        t = np.ma.MaskedArray(nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))).flatten()
        # Patch the time variable back to its original mask, since num2date
        # breaks any missing/fill values
        if hasattr(tvar[0], 'mask'):
            t.mask = tvar[:].mask
        logger.debug(['time data size: ', t.size])

        # X
        xvar = self.x_axes()[0]
        x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
        logger.debug(['x data size: ', x.size])

        # Y
        yvar = self.y_axes()[0]
        y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
        logger.debug(['y data size: ', y.size])

        # Trajectories
        pvar = self.filter_by_attrs(cf_role='trajectory_id')[0]

        try:
            p = normalize_array(pvar)
        except BaseException:
            logger.exception('Could not pull trajectory values from the variable, using indexes.')
            p = np.asarray(list(range(len(pvar))), dtype=int)

        # The dimension that the time variable has but the trajectory id
        # variable lacks is the one the trajectory ids are repeated along
        dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
        if dim_diff:
            p = p.repeat(dim_diff.size)
        logger.debug(['trajectory data size: ', p.size])

        # Distance
        d = np.append([0], great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance'])
        d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
        logger.debug(['distance data size: ', d.size])

        df_data = {
            't': t,
            'x': x,
            'y': y,
            'z': z,
            'trajectory': p,
            'distance': d
        }

        building_index_to_drop = np.ones(t.size, dtype=bool)
        extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
        for i, dvar in enumerate(extract_vars):
            vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
            building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
            df_data[dvar.name] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
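The repeat factor for the trajectory ids is the size of the dimension that the time variable has and the id variable lacks. A minimal numpy illustration of that repeat follows; the names and sizes here are assumptions.

import numpy as np

p = np.array(['traj0', 'traj1'])   # one id per trajectory
obs_per_traj = 3                   # size of the extra 'obs' dimension
p = p.repeat(obs_per_traj)         # one id per observation row
print(p)  # ['traj0' 'traj0' 'traj0' 'traj1' 'traj1' 'traj1']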
Example #6
    def to_dataframe(self, clean_cols=False, clean_rows=False):

        # Don't pass the attributes around; store them in the class

        # T
        tvar = self.t_axes()[0]
        t = nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))
        if isinstance(t, datetime):
            # Size one
            t = np.array([t.isoformat()], dtype='datetime64')
        logger.debug(['time data size: ', t.size])

        # Stations
        # TODO: Make sure there is a test for a file with multiple time variables
        svar = self.filter_by_attrs(cf_role='timeseries_id')[0]
        try:
            s = normalize_array(svar)
        except ValueError:
            s = np.asarray(list(range(len(svar))), dtype=int)
        s = np.repeat(s, t.size)
        logger.debug(['station data size: ', s.size])

        # X
        xvar = self.x_axes()[0]
        x = generic_masked(xvar[:].repeat(t.size), attrs=self.vatts(xvar.name)).round(5)
        logger.debug(['x data size: ', x.size])

        # Y
        yvar = self.y_axes()[0]
        y = generic_masked(yvar[:].repeat(t.size), attrs=self.vatts(yvar.name)).round(5)
        logger.debug(['y data size: ', y.size])

        # Z
        zvar = self.z_axes()[0]
        z = generic_masked(zvar[:].repeat(t.size), attrs=self.vatts(zvar.name))
        logger.debug(['z data size: ', z.size])

        # Repeat t per station. Figure out whether this is a single-station
        # file by checking the dimensions of the Z variable.
        if zvar.ndim == 1:
            t = np.repeat(t, len(svar))

        df_data = {
            't': t,
            'x': x,
            'y': y,
            'z': z,
            'station': s,
        }

        #building_index_to_drop = np.ones(t.size, dtype=bool)
        extract_vars = copy(self.variables)
        del extract_vars[svar.name]
        del extract_vars[xvar.name]
        del extract_vars[yvar.name]
        del extract_vars[zvar.name]
        del extract_vars[tvar.name]

        for i, (dnam, dvar) in enumerate(extract_vars.items()):
            if dvar[:].flatten().size > t.size:
                logger.warning("Variable {} is not the correct size, skipping.".format(dnam))
                continue

            vdata = generic_masked(dvar[:].flatten(), attrs=self.vatts(dnam))
            if vdata.size == 1:
                vdata = vdata[0]
            #building_index_to_drop = (building_index_to_drop == True) & (vdata.mask == True)  # noqa
            try:
                if re.match(r'.* since .*', dvar.units):
                    vdata = nc4.num2date(vdata[:], dvar.units, getattr(dvar, 'calendar', 'standard'))
            except AttributeError:
                pass
            df_data[dnam] = vdata
            #logger.info('{} - {}'.format(dnam, vdata.shape))

        df = pd.DataFrame()
        for k, v in df_data.items():
            df[k] = v

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        #if clean_rows:
        #    df = df.iloc[~building_index_to_drop]

        return df
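A minimal usage sketch for this reader, assuming the method belongs to a CFDataset subclass such as OrthogonalMultidimensionalTimeseries and that 'ts.nc' is an existing CF-compliant timeseries file.

from pocean.dsg import OrthogonalMultidimensionalTimeseries  # assumed import path

with OrthogonalMultidimensionalTimeseries('ts.nc') as ncd:
    df = ncd.to_dataframe(clean_cols=True)
print(df.head())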