Example #1
    def json_attributes(self, vfuncs=None):
        """
        vfuncs is an iterable of callables, each accepting a single
        argument (the Variable object) and returning a dictionary of new
        attributes to set. These will overwrite existing attributes.
        """

        vfuncs = vfuncs or []

        js = {'global': {}}

        for k in self.ncattrs():
            js['global'][k] = self.getncattr(k)

        for varname, var in self.variables.items():
            js[varname] = {}
            for k in var.ncattrs():
                z = var.getncattr(k)
                try:
                    assert not np.isnan(z).all()
                    js[varname][k] = z
                except AssertionError:
                    js[varname][k] = None
                except TypeError:
                    js[varname][k] = z

            for vf in vfuncs:
                try:
                    js[varname].update(vf(var))
                except BaseException:
                    logger.exception(
                        "Could not apply custom variable attribue function")

        return json.loads(json.dumps(js, cls=BasicNumpyEncoder))
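A minimal usage sketch for the method above, assuming it lives on pyaxiom's EnhancedDataset (the import path and the add_units_label helper are illustrative assumptions, not part of the original):

    from pyaxiom.netcdf import EnhancedDataset  # assumed import path

    def add_units_label(var):
        # Hypothetical vfunc: derive an extra attribute from the variable
        return {'units_label': getattr(var, 'units', 'unknown')}

    nc = EnhancedDataset('data.nc')
    attrs = nc.json_attributes(vfuncs=[add_units_label])
    nc.close()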
Example #2
    def from_directory(cls, directory, suffix=".nc", subdirs=True, dimName='time', apply_to_members=None):

        if not os.path.isdir(directory):
            logger.error("Directory {0} does not exists or I do not have the correct permissions to access".format(directory))

        # Create NcML pointing to the directory
        ncml = """<?xml version="1.0" encoding="UTF-8"?>
                    <netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
                        <aggregation dimName="{0}" type="joinExisting">
                            <scan location="{1}" suffix="{2}" subdirs="{3}" />
                        </aggregation>
                    </netcdf>
               """.format(dimName, directory, suffix, subdirs)
        try:
            return cls(pyncml.scan(ncml, apply_to_members=apply_to_members))
        except BaseException:
            logger.exception("Could not load Collection from Directory.")
Example #3
    def to_dataframe(self, clean_cols=True, clean_rows=True):
        # Z
        zvar = self.z_axes()[0]
        z = np.ma.fix_invalid(np.ma.MaskedArray(zvar[:]))
        z = z.flatten().round(5)
        logger.debug(['z data size: ', z.size])

        # T
        tvar = self.t_axes()[0]
        t = np.ma.MaskedArray(nc4.num2date(tvar[:], tvar.units, getattr(tvar, 'calendar', 'standard'))).flatten()
        # Patch the time variable back to its original mask, since num2date
        # breaks any missing/fill values
        if hasattr(tvar[0], 'mask'):
            t.mask = tvar[:].mask
        logger.debug(['time data size: ', t.size])

        # X
        xvar = self.x_axes()[0]
        x = np.ma.fix_invalid(np.ma.MaskedArray(xvar[:])).flatten().round(5)
        logger.debug(['x data size: ', x.size])

        # Y
        yvar = self.y_axes()[0]
        y = np.ma.fix_invalid(np.ma.MaskedArray(yvar[:])).flatten().round(5)
        logger.debug(['y data size: ', y.size])

        # Trajectories
        pvar = self.get_variables_by_attributes(cf_role='trajectory_id')[0]

        try:
            p = normalize_array(pvar)
        except BaseException:
            logger.exception('Could not pull trajectory values from the variable, using indexes.')
            p = np.asarray(list(range(len(pvar))), dtype=int)

        # The dimension that the time variable has but the trajectory id
        # variable lacks is the one the trajectory data must be repeated along
        dim_diff = self.dimensions[list(set(tvar.dimensions).difference(set(pvar.dimensions)))[0]]
        if dim_diff:
            p = p.repeat(dim_diff.size)
        logger.debug(['trajectory data size: ', p.size])

        # Distance
        d = np.append([0], great_distance(start_latitude=y[0:-1], end_latitude=y[1:], start_longitude=x[0:-1], end_longitude=x[1:])['distance'])
        d = np.ma.fix_invalid(np.ma.MaskedArray(np.cumsum(d)).astype(np.float64).round(2))
        logger.debug(['distance data size: ', d.size])

        df_data = {
            't': t,
            'x': x,
            'y': y,
            'z': z,
            'trajectory': p,
            'distance': d
        }

        building_index_to_drop = np.ones(t.size, dtype=bool)
        extract_vars = list(set(self.data_vars() + self.ancillary_vars()))
        for i, dvar in enumerate(extract_vars):
            vdata = np.ma.fix_invalid(np.ma.MaskedArray(dvar[:].round(3).flatten()))
            building_index_to_drop = building_index_to_drop & np.ma.getmaskarray(vdata)
            df_data[dvar.name] = vdata

        df = pd.DataFrame(df_data)

        # Drop all data columns with no data
        if clean_cols:
            df = df.dropna(axis=1, how='all')

        # Drop all data rows with no data variable data
        if clean_rows:
            df = df.iloc[~building_index_to_drop]

        return df
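A usage sketch for to_dataframe; the class name below is a hypothetical stand-in for whatever CF trajectory class defines the method:

    # TrajectoryDataset is a stand-in name for the defining class
    traj = TrajectoryDataset('trajectory.nc')
    df = traj.to_dataframe(clean_cols=True, clean_rows=True)
    # Columns t/x/y/z/trajectory/distance plus one column per data variable
    print(df.head())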
Example #4
    def from_glob(cls, glob_string, timevar_name='time', ncml=None):
        dataset_name      = None
        dataset_starting  = None
        dataset_ending    = None
        dataset_variables = []
        dataset_members   = []

        files = glob(glob_string)
        logger.info("Processing aggregation containing {!s} files".format(len(files)))
        for i, filepath in enumerate(files):
            logger.info("Processing member ({0}/{1}) - {2} ".format(i+1, len(files), filepath))
            nc = None
            try:
                if ncml is not None:
                    # Apply NcML
                    tmp_f, tmp_fp = tempfile.mkstemp(prefix="nc")
                    os.close(tmp_f)
                    nc = pyncml.apply(filepath, ncml, output_file=tmp_fp)
                else:
                    nc = netCDF4.Dataset(filepath)

                if dataset_name is None:
                    if hasattr(nc, 'name'):
                        dataset_name = nc.name
                    elif hasattr(nc, 'title'):
                        dataset_name = nc.title
                    else:
                        dataset_name = "Pyaxiom Glob Dataset"

                timevar = nc.variables.get(timevar_name)
                if timevar is None:
                    logger.error("Time variable '{0}' was not found in file '{1}'. Skipping.".format(timevar_name, filepath))
                    continue

                # Start/Stop of NetCDF file
                starting  = netCDF4.num2date(np.min(timevar[:]), units=timevar.units)
                ending    = netCDF4.num2date(np.max(timevar[:]), units=timevar.units)
                variables = list(filter(None, [ nc.variables[v].standard_name if hasattr(nc.variables[v], 'standard_name') else None for v in nc.variables.keys() ]))

                dataset_variables = list(set(dataset_variables + variables))

                if starting.tzinfo is None:
                    starting = starting.replace(tzinfo=pytz.utc)
                if ending.tzinfo is None:
                    ending = ending.replace(tzinfo=pytz.utc)
                if dataset_starting is None or starting < dataset_starting:
                    dataset_starting = starting
                if dataset_ending is None or ending > dataset_ending:
                    dataset_ending = ending

                member = DotDict(path=filepath, standard_names=variables, starting=starting, ending=ending)
                dataset_members.append(member)
            except BaseException:
                logger.exception("Something went wrong with {0}".format(filepath))
                continue
            finally:
                if nc is not None:
                    nc.close()
                try:
                    os.remove(tmp_fp)
                except (OSError, UnboundLocalError):
                    pass

        dataset_members = sorted(dataset_members, key=operator.attrgetter('starting'))
        return cls(DotDict(name=dataset_name,
                           timevar_name=timevar_name,
                           starting=dataset_starting,
                           ending=dataset_ending,
                           standard_names=dataset_variables,
                           members=dataset_members))
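A usage sketch, assuming from_glob is a classmethod on the same Collection class (import path assumed):

    from pyaxiom.netcdf.grids import Collection  # assumed import path

    # Build an aggregation from all files matching the glob; members are
    # sorted by their starting time
    collection = Collection.from_glob('/data/model/*.nc', timevar_name='time')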
Example #5
    def from_ncml_file(cls, ncml_path, apply_to_members=None):
        try:
            with open(ncml_path) as f:
                return cls(pyncml.scan(f.read(), apply_to_members=apply_to_members))
        except BaseException:
            logger.exception("Could not load Collection from NcML. Please check the NcML.")
Example #6
    def from_glob(cls, glob_string, timevar_name='time', ncml=None):
        dataset_name      = None
        dataset_starting  = None
        dataset_ending    = None
        dataset_variables = []
        dataset_members   = []

        files = glob(glob_string)
        logger.info("Processing aggregation containing {!s} files".format(len(files)))
        for i, filepath in enumerate(files):
            logger.info("Processing member ({0}/{1}) - {2} ".format(i+1, len(files), filepath))
            nc = None
            try:
                if ncml is not None:
                    # Apply NcML
                    tmp_f, tmp_fp = tempfile.mkstemp(prefix="nc")
                    os.close(tmp_f)
                    nc = pyncml.apply(filepath, ncml, output_file=tmp_fp)
                else:
                    nc = netCDF4.Dataset(filepath)

                if dataset_name is None:
                    if 'name' in nc.ncattrs():
                        dataset_name = nc.name
                    elif 'title' in nc.ncattrs():
                        dataset_name = nc.title
                    else:
                        dataset_name = "Pyaxiom Glob Dataset"

                timevar = nc.variables.get(timevar_name)
                if timevar is None:
                    logger.error("Time variable '{0}' was not found in file '{1}'. Skipping.".format(timevar_name, filepath))
                    continue

                # Start/Stop of NetCDF file
                starting  = netCDF4.num2date(np.min(timevar[:]), units=timevar.units)
                ending    = netCDF4.num2date(np.max(timevar[:]), units=timevar.units)
                variables = list([_f for _f in [ nc.variables[v].standard_name if hasattr(nc.variables[v], 'standard_name') else None for v in list(nc.variables.keys()) ] if _f])

                dataset_variables = list(set(dataset_variables + variables))

                if starting.tzinfo is None:
                    starting = starting.replace(tzinfo=pytz.utc)
                if ending.tzinfo is None:
                    ending = ending.replace(tzinfo=pytz.utc)
                if dataset_starting is None or starting < dataset_starting:
                    dataset_starting = starting
                if dataset_ending is None or ending > dataset_ending:
                    dataset_ending = ending

                member = DotDict(path=filepath, standard_names=variables, starting=starting, ending=ending)
                dataset_members.append(member)
            except BaseException:
                logger.exception("Something went wrong with {0}".format(filepath))
                continue
            finally:
                if nc is not None:
                    nc.close()
                try:
                    os.remove(tmp_fp)
                except (OSError, UnboundLocalError):
                    pass

        dataset_members = sorted(dataset_members, key=operator.attrgetter('starting'))
        return cls(DotDict(name=dataset_name,
                           timevar_name=timevar_name,
                           starting=dataset_starting,
                           ending=dataset_ending,
                           standard_names=dataset_variables,
                           members=dataset_members))
Example #7
    def add_variable(self, variable_name, values, times=None, verticals=None, sensor_vertical_datum=None, attributes=None, unlink_from_profile=None, fillvalue=None, raise_on_error=False):

        if isinstance(values, (list, tuple,)) and values:
            values = np.asarray(values)
        if isinstance(times, (list, tuple,)) and times:
            times = np.asarray(times)
        if isinstance(verticals, (list, tuple,)) and verticals:
            verticals = np.asarray(verticals)

        # Set vertical datum on the CRS variable
        if sensor_vertical_datum is not None:
            try:
                self.crs.geoid_name = sensor_vertical_datum
                self.crs.vertical_datum = sensor_vertical_datum
                self.crs.water_surface_reference_datum = sensor_vertical_datum
            except AttributeError:
                pass

        # Set default fillvalue for new variables
        if fillvalue is None:
            fillvalue = -9999.9

        used_values = None
        try:
            if unlink_from_profile is True:
                used_values = np.ma.reshape(values, (self.time.size, ))
                used_values = used_values[self.time_indexes]
            # These next two cases should work for all but a few cases, which are caught below
            elif self.z.size == 1:
                used_values = np.ma.reshape(values, (self.time.size, ))
                used_values = used_values[self.time_indexes]
            else:
                used_values = np.ma.reshape(values, (self.time.size, self.z.size, ))
                used_values = used_values[self.time_indexes]
                try:
                    used_values = used_values[:, self.vertical_indexes]
                except IndexError:
                    # The vertical values most likely had duplicates.  Ignore the
                    # faulty index here and try to save the values as is.
                    pass
        except ValueError:
            if raise_on_error is True:
                self.close()
                raise
            else:
                logger.exception("Could not do a simple reshape of data, trying to match manually! Time:{!s}, Heights:{!s}, Values:{!s}".format(self.time.size, self.z.size, values.size))
            if self.z.size > 1:
                if times is not None and verticals is not None:
                    # Hmmm, we have two actual height values for this station.
                    # Not cool man, not cool.
                    # Reindex the entire values array.  This is slow.
                    indexed = ((bisect.bisect_left(self.time[:], times[i]), bisect.bisect_left(self.z[:], verticals[i]), values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, self.z.size, ), dtype=np.float64)
                    used_values.fill(float(fillvalue))
                    for (tzi, zzi, vz) in indexed:
                        if zzi < self.z.size and tzi < self.time.size:
                            used_values[tzi, zzi] = vz
                else:
                    self.close()
                    raise ValueError("You need to pass in both 'times' and 'verticals' parameters that matches the size of the 'values' parameter.")
            else:
                if times is not None:
                    # Ugh, find the time indexes manually
                    indexed = ((bisect.bisect_left(self.time[:], times[i]), values[i]) for i in range(values.size))
                    used_values = np.ndarray((self.time.size, ), dtype=np.float64)
                    used_values.fill(float(fillvalue))
                    for (tzi, vz) in indexed:
                        if tzi < self.time.size:
                            used_values[tzi] = vz
                else:
                    self.close()
                    raise ValueError("You need to pass in a 'times' parameter that matches the size of the 'values' parameter.")

        logger.info("Setting values for {}...".format(variable_name))
        if len(used_values.shape) == 1:
            var = self.nc.createVariable(variable_name,    "f8", ("time",), fill_value=fillvalue, chunksizes=(1000,), zlib=True)
            if self.z.size == 1:
                var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
            else:
                # This is probably a bottom sensor on an ADCP or something, don't add the height coordinate
                var.coordinates = "time latitude longitude"
                if unlink_from_profile is True:
                    # Create metadata variable for the sensor_depth
                    if self.nc.variables.get('sensor_depth') is None:
                        logger.info("Setting the special case 'sensor_depth' metadata variable")
                        inst_depth = self.nc.createVariable('sensor_depth', 'f4')
                        inst_depth.units = 'm'
                        inst_depth.standard_name = 'surface_altitude'
                        inst_depth.long_name = 'sensor depth below datum'
                        inst_depth.positive = self.vertical_positive
                        inst_depth.datum = sensor_vertical_datum or 'Unknown'
                        inst_depth[:] = verticals[0] * -1

        elif len(used_values.shape) == 2:
            var = self.nc.createVariable(variable_name,    "f8", ("time", "z",), fill_value=fillvalue, chunksizes=(1000, self.z.size,), zlib=True)
            var.coordinates = "{} {} latitude longitude".format(self.time_axis_name, self.vertical_axis_name)
        else:
            raise ValueError("Could not create variable.  Shape of data is {!s}.  Expected a dimension of 1 or 2, not {!s}.".format(used_values.shape, len(used_values.shape)))
        # Set the variable attributes as passed in
        if attributes:
            for k, v in attributes.items():

                if k == 'vertical_datum' and sensor_vertical_datum is None and v is not None:
                    # Use this as the vertical datum if it is specified and we didn't already have one
                    try:
                        self.crs.geoid_name = v
                        self.crs.vertical_datum = v
                        self.crs.water_surface_reference_datum = v
                    except AttributeError:
                        pass

                if k != '_FillValue' and v is not None:
                    try:
                        setattr(var, k, v)
                    except BaseException:
                        logger.info('Could not add attribute {}: {}, skipping.'.format(k, v))

        var.grid_mapping = 'crs'
        var[:] = used_values

        return var
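A usage sketch for add_variable. Construction of the station object is omitted because its signature is not shown here; ts is assumed to be an instance of the class that defines the method above:

    import numpy as np

    # `ts` is an already-constructed station time-series object; values must
    # reshape to (time,) or (time, z) as handled by add_variable() above
    ts.add_variable('sea_water_temperature',
                    values=np.asarray([10.1, 10.3, 10.2]),
                    attributes={'units': 'degree_Celsius'})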