Example #1
class MyAlgorithm(Algorithm):
    x = NodeTrait().tag(attr=True)
    y = NodeTrait().tag(attr=True)
    outputs = ["sum", "prod", "diff"]

    def algorithm(self, inputs):
        sum_ = inputs["x"] + inputs["y"]
        prod = inputs["x"] * inputs["y"]
        diff = inputs["x"] - inputs["y"]
        return np.stack([sum_, prod, diff], -1)
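A minimal usage sketch for this multi-output node, assuming a standard podpac install (`Arange` and `SinCoords` are podpac's stock utility nodes; the wiring is illustrative):

import podpac
from podpac.algorithm import Arange, SinCoords

# Hypothetical inputs for the two NodeTrait attributes.
node = MyAlgorithm(x=Arange(), y=SinCoords())

coords = podpac.Coordinates(
    [podpac.clinspace(40, 39, 16), podpac.clinspace(-100, -99, 16)],
    dims=["lat", "lon"],
)
out = node.eval(coords)       # UnitsDataArray with an extra "output" dimension
sums = out.sel(output="sum")  # select one of the declared outputs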
Example #2
class UnaryAlgorithm(BaseAlgorithm):
    """
    Base class for computation nodes that take a single source and transform it.

    Attributes
    ----------
    source : Node
        The source node

    Notes
    ------
    Developers of new Algorithm nodes need to implement the `eval` method.
    """

    source = NodeTrait().tag(attr=True, required=True)

    # list of attribute names, used by __repr__ and __str__ to display minimal info about the node
    _repr_keys = ["source"]

    @tl.default("outputs")
    def _default_outputs(self):
        return self.source.outputs

    @tl.default("style")
    def _default_style(self):  # Pass through source style by default
        return self.source.style
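The Notes above ask subclasses to implement `eval`; a minimal sketch of such a subclass (the `Offset` node and its trait are hypothetical):

import traitlets as tl

class Offset(UnaryAlgorithm):
    """Hypothetical node: add a constant to the output of `source`."""

    offset = tl.Float(0.0).tag(attr=True)

    def eval(self, coordinates, **kwargs):
        # Evaluate the single source, then transform its output.
        return self.source.eval(coordinates, **kwargs) + self.offset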
Example #3
class Reproject(Interpolate):
    """
    Create an Algorithm that evaluates a Node with one set of coordinates, and then interpolates it.
    This can be used to bilinearly interpolate an averaged dataset, for example.

    Attributes
    ----------
    source : Node
        The source node. This node will use its own specified interpolation scheme.
    interpolation : str
        Type of interpolation method to use for the interpolation
    coordinates: Coordinates, Node, str, dict
        Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted string,
        PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the 'coordinates' attribute.
    """

    coordinates = tl.Union(
        [NodeTrait(), tl.Dict(), tl.Unicode(), tl.Instance(Coordinates)],
        help="""Coordinates used to evaluate the source. These can be specified as a dictionary,
            json-formatted string, PODPAC Coordinates, or a PODPAC Node, where the node MUST implement
            the 'coordinates' attribute.""",
    ).tag(attr=True)

    @tl.validate("coordinates")
    def _validate_coordinates(self, d):
        if isinstance(d["value"],
                      Node) and not hasattr(d["value"], "coordinates"):
            raise ValueError(
                "When specifying the coordinates as a PODPAC Node, this Node must have a 'coordinates' attribute"
            )
        return d["value"]

    @property
    def _coordinates(self):
        if isinstance(self.coordinates, Coordinates):
            return self.coordinates
        elif isinstance(self.coordinates, Node):
            return self.coordinates.coordinates
        elif isinstance(self.coordinates, dict):
            return Coordinates.from_definition(self.coordinates)
        elif isinstance(self.coordinates, string_types):
            return Coordinates.from_json(self.coordinates)
        else:
            raise TypeError("The coordinates attribute is of the wrong type.")

    def _source_eval(self, coordinates, selector, output=None):
        return self.source.eval(self._coordinates,
                                output=output,
                                _selector=selector)

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
Example #4
class Process(Node):
    """
    The source node is evaluated in a separate process; the call blocks until the child process returns.
    """

    source = NodeTrait().tag(attr=True)
    output_format = tl.Dict(None, allow_none=True).tag(attr=True)
    timeout = tl.Int(None, allow_none=True)
    block = tl.Bool(True)

    @property
    def outputs(self):
        return self.source.outputs

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        definition = self.source.json
        coords = coordinates.json

        q = Queue()
        process = mpProcess(target=_f,
                            args=(definition, coords, q, self.output_format))
        process.daemon = True
        _log.debug("Starting process.")
        process.start()
        _log.debug("Retrieving data from queue.")
        o = q.get(timeout=self.timeout, block=self.block)
        _log.debug("Joining.")
        process.join()  # This is blocking!
        _log.debug("Closing.")
        if sys.version_info >= (3, 7):
            process.close()  # Process.close() is new in Python 3.7
        if isinstance(o, str):
            raise Exception(o)
        if o is None:
            return
        o._pp_deserialize()
        if output is not None:
            output[:] = o.data[:]
        else:
            output = o

        return output
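A usage sketch (the node and coordinates are illustrative; `timeout` is in seconds and is passed straight to `Queue.get`):

import podpac

coords = podpac.Coordinates(
    [podpac.clinspace(40, 39, 16), podpac.clinspace(-100, -99, 16)],
    dims=["lat", "lon"],
)

# Evaluate an arbitrary node in a child process; eval() blocks until the
# child returns or `timeout` expires.
proc = Process(source=podpac.algorithm.Arange(), timeout=60)
out = proc.eval(coords)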
Example #5
class MyAlgorithm(BaseAlgorithm):
    x = NodeTrait().tag(attr=True)
    y = NodeTrait().tag(attr=True)
Example #6
class MyNode(Node):
    my_attr = NodeTrait().tag(attr=True)
Example #7
class Mask(Algorithm):
    """
    Masks the `source` based on a boolean expression involving the `mask`
    (i.e. source[mask <bool_op> <bool_val>] = <masked_val>).
    For a normal boolean mask input, default values for `bool_op`, `bool_val` and `masked_val` can be used.

    Attributes
    ----------
    source : podpac.Node
        The source that will be masked
    mask : podpac.Node
        The data that will be used to compute the mask
    masked_val : float, optional
        Default value is np.nan. The value that will replace the masked items.
    bool_val : float, optional
        Default value is 1. The value used to compare the mask when creating the boolean expression
    bool_op : enum, optional
        Default value is '=='. One of ['==', '<', '<=', '>', '>=']
    in_place : bool, optional
        Default is False. If True, the source array will be changed in-place, which could affect the value of the source
        in other parts of the pipeline.

    Examples
    ----------
    # Mask data from a boolean data node using the default behavior.
    # Create a boolean masked Node (as an example)
    b = Arithmetic(A=SinCoords(), eqn='A>0')
    # Create the source node
    a = Arange()
    masked = Mask(source=a, mask=b)

    # Create a node that makes the following substitution "a[b > 0] = np.nan"
    a = Arange()
    b = SinCoords()
    masked = Mask(source=a, mask=b,
                  masked_val=np.nan,
                  bool_val=0, bool_op='>',
                  in_place=True)

    """

    source = NodeTrait().tag(attr=True, required=True)
    mask = NodeTrait().tag(attr=True, required=True)
    masked_val = tl.Float(allow_none=True, default_value=None).tag(attr=True)
    bool_val = tl.Float(1).tag(attr=True)
    bool_op = tl.Enum(["==", "<", "<=", ">", ">="], default_value="==").tag(attr=True)
    in_place = tl.Bool(False).tag(attr=True)

    _repr_keys = ["source", "mask"]

    def algorithm(self, inputs, coordinates):
        """
        Sets the values in inputs['source'] to self.masked_val using (inputs['mask'] <self.bool_op> <self.bool_val>)

        Parameters
        ----------
        inputs : dict
            Evaluated outputs of the input nodes. The keys are the attribute names.
        coordinates : podpac.Coordinates
            Requested coordinates.
            Note that the ``inputs`` may have different coordinates than the requested coordinates.

        Returns
        -------
        result : UnitsDataArray
            Algorithm result.
        """

        # shorter names
        mask = inputs["mask"]
        source = inputs["source"]
        op = self.bool_op
        bv = self.bool_val

        # Make a copy if we don't want to change the source in-place
        if not self.in_place:
            source = source.copy()

        # Make the mask boolean
        if op == "==":
            mask = mask == bv
        elif op == "<":
            mask = mask < bv
        elif op == "<=":
            mask = mask <= bv
        elif op == ">":
            mask = mask > bv
        elif op == ">=":
            mask = mask >= bv

        # Mask the values and return
        if self.masked_val is None:
            source.set(np.nan, mask)
        else:
            source.set(self.masked_val, mask)

        return source
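For reference, the masking rule reduces to a plain boolean-index assignment; a numpy sketch of the default behavior (bool_op='==', bool_val=1, masked_val=np.nan):

import numpy as np

source = np.arange(6, dtype=float)
mask = np.array([1, 0, 1, 0, 1, 0])

out = source.copy()      # in_place=False keeps the source intact
out[mask == 1] = np.nan  # source[mask <bool_op> <bool_val>] = <masked_val>
print(out)               # [nan  1. nan  3. nan  5.]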
Example #8
class MyClass(tl.HasTraits):
    node = NodeTrait()
Example #9
class ReprojectedSource(DataSource):
    """Create a DataSource with a different resolution from another Node. This can be used to bilinearly interpolated a
    dataset after averaging over a larger area.

    Attributes
    ----------
    source : Node
        The source node
    source_interpolation : str
        Type of interpolation method to use for the source node
    reprojected_coordinates : :class:`podpac.Coordinates`
        Coordinates where the source node should be evaluated.
    """

    source = NodeTrait().tag(attr=True, required=True)
    source_interpolation = InterpolationTrait().tag(attr=True)
    reprojected_coordinates = tl.Instance(Coordinates).tag(attr=True,
                                                           required=True)

    # list of attribute names, used by __repr__ and __str__ to display minimal info about the node
    _repr_keys = ["source", "interpolation"]

    def _first_init(self, **kwargs):
        warnings.warn(
            "ReprojectedSource has been replaced by the Reproject algorithm node "
            "and will be removed in a future version of podpac.",
            DeprecationWarning,
        )

        if "reprojected_coordinates" in kwargs:
            if isinstance(kwargs["reprojected_coordinates"], dict):
                kwargs[
                    "reprojected_coordinates"] = Coordinates.from_definition(
                        kwargs["reprojected_coordinates"])
            elif isinstance(kwargs["reprojected_coordinates"], string_types):
                kwargs["reprojected_coordinates"] = Coordinates.from_json(
                    kwargs["reprojected_coordinates"])

        return super(ReprojectedSource, self)._first_init(**kwargs)

    @cached_property
    def eval_source(self):
        if self.source_interpolation is not None and not self.source.has_trait(
                "interpolation"):
            _logger.warning(
                "ReprojectedSource cannot set the 'source_interpolation'"
                " since 'source' does not have an 'interpolation' "
                " trait. \n type(source): %s\nsource: %s" %
                (str(type(self.source)), str(self.source)))

        source = self.source
        if (self.source_interpolation is not None
                and self.source.has_trait("interpolation")
                and self.source_interpolation != self.source.interpolation):
            source = copy.deepcopy(source)
            source.set_trait("interpolation", self.source_interpolation)

        return source

    @common_doc(COMMON_DATA_DOC)
    def get_coordinates(self):
        """{get_coordinates}"""

        # cannot guarantee that coordinates exist
        if not isinstance(self.source, DataSource):
            return self.reprojected_coordinates

        sc = self.source.coordinates
        rc = self.reprojected_coordinates
        return Coordinates(
            [rc[dim] if dim in rc.dims else sc[dim] for dim in sc.dims],
            validate_crs=False,
        )

    @common_doc(COMMON_DATA_DOC)
    def get_data(self, coordinates, coordinates_index):
        """{get_data}"""

        data = self.eval_source.eval(coordinates)

        # The following is needed in case the source is an algorithm
        # or compositor node that doesn't have all the dimensions of
        # the reprojected coordinates
        # TODO: What if data has coordinates that reprojected_coordinates doesn't have
        keep_dims = list(data.coords.keys())
        drop_dims = [d for d in coordinates.dims if d not in keep_dims]
        coordinates.drop(drop_dims)
        return data

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
Example #10
class BaseCompositor(Node):
    """A base class for compositor nodes.

    Attributes
    ----------
    sources : list
        Source nodes.
    source_coordinates : :class:`podpac.Coordinates`
        Coordinates that make each source unique. Must be the same size as ``sources`` and single-dimensional. Optional.
    multithreading : bool, optional
        Default is settings["MULTITHREADING"]. If True, sources may be evaluated in parallel threads; if False, they
        are always evaluated serially.

    Notes
    -----
    Developers of compositor subclasses nodes need to implement the `composite` method.

    Multithreading::
      * When MULTITHREADING is False, the compositor stops evaluating sources once the output is completely filled.
      * When MULTITHREADING is True, the compositor must evaluate every source.
        The result is the same, but note that because of this, disabling multithreading could sometimes be faster,
        especially if the number of threads is low.
      * NASA data servers seem to have a hard limit of 10 simultaneous requests, so a max of 10 threads is recommended
        for most use-cases.
    """

    sources = tl.List(trait=NodeTrait()).tag(attr=True, required=True)
    source_coordinates = tl.Instance(Coordinates,
                                     allow_none=True,
                                     default_value=None).tag(attr=True)
    multithreading = tl.Bool(False)

    @tl.default("multithreading")
    def _default_multithreading(self):
        return settings["MULTITHREADING"]

    dims = tl.List(trait=Dimension()).tag(attr=True)
    auto_outputs = tl.Bool(False)

    # debug traits
    _eval_sources = tl.Any()

    @tl.validate("sources")
    def _validate_sources(self, d):
        sources = d["value"]

        n = np.sum([source.outputs is None for source in sources])
        if not (n == 0 or n == len(sources)):
            raise ValueError(
                "Cannot composite standard sources with multi-output sources. "
                "The sources must all be standard single-output nodes or all multi-output nodes."
            )

        return sources

    @tl.validate("source_coordinates")
    def _validate_source_coordinates(self, d):
        if d["value"] is None:
            return None

        if d["value"].ndim != 1:
            raise ValueError(
                "Invalid source_coordinates, invalid ndim (%d != 1)" %
                d["value"].ndim)

        if d["value"].size != len(self.sources):
            raise ValueError(
                "Invalid source_coordinates, source and source_coordinates size mismatch (%d != %d)"
                % (d["value"].size, len(self.sources)))

        return d["value"]

    @tl.default("outputs")
    def _default_outputs(self):
        if not self.auto_outputs:
            return None

        # autodetect outputs from sources
        if all(source.outputs is None for source in self.sources):
            outputs = None

        elif all(source.outputs is not None and source.output is None
                 for source in self.sources):
            outputs = []
            for source in self.sources:
                for output in source.outputs:
                    if output not in outputs:
                        outputs.append(output)

            if len(outputs) == 0:
                outputs = None

        else:
            raise RuntimeError(
                "Compositor sources were not validated correctly. "
                "Cannot composite standard sources with multi-output sources.")

        return outputs

    def select_sources(self, coordinates, _selector=None):
        """Select and prepare sources based on requested coordinates.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            Coordinates to evaluate at compositor sources
        _selector : :class:`podpac.core.interpolation.selectors.Selector`
            Selector used to sub-select sources based on the interpolation scheme

        Returns
        -------
        sources : :class:`np.ndarray`
            Array of sources

        Notes
        -----
         * If :attr:`source_coordinates` is defined, only sources that intersect the requested coordinates are selected.
        """

        # select intersecting sources, if possible
        if self.source_coordinates is None:
            sources = self.sources
        else:
            try:
                if _selector is not None:
                    _, I = _selector(self.source_coordinates,
                                     coordinates,
                                     index_type="numpy")
                else:
                    _, I = self.source_coordinates.intersect(coordinates,
                                                             outer=True,
                                                             return_index=True)
            except Exception:
                # Likely non-monotonic coordinates
                _, I = self.source_coordinates.intersect(coordinates,
                                                         outer=False,
                                                         return_index=True)
            i = I[0]
            sources = np.array(self.sources)[i].tolist()

        return sources

    def composite(self, coordinates, data_arrays, result=None):
        """Implements the rules for compositing multiple sources together. Must be implemented by child classes.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}
        data_arrays : generator
            Evaluated data, in the same order as the sources. Yields a UnitsDataArray.
        result : UnitDataArray, optional
            An optional pre-filled array may be supplied, otherwise the output will be allocated.

        Returns
        -------
        {eval_return}
        """

        raise NotImplementedError()

    def iteroutputs(self, coordinates, _selector=None):
        """Summary

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            Coordinates to evaluate at compositor sources

        Yields
        ------
        :class:`podpac.core.units.UnitsDataArray`
            Output from source node eval method
        """

        # get sources, potentially downselected
        sources = self.select_sources(coordinates, _selector)

        if settings["DEBUG"]:
            self._eval_sources = sources

        if len(sources) == 0:
            yield self.create_output_array(coordinates)
            return

        if self.multithreading:
            n_threads = thread_manager.request_n_threads(len(sources))
            if n_threads == 1:
                thread_manager.release_n_threads(n_threads)
        else:
            n_threads = 0

        if self.multithreading and n_threads > 1:
            # evaluate nodes in parallel using thread pool
            self._multi_threaded = True
            pool = thread_manager.get_thread_pool(processes=n_threads)
            outputs = pool.map(
                lambda src: src.eval(coordinates, _selector=_selector),
                sources)
            pool.close()
            thread_manager.release_n_threads(n_threads)
            for output in outputs:
                yield output

        else:
            # evaluate nodes serially
            self._multi_threaded = False
            for src in sources:
                yield src.eval(coordinates, _selector=_selector)

    @common_doc(COMMON_COMPOSITOR_DOC)
    def eval(self, coordinates, **kwargs):
        """
        Wraps the super Node.eval method in order to cache with the correct coordinates.

        The output is independent of any extra dimensions, so this removes extra dimensions before caching in the
        super eval method.
        """

        super_coordinates = coordinates

        # remove extra dimensions
        if self.dims:
            extra = [
                c.name for c in coordinates.values()
                if (isinstance(c, Coordinates1d) and c.name not in self.dims)
                or (isinstance(c, StackedCoordinates) and all(
                    dim not in self.dims for dim in c.dims))
            ]
            super_coordinates = super_coordinates.drop(extra)

        # note: super().eval (not self._eval)
        output = super().eval(super_coordinates, **kwargs)

        if settings["DEBUG"]:
            self._requested_coordinates = coordinates

        return output

    @common_doc(COMMON_COMPOSITOR_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this nodes using the supplied coordinates.

        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector: callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}
        """

        self._evaluated_coordinates = coordinates
        outputs = self.iteroutputs(coordinates, _selector)
        output = self.composite(coordinates, outputs, output)
        return output

    def find_coordinates(self):
        """
        Get the available coordinates for the Node.

        Returns
        -------
        coords_list : list
            available coordinates from all of the sources.
        """

        return [
            coords for source in self.sources
            for coords in source.find_coordinates()
        ]

    @property
    def _repr_keys(self):
        """list of attribute names, used by __repr__ and __str__ to display minimal info about the node"""
        keys = []
        if self.trait_is_defined("sources"):
            keys.append("sources")
        return keys
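`composite` receives the evaluated sources lazily, so in serial mode a subclass controls how many sources actually get evaluated; a minimal sketch of a hypothetical "first valid value wins" compositor (podpac ships its own ordered compositors; this only illustrates the generator contract):

import numpy as np

class FirstValidCompositor(BaseCompositor):
    """Hypothetical compositor: the first non-nan value for each cell wins."""

    def composite(self, coordinates, data_arrays, result=None):
        if result is None:
            result = self.create_output_array(coordinates)  # nan-filled by default
        for data in data_arrays:  # generator: sources are evaluated lazily
            empty = np.isnan(result.data)
            if not empty.any():
                break  # output is full; remaining sources never evaluate (serial mode)
            result.data[empty] = data.data[empty]
        return result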
Example #11
class Parallel(Node):
    """
    This class launches the parallel node evaluations in separate threads. As such, the node does not need to return
    immediately (i.e. does NOT have to be asynchronous). For asynchronous nodes
    (i.e. aws.Lambda with download_result=False) use ParallelAsync.

    Attributes
    -----------
    chunks: dict
        Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary, the
        size of the eval coordinates will be used for the chunk. In this case, it may not be possible to automatically
        set the coordinates of missing dimensions in the final file.
    fill_output: bool
        Default is True. When True, the final results will be assembled and returned to the user. If False, the final
        results should be written to a file by specifying the output_format in a Process or Lambda node.
        See note below.
    source: podpac.Node
        The source dataset for the computation
    number_of_workers: int
        Default is 1. Number of parallel process workers at one time.
    start_i: int, optional
        Default is 0. Starting chunk. This allows you to restart a run without having to check/submit thousands of
        workers before getting back to where you were. Empty chunks make the submission slower.

    Notes
    ------
    In some cases where the input and output coordinates of the source node are not the same (such as reduce nodes)
    and fill_output is True, the user may need to specify 'output' as part of the eval call.
    """

    _repr_keys = ["source", "number_of_workers", "chunks"]
    source = NodeTrait().tag(attr=True)
    chunks = tl.Dict().tag(attr=True)
    fill_output = tl.Bool(True).tag(attr=True)
    number_of_workers = tl.Int(1).tag(attr=True)
    _lock = Lock()
    errors = tl.List()
    start_i = tl.Int(0)

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        # Make a thread pool to manage queue
        pool = ThreadPool(processes=self.number_of_workers)

        if output is None and self.fill_output:
            output = self.create_output_array(coordinates)

        shape = []
        for d in coordinates.dims:
            if d in self.chunks:
                shape.append(self.chunks[d])
            else:
                shape.append(coordinates[d].size)

        results = []
        #         inputs = []
        i = 0
        for coords, slc in coordinates.iterchunks(shape, True):
            #             inputs.append(coords)
            if i < self.start_i:
                _log.debug(
                    "Skipping {} since it is less than self.start_i ({})".
                    format(i, self.start_i))
                i += 1
                continue

            out = None
            if self.fill_output and output is not None:
                out = output[slc]
            with self._lock:
                _log.debug("Added {} to worker pool".format(i))
                _log.debug("Node eval with coords: {}, {}".format(slc, coords))
                results.append(
                    pool.apply_async(self.eval_source, [coords, slc, out, i]))
            i += 1

        _log.info("Added all chunks to worker pool. Now waiting for results.")
        start_time = time.time()
        for i, res in enumerate(results):
            #             _log.debug('Waiting for results: {} {}'.format(i, inputs[i]))
            dt = str(
                np.timedelta64(int(1000 * (time.time() - start_time)),
                               "ms").astype(object))
            _log.info("({}): Waiting for results: {} / {}".format(
                dt, i + 1, len(results)))

            # Try to get the results / wait for the results
            try:
                o, slc = res.get()
            except Exception as e:
                o = None
                slc = None
                self.errors.append((i, res, e))
                dt = str(
                    np.timedelta64(int(1000 * (time.time() - start_time)),
                                   "ms").astype(object))
                _log.warning("({}) {} failed with exception {}".format(
                    dt, i, e))

            dt = str(
                np.timedelta64(int(1000 * (time.time() - start_time)),
                               "ms").astype(object))
            _log.info("({}) Finished result: {} / {}".format(
                dt, i + 1, len(results)))

            # Fill output
            if self.fill_output:
                if output is None:
                    missing_dims = [
                        d for d in coordinates.dims
                        if d not in self.chunks.keys()
                    ]
                    coords = coordinates.drop(missing_dims)
                    missing_coords = Coordinates.from_xarray(o).drop(
                        list(self.chunks.keys()))
                    coords = merge_dims([coords, missing_coords])
                    coords = coords.transpose(*coordinates.dims)
                    output = self.create_output_array(coords)
                output[slc] = o

        _log.info("Completed parallel execution.")
        pool.close()

        return output

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source
            # Make a copy to prevent any possibility of memory corruption
            source = Node.from_definition(source.definition)

        _log.info("Submitting source {}".format(i))
        return (source.eval(coordinates, output=out), coordinates_index)
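A usage sketch, splitting a large request into 250x250 tiles evaluated four at a time (the source node and sizes are illustrative):

import podpac

coords = podpac.Coordinates(
    [podpac.clinspace(50, 40, 1000), podpac.clinspace(-110, -100, 1000)],
    dims=["lat", "lon"],
)

node = Parallel(
    source=podpac.algorithm.Arange(),
    chunks={"lat": 250, "lon": 250},
    number_of_workers=4,
)
out = node.eval(coords)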
Example #12
class ZarrOutputMixin(tl.HasTraits):
    """
    This class assumes that the node has a 'output_format' attribute
    (currently the "Lambda" Node, and the "Process" Node)

    Attributes
    -----------
    zarr_file: str
        Path to the output zarr file that collects all of the computed results. This can reside on S3.
    dataset: ZarrGroup
        A handle to the zarr group pointing to the output file
    fill_output: bool, optional
        Default is False (unlike parent class). If True, will collect the output data and return it as an xarray.
    init_file_mode: str, optional
        Default is 'a'. Mode used for initializing the zarr file.
    zarr_chunks: dict
        Size of the chunks in the zarr file for each dimension
    zarr_shape: dict, optional
        Default is {coordinates.dims: coordinates.shape}, where coordinates are those used in the eval call. This
        does not need to be specified unless the Node modifies the input coordinates (as part of a Reduce operation,
        for example). The result can be incorrect and requires care/checking by the user.
    zarr_coordinates: podpac.Coordinates, optional
        Default is None. If the node modifies the shape of the input coordinates, this allows users to set the
        coordinates in the output zarr file. This can be incorrect and requires care by the user.
    skip_existing: bool
        Default is True. If True, this will check whether the results already exist, and if so, will not submit a job
        for that particular coordinate evaluation. This assumes self.chunks == self.zarr_chunks.
    list_dir: bool, optional
        Default is False. If skip_existing is True, by default existing files are checked by asking for an 'exists' call.
        If list_dir is True, then at the first opportunity a "list_dir" is performed on the directory and the results
        are cached.
    """

    zarr_file = tl.Unicode().tag(attr=True)
    dataset = tl.Any()
    zarr_node = NodeTrait()
    zarr_data_key = tl.Union([tl.Unicode(), tl.List()])
    fill_output = tl.Bool(False)
    init_file_mode = tl.Unicode("a").tag(attr=True)
    zarr_chunks = tl.Dict(default_value=None, allow_none=True).tag(attr=True)
    zarr_shape = tl.Dict(allow_none=True, default_value=None).tag(attr=True)
    zarr_coordinates = tl.Instance(Coordinates,
                                   allow_none=True,
                                   default_value=None).tag(attr=True)
    zarr_dtype = tl.Unicode("f4")
    skip_existing = tl.Bool(True).tag(attr=True)
    list_dir = tl.Bool(False)
    _list_dir = tl.List(allow_none=True, default_value=[])
    _shape = tl.Tuple()
    _chunks = tl.List()
    aws_client_kwargs = tl.Dict()
    aws_config_kwargs = tl.Dict()

    def eval(self, coordinates, **kwargs):
        output = kwargs.get("output")
        if self.zarr_shape is None:
            self._shape = coordinates.shape
        else:
            self._shape = tuple(self.zarr_shape.values())

        # initialize zarr file
        if self.zarr_chunks is None:
            chunks = [self.chunks[d] for d in coordinates]
        else:
            chunks = [self.zarr_chunks[d] for d in coordinates]
        self._chunks = chunks
        zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks)
        self.dataset = zf
        self.zarr_data_key = data_key
        self.zarr_node = zn
        zn.keys  # touch the property so the node's cached keys are populated

        # eval
        _log.debug("Starting parallel eval.")
        missing_dims = [
            d for d in coordinates.dims if d not in self.chunks.keys()
        ]
        if self.zarr_coordinates is not None:
            missing_dims = missing_dims + [
                d for d in self.zarr_coordinates.dims if d not in missing_dims
            ]
            set_coords = merge_dims(
                [coordinates.drop(missing_dims), self.zarr_coordinates])
        else:
            set_coords = coordinates.drop(missing_dims)
        set_coords.transpose(*coordinates.dims)

        self.set_zarr_coordinates(set_coords, data_key)
        if self.list_dir:
            dk = data_key
            if isinstance(dk, list):
                dk = dk[0]
            self._list_dir = self.zarr_node.list_dir(dk)

        output = super(ZarrOutputMixin, self).eval(coordinates, output=output)

        # fill in the coordinates, this is guaranteed to be correct even if the user messed up.
        if output is not None:
            self.set_zarr_coordinates(Coordinates.from_xarray(output),
                                      data_key)
        else:
            return zf

        return output

    def set_zarr_coordinates(self, coordinates, data_key):
        # Fill in metadata
        for dk in data_key:
            self.dataset[dk].attrs["_ARRAY_DIMENSIONS"] = coordinates.dims
        for d in coordinates.dims:
            # TODO ADD UNITS AND TIME DECODING INFORMATION
            self.dataset.create_dataset(d,
                                        shape=coordinates[d].size,
                                        overwrite=True)
            self.dataset[d][:] = coordinates[d].coordinates

    def initialize_zarr_array(self, shape, chunks):
        _log.debug("Creating Zarr file.")
        zn = Zarr(source=self.zarr_file,
                  file_mode=self.init_file_mode,
                  aws_client_kwargs=self.aws_client_kwargs)
        if self.source.output or getattr(self.source, "data_key", None):
            data_key = self.source.output
            if data_key is None:
                data_key = self.source.data_key
            if not isinstance(data_key, list):
                data_key = [data_key]
            elif self.source.outputs:  # If someone restricted the outputs for this node, we need to know
                data_key = [dk for dk in data_key if dk in self.source.outputs]
        elif self.source.outputs:
            data_key = self.source.outputs
        else:
            data_key = ["data"]

        zf = zarr.open(zn._get_store(), mode=self.init_file_mode)

        # Initialize the output zarr arrays
        for dk in data_key:
            try:
                arr = zf.create_dataset(
                    dk,
                    shape=shape,
                    chunks=chunks,
                    fill_value=np.nan,
                    dtype=self.zarr_dtype,
                    overwrite=not self.skip_existing,
                )
            except ValueError:
                pass  # Dataset already exists

        # Recompute any cached properties
        zn = Zarr(source=self.zarr_file,
                  file_mode=self.init_file_mode,
                  aws_client_kwargs=self.aws_client_kwargs)
        return zf, data_key, zn

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source

        if self.skip_existing:  # This section allows previously computed chunks to be skipped
            dk = self.zarr_data_key
            if isinstance(dk, list):
                dk = dk[0]
            try:
                exists = self.zarr_node.chunk_exists(coordinates_index,
                                                     data_key=dk,
                                                     list_dir=self._list_dir,
                                                     chunks=self._chunks)
            except ValueError as e:  # This was needed in cases where a poor internet connection caused read errors
                exists = False
            if exists:
                _log.info("Skipping {} (already exists)".format(i))
                return out, coordinates_index

        # Make a copy to prevent any possibility of memory corruption
        source = Node.from_definition(source.definition)
        _log.debug("Creating output format.")
        output = dict(
            format="zarr_part",
            format_kwargs=dict(
                part=[[s.start, min(s.stop, self._shape[i]), s.step]
                      for i, s in enumerate(coordinates_index)],
                source=self.zarr_file,
                mode="a",
            ),
        )
        _log.debug("Finished creating output format.")

        if source.has_trait("output_format"):
            source.set_trait("output_format", output)
        _log.debug("output: {}, coordinates.shape: {}".format(
            output, coordinates.shape))
        _log.debug("Evaluating node.")

        o, slc = super(ZarrOutputMixin,
                       self).eval_source(coordinates, coordinates_index, out,
                                         i, source)

        if not source.has_trait("output_format"):
            o.to_format(output["format"], **output["format_kwargs"])
        return o, slc
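The mixin is combined with a Parallel-style node via multiple inheritance; a sketch (the class name, source, and paths are illustrative, though podpac provides similar ready-made combinations):

class ParallelOutputZarr(ZarrOutputMixin, Parallel):
    """Hypothetical combination: parallel evaluation with zarr-file output."""

node = ParallelOutputZarr(
    source=source,                           # any podpac Node (illustrative)
    chunks={"lat": 500, "lon": 500},
    zarr_chunks={"lat": 500, "lon": 500},
    zarr_file="s3://my-bucket/output.zarr",  # illustrative path
)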
Example #13
class ParallelAsync(Parallel):
    """
    This class launches the parallel node evaluations in threads up to n_workers, and expects the node.eval to return
    quickly for parallel execution. This Node was written with aws.Lambda(eval_timeout=1.25) Nodes in mind.

    Users can implement the `check_worker_available` method or specify the `no_worker_exception` attribute, which is an
    exception thrown if workers are not available.

    Attributes
    -----------
    chunks: dict
        Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary, the
        size of the eval coordinates will be used for the chunk. In this case, it may not be possible to automatically
        set the coordinates of missing dimensions in the final file.
    fill_output: bool
        Default is True. When True, the final results will be assembled and returned to the user. If False, the final
        results should be written to a file by specifying the output_format in a Process or Lambda node.
        See note below.
    source: podpac.Node
        The source dataset for the computation
    sleep_time: float
        Default is 1 second. Number of seconds to sleep between trying to submit new workers
    no_worker_exception: Exception, optional
        Default is botocore.exceptions.ClientError. Exception class used to identify when a submission failed due to
        no available workers. The default is chosen to work with the podpac.managers.Lambda node.
    async_exception: Exception
        Default is botocore.exceptions.ReadTimeoutError. This is an exception thrown by the async function in case
        it times out waiting for a return. In our case, this is a success. The default is chosen to work with the
        podpac.managers.Lambda node.

    Notes
    ------
    In some cases where the input and output coordinates of the source node are not the same (such as reduce nodes)
    and fill_output is True, the user may need to specify 'output' as part of the eval call.
    """

    source = NodeTrait().tag(attr=True)
    chunks = tl.Dict().tag(attr=True)
    fill_output = tl.Bool(True).tag(attr=True)
    sleep_time = tl.Float(1).tag(attr=True)
    no_worker_exception = tl.Type(
        botocore.exceptions.ClientError).tag(attr=True)
    async_exception = tl.Type(
        botocore.exceptions.ReadTimeoutError).tag(attr=True)

    def check_worker_available(self):
        return True

    def eval_source(self, coordinates, coordinates_index, out, i, source=None):
        if source is None:
            source = self.source
            # Make a copy to prevent any possibility of memory corruption
            source = Node.from_definition(source.definition)

        success = False
        o = None
        while not success:
            if self.check_worker_available():
                try:
                    o = source.eval(coordinates, output=out)
                    success = True
                except self.async_exception:
                    # This exception is fine and constitutes a success
                    o = None
                    success = True
                except self.no_worker_exception as e:
                    response = e.response
                    if not (response and response.get("Error", {}).get("Code")
                            == "TooManyRequestsException"):
                        raise e  # Raise error again, not the right error
                    _log.debug("Worker {} exception {}".format(i, e))
                    success = False
                    time.sleep(self.sleep_time)
            else:
                _log.debug("Worker unavailable for {}".format(i, e))
                time.sleep(self.sleep_time)
        _log.info("Submitting source {}".format(i))
        return (o, coordinates_index)
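A configuration sketch (the Lambda source is illustrative; `sleep_time` sets the retry interval when submissions are rejected):

# Retry every 5 seconds while the backend rejects submissions with
# TooManyRequestsException (see no_worker_exception above).
node = ParallelAsync(
    source=some_lambda_node,  # e.g. a podpac.managers.Lambda node (illustrative)
    chunks={"lat": 500, "lon": 500},
    sleep_time=5.0,
)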
Example #14
class Interpolate(Node):
    """Node to used to interpolate from self.source.coordinates to the user-specified, evaluated coordinates.

    Parameters
    ----------
    source : Any
        The source node which will be interpolated
    interpolation : str, dict, optional
        Interpolation definition for the data source.
        By default, the interpolation method is set to ``'nearest'`` for all dimensions.

        If input is a string, it must match one of the interpolation shortcuts defined in
        :attr:`podpac.data.INTERPOLATION_SHORTCUTS`. The interpolation method associated
        with this string will be applied to all dimensions at the same time.

        If input is a dict or list of dict, the dict or dict elements must adhere to the following format:

        The key ``'method'`` defining the interpolation method name.
        If the interpolation method is not one of :attr:`podpac.data.INTERPOLATION_SHORTCUTS`, a
        second key ``'interpolators'`` must be defined with a list of
        :class:`podpac.interpolators.Interpolator` classes to use in order of usage.
        The dictionary may contain an optional ``'params'`` key which contains a dict of parameters to pass along to
        the :class:`podpac.interpolators.Interpolator` classes associated with the interpolation method.

        The dict may contain the key ``'dims'`` which specifies dimension names (i.e. ``'time'`` or ``('lat', 'lon')`` ).
        If the dictionary does not contain a key for all unstacked dimensions of the source coordinates, the
        :attr:`podpac.data.INTERPOLATION_DEFAULT` value will be used.
        All dimension keys must be unstacked even if the underlying coordinate dimensions are stacked.
        Any extra dimensions included but not found in the source coordinates will be ignored.

        The dict may contain a key ``'params'`` that can be used to configure the :class:`podpac.interpolators.Interpolator` classes associated with the interpolation method.

        If input is a :class:`podpac.data.Interpolation` class, this Interpolation
        class will be used without modification.
    cache_output : bool
        Should the node's output be cached? If not provided or None, uses default based on
        settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"]. If True, outputs will be cached and retrieved from cache. If False,
        outputs will not be cached OR retrieved from cache (even if they exist in cache).

    Examples
    --------
    # To use bilinear interpolation for [lat, lon], a specific interpolator for [time], and the default for [alt], use:
    >>> interp_node = Interpolate(
            source=some_node,
            interpolation=[
                {
                'method': 'bilinear',
                'dims': ['lat', 'lon']
                },
                {
                'method': 'nearest',
                'interpolators': [podpac.interpolators.NearestNeighbor],
                'dims': ['time']
                }
            ]
        )

    """

    source = NodeTrait(allow_none=True).tag(attr=True)
    _source_xr = tl.Instance(UnitsDataArray, allow_none=True)  # This is needed for the Interpolation Mixin

    interpolation = InterpolationTrait().tag(attr=True)
    cache_output = tl.Bool()

    # privates
    _interpolation = tl.Instance(InterpolationManager)
    _coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None, read_only=True)

    _requested_source_coordinates = tl.Instance(Coordinates)
    _requested_source_coordinates_index = tl.Tuple()
    _requested_source_data = tl.Instance(UnitsDataArray)
    _evaluated_coordinates = tl.Instance(Coordinates)

    # this adds a more helpful error message if a user happens to inspect _interpolation before evaluating
    @tl.default("_interpolation")
    def _default_interpolation(self):
        self._set_interpolation()
        return self._interpolation

    @tl.default("cache_output")
    def _cache_output_default(self):
        return settings["CACHE_NODE_OUTPUT_DEFAULT"]

    # ------------------------------------------------------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def interpolation_class(self):
        """Get the interpolation class currently set for this data source.

        The DataSource ``interpolation`` property is used to define the
        :class:`podpac.data.InterpolationManager` class that will handle interpolation for requested coordinates.

        Returns
        -------
        :class:`podpac.data.InterpolationManager`
            InterpolationManager class defined by DataSource `interpolation` definition
        """

        return self._interpolation

    @property
    def interpolators(self):
        """Return the interpolators selected for the previous node evaluation interpolation.
        If the node has not been evaluated, or if interpolation was not necessary, this will return
        an empty OrderedDict

        Returns
        -------
        OrderedDict
            Keys are tuples of unstacked dimensions; each value is the interpolator used to interpolate those dimensions
        """

        if self._interpolation._last_interpolator_queue is not None:
            return self._interpolation._last_interpolator_queue
        else:
            return OrderedDict()

    def _set_interpolation(self):
        """Update _interpolation property"""

        # define interpolator with source coordinates dimensions
        if isinstance(self.interpolation, InterpolationManager):
            self._interpolation = self.interpolation
        else:
            self._interpolation = InterpolationManager(self.interpolation)

    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this node using the supplied coordinates.

        The coordinates are mapped to the requested coordinates, interpolated if necessary, and set to
        `_requested_source_coordinates` with associated index `_requested_source_coordinates_index`. The requested
        source coordinates and index are passed to `get_data()`, returning the source data at those
        coordinates, which is set to `_requested_source_data`. Finally, `_requested_source_data` is interpolated
        using the `interpolate` method and set to the `output` attribute of the node.


        Parameters
        ----------
        coordinates : :class:`podpac.Coordinates`
            {requested_coordinates}

            An exception is raised if the requested coordinates are missing dimensions in the DataSource.
            Extra dimensions in the requested coordinates are dropped.
        output : :class:`podpac.UnitsDataArray`, optional
            {eval_output}
        _selector :
            {eval_selector}

        Returns
        -------
        {eval_return}

        Raises
        ------
        ValueError
            Cannot evaluate these coordinates
        """

        _logger.debug("Evaluating {} data source".format(self.__class__.__name__))

        # store requested coordinates for debugging
        if settings["DEBUG"]:
            self._original_requested_coordinates = coordinates

        # store input coordinates to evaluated coordinates
        self._evaluated_coordinates = deepcopy(coordinates)

        # reset interpolation
        self._set_interpolation()

        selector = self._interpolation.select_coordinates

        source_out = self._source_eval(self._evaluated_coordinates, selector)
        source_coords = Coordinates.from_xarray(source_out.coords, crs=source_out.crs)

        # Drop extra coordinates
        extra_dims = [d for d in coordinates.udims if d not in source_coords.udims]
        coordinates = coordinates.drop(extra_dims)

        # Transform so that interpolation happens on the source data coordinate system
        if source_coords.crs.lower() != coordinates.crs.lower():
            coordinates = coordinates.transform(source_coords.crs)

        if output is None:
            if "output" in source_out.dims:
                self.set_trait("outputs", source_out.coords["output"].data.tolist())
            output = self.create_output_array(coordinates)

        if source_out.size == 0:  # short cut
            return output

        # interpolate data into output
        output = self._interpolation.interpolate(source_coords, source_out, coordinates, output)

        # if the requested crs differs from the coordinates' crs,
        # fabricate a new output with the original coordinates and new values
        if self._evaluated_coordinates.crs != coordinates.crs:
            output = self.create_output_array(self._evaluated_coordinates.drop(extra_dims), data=output[:].values)

        # save output to private for debugging
        if settings["DEBUG"]:
            self._output = output
            self._source_xr = source_out

        return output

    def _source_eval(self, coordinates, selector, output=None):
        if isinstance(self._source_xr, UnitsDataArray):
            return self._source_xr
        else:
            return self.source.eval(coordinates, output=output, _selector=selector)

    def find_coordinates(self):
        """
        Get the available coordinates for the Node. For a DataSource, this is just the coordinates.

        Returns
        -------
        coords_list : list
            singleton list containing the coordinates (Coordinates object)
        """

        return self.source.find_coordinates()
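A runnable sketch of the shortcut-string form (the in-memory source is illustrative):

import numpy as np
import podpac

coarse = podpac.Coordinates(
    [podpac.clinspace(45, 40, 6), podpac.clinspace(-105, -100, 6)],
    dims=["lat", "lon"],
)
source = podpac.data.Array(source=np.random.rand(6, 6), coordinates=coarse)

# "bilinear" applies to all dimensions; see the docstring above for the
# per-dimension dict form.
node = Interpolate(source=source, interpolation="bilinear")
out = node.eval(
    podpac.Coordinates(
        [podpac.clinspace(45, 40, 60), podpac.clinspace(-105, -100, 60)],
        dims=["lat", "lon"],
    )
)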
Example #15
class Reproject(Interpolate):
    """
    Create an Algorithm that evaluates a Node with one set of coordinates, and then interpolates it.
    This can be used to bilinearly interpolate an averaged dataset, for example.

    Attributes
    ----------
    source : Node
        The source node. This node will use its own specified interpolation scheme.
    interpolation : str
        Type of interpolation method to use for the interpolation
    coordinates : Coordinates, Node, str, dict
        Coordinates used to evaluate the source. These can be specified as a dictionary, json-formatted string,
        PODPAC Coordinates, or a PODPAC Node, where the node MUST implement the 'coordinates' attribute.
    reproject_dims : list
        Dimensions to reproject. The source will be evaluated with the reprojection coordinates in these dims
        and the requested coordinates for any other dims.
    """

    coordinates = tl.Union(
        [NodeTrait(), tl.Dict(), tl.Unicode(), tl.Instance(Coordinates)],
        help="""Coordinates used to evaluate the source. These can be specified as a dictionary,
            json-formatted string, PODPAC Coordinates, or a PODPAC Node, where the node MUST implement
            the 'coordinates' attribute.""",
    ).tag(attr=True)

    reproject_dims = tl.List(trait=tl.Unicode(),
                             allow_none=True,
                             default_value=None).tag(attr=True)

    @tl.validate("coordinates")
    def _validate_coordinates(self, d):
        val = d["value"]
        if isinstance(val, Node):
            if not hasattr(val, "coordinates"):
                raise ValueError(
                    "When specifying the coordinates as a PODPAC Node, this Node must have a 'coordinates' attribute"
                )
        elif isinstance(val, dict):
            Coordinates.from_definition(self.coordinates)
        elif isinstance(val, string_types):
            Coordinates.from_json(self.coordinates)
        return val

    @cached_property
    def reprojection_coordinates(self):
        # get coordinates
        if isinstance(self.coordinates, Coordinates):
            coordinates = self.coordinates
        elif isinstance(self.coordinates, Node):
            coordinates = self.coordinates.coordinates
        elif isinstance(self.coordinates, dict):
            coordinates = Coordinates.from_definition(self.coordinates)
        elif isinstance(self.coordinates, string_types):
            coordinates = Coordinates.from_json(self.coordinates)

        # drop non-reprojection dims
        if self.reproject_dims is not None:
            coordinates = coordinates.drop(
                [dim for dim in coordinates if dim not in self.reproject_dims])

        return coordinates

    def _source_eval(self, coordinates, selector, output=None):
        coords = self.reprojection_coordinates.intersect(coordinates,
                                                         outer=True)
        extra_eval_coords = coordinates.drop(
            self.reproject_dims or self.reprojection_coordinates.dims)
        if coords.crs != coordinates.crs:
            # Better to evaluate in reproject coordinate crs than eval crs for next step of interpolation
            extra_eval_coords = extra_eval_coords.transform(coords.crs)
        coords = merge_dims([coords, extra_eval_coords])
        return self.source.eval(coords, output=output, _selector=selector)

    @property
    def base_ref(self):
        return "{}_reprojected".format(self.source.base_ref)
Example #16
class ModifyCoordinates(UnaryAlgorithm):
    """
    Base class for nodes that modify the requested coordinates before evaluation.

    Attributes
    ----------
    source : podpac.Node
        Source node that will be evaluated with the modified coordinates.
    coordinates_source : podpac.Node
        Node that supplies the available coordinates when necessary, optional. The source node is used by default.
    lat, lon, time, alt : List
        Modification parameters for given dimension. Varies by node.
    """

    coordinates_source = NodeTrait().tag(attr=True)
    lat = tl.List().tag(attr=True)
    lon = tl.List().tag(attr=True)
    time = tl.List().tag(attr=True)
    alt = tl.List().tag(attr=True)
    substitute_eval_coords = tl.Bool(False).tag(attr=True)

    _modified_coordinates = tl.Instance(Coordinates, allow_none=True)

    @tl.default("coordinates_source")
    def _default_coordinates_source(self):
        return self.source

    @common_doc(COMMON_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this nodes using the supplied coordinates.

        Parameters
        ----------
        coordinates : podpac.Coordinates
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector: callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}

        Notes
        -------
        The input coordinates are modified and then passed to the base class implementation of eval.
        """

        self._requested_coordinates = coordinates
        self._modified_coordinates = Coordinates(
            [
                self.get_modified_coordinates1d(coordinates, dim)
                for dim in coordinates.dims
            ],
            crs=coordinates.crs,
            validate_crs=False,
        )

        for dim in self._modified_coordinates.udims:
            if self._modified_coordinates[dim].size == 0:
                raise ValueError(
                    "Modified coordinates do not intersect with source data (dim '%s')"
                    % dim)

        outputs = {}
        outputs["source"] = self.source.eval(self._modified_coordinates,
                                             output=output,
                                             _selector=_selector)

        if self.substitute_eval_coords:
            dims = outputs["source"].dims
            coords = self._requested_coordinates
            extra_dims = [d for d in coords.dims if d not in dims]
            coords = coords.drop(extra_dims)

            outputs["source"] = outputs["source"].assign_coords(
                **coords.xcoords)

        if output is None:
            output = outputs["source"]
        else:
            output[:] = outputs["source"]

        if settings["DEBUG"]:
            self._output = output
        return output
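Subclasses implement `get_modified_coordinates1d`; a hypothetical sketch that widens the requested time window by one day (podpac's ExpandCoordinates node works along these lines):

import numpy as np
import podpac

class PadTime(ModifyCoordinates):
    """Hypothetical node: widen the requested time window by one day."""

    def get_modified_coordinates1d(self, coords, dim):
        c = coords[dim]
        if dim != "time":
            return c  # leave other dimensions untouched
        start = c.coordinates[0] - np.timedelta64(1, "D")
        stop = c.coordinates[-1] + np.timedelta64(1, "D")
        return podpac.crange(start, stop, "1,D")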
Example #17
class ResampleReduce(UnaryAlgorithm):
    """
    Resample a time-dependent source node along the time dimension using a statistical reduce operation.

    Attributes
    ----------
    custom_reduce_fn : function
        required if reduce_fn is 'custom'.
    resample : str
        Resampling frequency passed to xarray's ``resample(time=...)``, e.g. a pandas offset alias such as '1M'.
    reduce_fn : str
        builtin xarray groupby reduce function, or 'custom'.
    source : podpac.Node
        Source node
    """

    _repr_keys = ["source", "resample", "reduce_fn"]
    coordinates_source = NodeTrait(allow_none=True).tag(attr=True)

    # see https://github.com/pydata/xarray/blob/eeb109d9181c84dfb93356c5f14045d839ee64cb/xarray/core/accessors.py#L61
    resample = tl.Unicode().tag(attr=True)
    reduce_fn = tl.CaselessStrEnum(_REDUCE_FUNCTIONS).tag(attr=True)
    custom_reduce_fn = tl.Any(allow_none=True, default_value=None).tag(attr=True)

    _source_coordinates = tl.Instance(Coordinates)

    @tl.default("coordinates_source")
    def _default_coordinates_source(self):
        return self.source

    @common_doc(COMMON_DOC)
    def _eval(self, coordinates, output=None, _selector=None):
        """Evaluates this nodes using the supplied coordinates.

        Parameters
        ----------
        coordinates : podpac.Coordinates
            {requested_coordinates}
        output : podpac.UnitsDataArray, optional
            {eval_output}
        _selector: callable(coordinates, request_coordinates)
            {eval_selector}

        Returns
        -------
        {eval_return}

        Raises
        ------
        ValueError
            If source is not time-dependent (required by this node).
        """

        source_output = self.source.eval(coordinates, _selector=_selector)

        # group
        grouped = source_output.resample(time=self.resample)

        # reduce
        if self.reduce_fn == "custom":
            out = grouped.reduce(self.custom_reduce_fn)
        else:
            # standard, e.g. grouped.median('time')
            out = getattr(grouped, self.reduce_fn)()

        if output is None:
            output = podpac.UnitsDataArray(out)
            output.attrs = source_output.attrs
        else:
            output.data[:] = out.data[:]

        ## map
        # eval_time = xr.DataArray(coordinates.coords["time"])
        # E = getattr(eval_time.dt, self.groupby)
        # out = out.sel(**{self.groupby: E}).rename({self.groupby: "time"})
        # output[:] = out.transpose(*output.dims).data

        return output

    @property
    def base_ref(self):
        """
        Default node reference/name in node definitions

        Returns
        -------
        str
            Default node reference/name in node definitions
        """
        return "%s.%s.%s" % (self.source.base_ref, self.resample, self.reduce_fn)