Example #1
 def broadcast_apply(cls, axis, apply_func, left, right):
     client = _get_global_client()
     right_parts = np.squeeze(right)
     if len(right_parts.shape) == 0:
         right_parts = np.array([right_parts.item()])
     assert (
         len(right_parts.shape) == 1
     ), "Invalid broadcast partitions shape {}\n{}".format(
         right_parts.shape, [[i.get() for i in j] for j in right_parts]
     )
     return np.array(
         [
             [
                 PandasOnDaskFramePartition(
                     client.submit(
                         deploy_func,
                         part.future,
                         right_parts[col_idx].future
                         if axis
                         else right_parts[row_idx].future,
                         apply_func,
                         part.call_queue,
                         right_parts[col_idx].call_queue
                         if axis
                         else right_parts[row_idx].call_queue,
                         pure=False,
                     )
                 )
                 for col_idx, part in enumerate(left[row_idx])
             ]
             for row_idx in range(len(left))
         ]
     )
Example #2
 def deploy(cls, func, num_returns, kwargs):
     client = _get_global_client()
     remote_task_future = client.submit(func, **kwargs)
     return [
         client.submit(lambda l, i: l[i], remote_task_future, i)
         for i in range(num_returns)
     ]
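The fan-out pattern above, submitting one task and then splitting its list result into per-element futures, can be sketched in isolation. A minimal, hypothetical illustration (the make_parts function and the in-process client are assumptions for the sketch, not part of the original module):

from distributed import Client

client = Client(processes=False)  # small in-process cluster for the sketch

def make_parts():
    # stands in for any deployed function that returns several results
    return ["a", "b", "c"]

whole = client.submit(make_parts)
# one future per element; i is passed as an explicit argument so each
# task indexes into the list on the worker
parts = [client.submit(lambda l, i: l[i], whole, i) for i in range(3)]
print(client.gather(parts))  # ['a', 'b', 'c']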
Example #3
    def get_indices(cls, axis, partitions, index_func):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis along which to extract the labels (0 - index, 1 - columns).
            partitions: The array of partitions from which the labels need to
                be extracted.
            index_func: The function to be used to extract the labels.

        Returns:
            A Pandas Index object.
        """
        client = _get_global_client()
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = (
                [idx.apply(func).future for idx in partitions.T[0]]
                if len(partitions.T)
                else []
            )
        else:
            new_idx = (
                [idx.apply(func).future for idx in partitions[0]]
                if len(partitions)
                else []
            )
        new_idx = client.gather(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
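The final line leans on pandas.Index.append accepting a list of indices, so the gathered per-partition labels collapse into a single global Index. A standalone check with invented sample indices:

import pandas

parts = [pandas.Index([0, 1]), pandas.Index([2, 3]), pandas.Index([4])]
full = parts[0].append(parts[1:])  # append accepts a list of Index objects
print(list(full))  # [0, 1, 2, 3, 4]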
Example #4
    def data_array(self):
        """ Lazy load model/analysis data into memory. 
        """

        client = _get_global_client()

        xr_data = xr.open_mfdataset(self.path_to_files,
                                    chunks=self.chunks,
                                    parallel=True)

        if not all(x in list(xr_data.coords) for x in self.DIMS):
            xr_data = xr_data.rename({
                'latitude': 'lat',
                'longitude': 'lon',
            })

        if isinstance(xr_data.time.values[0], cftime._cftime.DatetimeNoLeap):
            datetime_index = xr_data.indexes['time'].to_datetimeindex()
            xr_data['time'] = datetime_index

        if self.subset_dict is not None:
            xr_data = self.cut(xr_data)
            print('Cut data')

        if self.season is not None:
            xr_data = xr_data.where(xr_data.time.dt.season == self.season,
                                    drop=True)

        if self.rescale_longitude is True:
            xr_data = xr_data.assign_coords(lon=(((xr_data.lon + 180) % 360) -
                                                 180)).sortby('lon')

        return xr_data[self.DIMS + [self.temp_var]].squeeze()
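The longitude rescaling above, ((lon + 180) % 360) - 180, maps a 0..360 longitude convention onto -180..180. A quick standalone check of the arithmetic with made-up values:

import numpy as np

lon = np.array([0, 90, 180, 270, 359])
print(((lon + 180) % 360) - 180)  # [   0   90 -180  -90   -1]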
Example #5
 def __init__(self, future, length=None, width=None, call_queue=None):
     self.future = future
     if call_queue is None:
         call_queue = []
     self.call_queue = call_queue
     self._length_cache = length
     self._width_cache = width
     self.client = _get_global_client()
Example #6
def _get_client(client):
    if client is None:
        return _get_global_client()
    elif isinstance(client, Client):
        return client
    else:
        # e.g., connection string
        return Client(client)
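Callers can then treat all three cases uniformly: None falls back to the global client, an existing Client is passed through, and anything else is handed to the Client constructor. A hedged sketch (assuming _get_client and its imports from the example above, and that the in-process client created here is the only one in the session; the address in the comment is a placeholder):

from distributed import Client

existing = Client(processes=False)

assert _get_client(existing) is existing  # passed through unchanged
assert _get_client(None) is existing      # falls back to the global client
# _get_client("tcp://127.0.0.1:8786")     # would open a new connection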
Example #7
 def _determine_worker(self):
     try:
         get_worker()
         self.worker = True
         self.fs = filesystem(self.protocol, **self.storage_options)
     except ValueError:
         self.worker = False
         self.client = _get_global_client()
         self.rfs = dask.delayed(self)
Example #8
    def put(cls, obj):
        """A factory classmethod to format a given object.

        Args:
            obj: An object.

        Returns:
            A `RemotePartitions` object.
        """
        client = _get_global_client()
        return cls(client.scatter(obj))
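client.scatter ships the object into worker memory once and returns a future, so downstream tasks move a small reference rather than re-serializing the data on every submit. A minimal sketch of the same pattern outside the class (toy DataFrame and in-process client are assumptions for illustration):

import pandas
from distributed import Client

client = Client(processes=False)
df = pandas.DataFrame({"a": [1, 2, 3]})

future = client.scatter(df)                       # data now lives on a worker
length = client.submit(lambda d: len(d), future)  # only the reference travels
print(length.result())  # 3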
Example #9
    def _column_widths(self):
        """Compute the column widths if they are not cached.

        Returns:
            A list of column widths.
        """
        client = _get_global_client()
        if self._column_widths_cache is None:
            self._column_widths_cache = client.gather([
                obj.apply(lambda df: len(df.columns)).future
                for obj in self._partitions[0]
            ])
        return self._column_widths_cache
Example #10
    def _row_lengths(self):
        """Compute the row lengths if they are not cached.

        Returns:
            A list of row lengths.
        """
        client = _get_global_client()
        if self._row_lengths_cache is None:
            self._row_lengths_cache = client.gather([
                obj.apply(lambda df: len(df)).future
                for obj in self._partitions.T[0]
            ])
        return self._row_lengths_cache
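This property and _column_widths above both resolve their per-partition futures with one client.gather call, paying a single scheduler round trip instead of blocking on each future in turn. The batching behavior in isolation (toy futures, assumed in-process client):

from distributed import Client

client = Client(processes=False)
futures = [client.submit(pow, i, 2) for i in range(4)]
print(client.gather(futures))  # [0, 1, 4, 9], resolved together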
Example #11
 def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                      maintain_partitioning, *partitions):
     client = _get_global_client()
     axis_result = client.submit(PandasFrameAxisPartition.deploy_axis_func,
                                 axis, func, num_splits, kwargs,
                                 maintain_partitioning, *partitions)
     if num_splits == 1:
         return axis_result
     # We have to do this to split it back up. It is already split, but we need to
     # get futures for each.
     return [
         client.submit(lambda l, i: l[i], axis_result, i)
         for i in range(num_splits)
     ]
Example #12
 def deploy_func_between_two_axis_partitions(
     cls, axis, func, num_splits, len_of_left, kwargs, *partitions
 ):
     client = _get_global_client()
     axis_result = client.submit(
         PandasFrameAxisPartition.deploy_func_between_two_axis_partitions,
         axis,
         func,
         num_splits,
         len_of_left,
         kwargs,
         *partitions
     )
     # We have to do this to split it back up. It is already split, but we need to
     # get futures for each.
     return [client.submit(lambda l, i: l[i], axis_result, i) for i in range(num_splits)]
Example #13
    def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"):
        def mapper(df, others):
            other = pandas.concat(others, axis=axis ^ 1)
            return apply_func(df, **{other_name: other})

        client = _get_global_client()
        return np.array([[
            PandasOnDaskFramePartition(
                client.submit(
                    deploy_func,
                    part.future,
                    mapper,
                    part.call_queue,
                    [obj[col_idx].call_queue for obj in right]
                    if axis else [obj.call_queue for obj in right[row_idx]],
                    *([obj[col_idx].future for obj in right]
                      if axis else [obj.future for obj in right[row_idx]]),
                    pure=False,
                )) for col_idx, part in enumerate(left[row_idx])
        ] for row_idx in range(len(left))])
Example #14
    def get_indices(cls, axis, partitions, index_func):
        """
        This gets the internal indices stored in the partitions.

        Parameters
        ----------
            axis : 0 or 1
                The axis along which to extract the labels (0 - index, 1 - columns).
            partitions : NumPy array
                The array of partitions from which the labels need to be extracted.
            index_func : callable
                The function to be used to extract the labels.

        Returns
        -------
        Index
            A Pandas Index object.

        Notes
        -----
        These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.
        """
        client = _get_global_client()
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = ([idx.apply(func).future for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func).future
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = client.gather(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example #15
    def dataset(self):

        client = _get_global_client()
        if client is None:
            print('WARNING! No Dask client available in environment!')

        _full_dataset = xr.open_mfdataset(self.path_to_files,
                                          chunks=self.chunks,
                                          concat_dim='time',
                                          preprocess=self.preprocess_mf)
        self.year_range = np.unique(_full_dataset.time.dt.year)[[0, -1]]
        if self.season == 'DJF':
            try:
                _full_dataset['time'] = _full_dataset.indexes[
                    'time'].normalize()
            except AttributeError:
                _full_dataset['time'] = _full_dataset.indexes[
                    'time'].to_datetimeindex().normalize()
            _dataset = self.sel_winters(_full_dataset, *self.year_range)
            return _dataset
        elif self.season == 'all':
            return _full_dataset
        else:
            raise NotImplementedError
Example #16
if execution_engine == "Ray":
    try:
        if threading.current_thread().name == "MainThread":
            ray.init(
                redirect_output=True,
                include_webui=False,
                redirect_worker_output=True,
                ignore_reinit_error=True,
            )
            num_cpus = ray.global_state.cluster_resources()["CPU"]
    except AssertionError:
        pass
elif execution_engine == "Dask":
    from distributed.client import _get_global_client

    if threading.current_thread().name == "MainThread":
        # initialize the dask client
        client = _get_global_client()
        if client is None:
            from distributed import Client

            client = Client()
        num_cpus = sum(client.ncores().values())
elif execution_engine != "Python":
    raise ImportError(
        "Unrecognized execution engine: {}.".format(execution_engine))

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
    "read_csv",
Example #17
 def materialize(cls, future):
     client = _get_global_client()
     return client.gather(future)
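materialize is a thin blocking wrapper: client.gather pulls the future's value back into the calling process. A toy usage sketch (the local client and summing task are assumptions for illustration):

from distributed import Client

client = Client(processes=False)
fut = client.submit(sum, [1, 2, 3])
print(client.gather(fut))  # 6, which is what materialize would return for fut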