def broadcast_apply(cls, axis, apply_func, left, right):
    client = _get_global_client()
    right_parts = np.squeeze(right)
    if len(right_parts.shape) == 0:
        right_parts = np.array([right_parts.item()])
    assert (
        len(right_parts.shape) == 1
    ), "Invalid broadcast partitions shape {}\n{}".format(
        right_parts.shape, [[i.get() for i in j] for j in right_parts]
    )
    return np.array(
        [
            [
                PandasOnDaskFramePartition(
                    client.submit(
                        deploy_func,
                        part.future,
                        right_parts[col_idx].future
                        if axis
                        else right_parts[row_idx].future,
                        apply_func,
                        part.call_queue,
                        right_parts[col_idx].call_queue
                        if axis
                        else right_parts[row_idx].call_queue,
                        pure=False,
                    )
                )
                for col_idx, part in enumerate(left[row_idx])
            ]
            for row_idx in range(len(left))
        ]
    )
def deploy(cls, func, num_returns, kwargs):
    client = _get_global_client()
    remote_task_future = client.submit(func, **kwargs)
    return [
        client.submit(lambda l, i: l[i], remote_task_future, i)
        for i in range(num_returns)
    ]
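# A minimal usage sketch of the fan-out pattern in `deploy` above: a single
# remote call returns a list, and per-index tasks turn it into one future per
# element. The in-process client and `make_parts` are illustrative
# assumptions, not part of the original code.
from distributed import Client

def make_parts(n):
    # Stand-in for a remote task that returns several results at once.
    return list(range(n))

demo_client = Client(processes=False)
whole = demo_client.submit(make_parts, 3)
parts = [demo_client.submit(lambda l, i: l[i], whole, i) for i in range(3)]
print(demo_client.gather(parts))  # [0, 1, 2]
demo_client.close()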
def get_indices(cls, axis, partitions, index_func):
    """This gets the internal indices stored in the partitions.

    Note: These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.

    Args:
        axis: The axis to extract the labels from (0 - index, 1 - columns).
        partitions: The array of partitions from which to extract the labels.
        index_func: The function to be used to extract the labels.

    Returns:
        A Pandas Index object.
    """
    client = _get_global_client()
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    if axis == 0:
        # We grab the first column of blocks and extract the indices
        new_idx = (
            [idx.apply(func).future for idx in partitions.T[0]]
            if len(partitions.T)
            else []
        )
    else:
        new_idx = (
            [idx.apply(func).future for idx in partitions[0]]
            if len(partitions)
            else []
        )
    new_idx = client.gather(new_idx)
    return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
def data_array(self):
    """Lazy load model/analysis data into memory."""
    client = _get_global_client()
    xr_data = xr.open_mfdataset(
        self.path_to_files, chunks=self.chunks, parallel=True
    )
    if not all(x in list(xr_data.coords) for x in self.DIMS):
        xr_data = xr_data.rename({
            'latitude': 'lat',
            'longitude': 'lon',
        })
    if isinstance(xr_data.time.values[0], cftime._cftime.DatetimeNoLeap):
        datetime_index = xr_data.indexes['time'].to_datetimeindex()
        xr_data['time'] = datetime_index
    if self.subset_dict is not None:
        xr_data = self.cut(xr_data)
        print('Cut data')
    if self.season is not None:
        xr_data = xr_data.where(
            xr_data.time.dt.season == self.season, drop=True
        )
    if self.rescale_longitude is True:
        xr_data = xr_data.assign_coords(
            lon=(((xr_data.lon + 180) % 360) - 180)
        ).sortby('lon')
    return xr_data[self.DIMS + [self.temp_var]].squeeze()
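# A quick standalone check of the longitude rescale used above: the
# expression (((lon + 180) % 360) - 180) maps coordinates from [0, 360)
# into [-180, 180). Illustrative only.
import numpy as np

lon = np.array([0.0, 90.0, 180.0, 270.0, 359.0])
print(((lon + 180) % 360) - 180)  # [   0.   90. -180.  -90.   -1.]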
def __init__(self, future, length=None, width=None, call_queue=None):
    self.future = future
    if call_queue is None:
        call_queue = []
    self.call_queue = call_queue
    self._length_cache = length
    self._width_cache = width
    self.client = _get_global_client()
def _get_client(client):
    if client is None:
        return _get_global_client()
    elif isinstance(client, Client):
        return client
    else:
        # e.g., connection string
        return Client(client)
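# A hedged usage sketch of `_get_client` covering its three accepted inputs;
# the tcp:// address is a placeholder, not a real endpoint.
from distributed import Client
from distributed.client import _get_global_client

existing = Client(processes=False)
assert _get_client(existing) is existing          # a Client instance passes through
assert _get_client(None) is _get_global_client()  # None resolves to the global client
# _get_client("tcp://scheduler:8786")             # a string opens a new Client
existing.close()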
def _determine_worker(self):
    try:
        get_worker()
        self.worker = True
        self.fs = filesystem(self.protocol, **self.storage_options)
    except ValueError:
        self.worker = False
        self.client = _get_global_client()
        self.rfs = dask.delayed(self)
def put(cls, obj):
    """A factory classmethod to format a given object.

    Args:
        obj: An object.

    Returns:
        A `RemotePartitions` object.
    """
    client = _get_global_client()
    return cls(client.scatter(obj))
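# A minimal sketch of `put` in use. Calling it on `PandasOnDaskFramePartition`
# (as in the neighboring snippets) is an assumption; any subclass whose
# constructor accepts a future would behave the same way.
import pandas

df = pandas.DataFrame({"a": [1, 2, 3]})
partition = PandasOnDaskFramePartition.put(df)
# `partition.future` now holds a Dask future pointing at the scattered frame.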
def _column_widths(self):
    """Compute the column widths if they are not cached.

    Returns:
        A list of column widths.
    """
    client = _get_global_client()
    if self._column_widths_cache is None:
        self._column_widths_cache = client.gather([
            obj.apply(lambda df: len(df.columns)).future
            for obj in self._partitions[0]
        ])
    return self._column_widths_cache
def _row_lengths(self):
    """Compute the row lengths if they are not cached.

    Returns:
        A list of row lengths.
    """
    client = _get_global_client()
    if self._row_lengths_cache is None:
        self._row_lengths_cache = client.gather([
            obj.apply(lambda df: len(df)).future
            for obj in self._partitions.T[0]
        ])
    return self._row_lengths_cache
def deploy_axis_func(
    cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    client = _get_global_client()
    axis_result = client.submit(
        PandasFrameAxisPartition.deploy_axis_func,
        axis,
        func,
        num_splits,
        kwargs,
        maintain_partitioning,
        *partitions
    )
    if num_splits == 1:
        return axis_result
    # We have to do this to split it back up. It is already split, but we
    # need to get futures for each. Pass `i` as an argument so each task is
    # pinned to its own index instead of closing over the loop variable.
    return [
        client.submit(lambda l, i: l[i], axis_result, i)
        for i in range(num_splits)
    ]
def deploy_func_between_two_axis_partitions(
    cls, axis, func, num_splits, len_of_left, kwargs, *partitions
):
    client = _get_global_client()
    axis_result = client.submit(
        PandasFrameAxisPartition.deploy_func_between_two_axis_partitions,
        axis,
        func,
        num_splits,
        len_of_left,
        kwargs,
        *partitions
    )
    # We have to do this to split it back up. It is already split, but we
    # need to get futures for each. As above, pass `i` as an argument to
    # pin the index per task.
    return [
        client.submit(lambda l, i: l[i], axis_result, i)
        for i in range(num_splits)
    ]
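# Why `i` is passed as an argument in the two deploy functions above: a
# closure over the loop variable is fragile under Python's late binding.
# A pure-Python illustration of the pitfall, independent of Dask:
late = [lambda l: l[i] for i in range(3)]
print([f("abc") for f in late])   # ['c', 'c', 'c'] -- every lambda sees the final i
bound = [lambda l, i=i: l[i] for i in range(3)]
print([f("abc") for f in bound])  # ['a', 'b', 'c'] -- the index is pinned per lambda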
def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"):
    def mapper(df, others):
        other = pandas.concat(others, axis=axis ^ 1)
        return apply_func(df, **{other_name: other})

    client = _get_global_client()
    return np.array(
        [
            [
                PandasOnDaskFramePartition(
                    client.submit(
                        deploy_func,
                        part.future,
                        mapper,
                        part.call_queue,
                        [obj[col_idx].call_queue for obj in right]
                        if axis
                        else [obj.call_queue for obj in right[row_idx]],
                        *(
                            [obj[col_idx].future for obj in right]
                            if axis
                            else [obj.future for obj in right[row_idx]]
                        ),
                        pure=False,
                    )
                )
                for col_idx, part in enumerate(left[row_idx])
            ]
            for row_idx in range(len(left))
        ]
    )
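# A small check of the `axis ^ 1` trick in `mapper` above: XOR with 1 flips
# the axis, so the broadcast pieces are concatenated along the opposite axis
# of the apply. Illustrative only.
import pandas

pieces = [pandas.DataFrame({"x": [1]}), pandas.DataFrame({"x": [2]})]
print(pandas.concat(pieces, axis=0 ^ 1).shape)  # (1, 2): axis 0 flips to columns
print(pandas.concat(pieces, axis=1 ^ 1).shape)  # (2, 1): axis 1 flips to rows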
def get_indices(cls, axis, partitions, index_func):
    """
    Get the internal indices stored in the partitions.

    Parameters
    ----------
    axis : 0 or 1
        The axis to extract the labels from (0 - index, 1 - columns).
    partitions : NumPy array
        The array of partitions from which to extract the labels.
    index_func : callable
        The function to be used to extract the labels.

    Returns
    -------
    Index
        A Pandas Index object.

    Notes
    -----
    These are the global indices of the object. This is mostly useful
    when you have deleted rows/columns internally, but do not know
    which ones were deleted.
    """
    client = _get_global_client()
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    if axis == 0:
        # We grab the first column of blocks and extract the indices
        new_idx = (
            [idx.apply(func).future for idx in partitions.T[0]]
            if len(partitions.T)
            else []
        )
    else:
        new_idx = (
            [idx.apply(func).future for idx in partitions[0]]
            if len(partitions)
            else []
        )
    new_idx = client.gather(new_idx)
    return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
def dataset(self):
    client = _get_global_client()
    if client is None:
        print('WARNING! No Dask client available in environment!')
    _full_dataset = xr.open_mfdataset(
        self.path_to_files,
        chunks=self.chunks,
        concat_dim='time',
        preprocess=self.preprocess_mf,
    )
    self.year_range = np.unique(_full_dataset.time.dt.year)[[0, -1]]
    if self.season == 'DJF':
        try:
            _full_dataset['time'] = _full_dataset.indexes['time'].normalize()
        except AttributeError:
            _full_dataset['time'] = (
                _full_dataset.indexes['time'].to_datetimeindex().normalize()
            )
        _dataset = self.sel_winters(_full_dataset, *self.year_range)
        return _dataset
    elif self.season == 'all':
        return _full_dataset
    else:
        raise NotImplementedError
    try:
        if threading.current_thread().name == "MainThread":
            ray.init(
                redirect_output=True,
                include_webui=False,
                redirect_worker_output=True,
                ignore_reinit_error=True,
            )
            num_cpus = ray.global_state.cluster_resources()["CPU"]
    except AssertionError:
        pass
elif execution_engine == "Dask":
    from distributed.client import _get_global_client

    if threading.current_thread().name == "MainThread":
        # initialize the dask client
        client = _get_global_client()
        if client is None:
            from distributed import Client

            client = Client()
        num_cpus = sum(client.ncores().values())
elif execution_engine != "Python":
    raise ImportError(
        "Unrecognized execution engine: {}.".format(execution_engine)
    )

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
    "read_csv",
def materialize(cls, future):
    client = _get_global_client()
    return client.gather(future)
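# Usage sketch for `materialize`: it simply blocks on a future and returns
# its value. Calling it via `PandasOnDaskFramePartition` is an assumption
# based on the neighboring snippets.
from distributed.client import _get_global_client

client = _get_global_client()
fut = client.submit(lambda: 42)
assert PandasOnDaskFramePartition.materialize(fut) == 42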