def get_block_addresses(self, grid: ArrayGrid):
    """
    Build a mapping from each entry of ``grid`` to a node resource key.

    With a ``BlockCyclicScheduler`` the node is looked up in the scheduler's
    cluster grid; with a ``TaskScheduler`` nodes returned by ``self.nodes()``
    are assigned in round-robin order. For any other scheduler type the
    returned mapping is empty.

    :param grid: The ArrayGrid whose entries are to be placed.
    :return: A dict keyed by grid entry, valued by the node's resource key.
    """
    def _node_resource_key(node_info):
        # Exactly one key of the node's "Resources" is expected to
        # contain the substring "node".
        matches = [key for key in node_info["Resources"].keys() if "node" in key]
        assert len(matches) == 1
        return matches[0]

    addresses: dict = {}
    active_scheduler = self.scheduler
    if isinstance(active_scheduler, BlockCyclicScheduler):
        for grid_entry in grid.get_entry_iterator():
            cluster_entry = active_scheduler.get_cluster_entry(grid_entry)
            addresses[grid_entry] = _node_resource_key(
                active_scheduler.cluster_grid[cluster_entry])
    elif isinstance(active_scheduler, TaskScheduler):
        # Just do round-robin over nodes.
        nodes = self.nodes()
        cursor = 0
        for grid_entry in grid.get_entry_iterator():
            addresses[grid_entry] = _node_resource_key(nodes[cursor])
            cursor = (cursor + 1) % len(nodes)
    return addresses
def _group_index_lists_by_block(self, dst_slice_tuples,
                                src_grid: ArrayGrid, dst_index_list,
                                src_index_list):
    """
    Group (dst, src) index pairs by the source block containing the src index.

    Each pair is stored relative to the start of the destination slice and
    the start of the source block respectively, cast to the smallest
    unsigned integer type that fits.

    :param dst_slice_tuples: Per-axis (start, stop) tuples of the destination.
    :param src_grid: Grid describing the source blocks.
    :param dst_index_list: Destination indices, aligned with src_index_list.
    :param src_index_list: Source indices.
    :return: Dict mapping source grid entries to lists of
        (relative dst index, relative src index) pairs; entries without any
        matching index are omitted.
    :raises Exception: If no supported unsigned type can hold the indices.
    """
    # TODO(hme): Keep this function here until it's needed for greater support of
    #  selection/assignment operations.
    dst_bounds = np.array(dst_slice_tuples).T
    dst_indices = np.array(dst_index_list)
    src_indices = np.array(src_index_list)

    # Pick the smallest type to represent indices.
    # A set of these indices may be transmitted over the network,
    # so we want to pick the smallest encoding possible.
    candidates = [(2**8, np.uint8),
                  (2**16, np.uint16),
                  (2**32, np.uint32),
                  (2**64, np.uint64)]
    for bound, candidate in candidates:
        fits_block = np.all(np.array(src_grid.block_shape) < bound)
        if fits_block and np.all(dst_bounds[1] < bound):
            index_type = candidate
            break
    else:
        raise Exception("Unable to encode block indices, blocks are too large.")

    # Block grid entries needed to write to given dst_slice_selection.
    src_blocks = {}
    for grid_entry in src_grid.get_entry_iterator():
        src_bounds = np.array(src_grid.get_slice_tuples(grid_entry)).T
        pairs = []
        for position, src_index in enumerate(src_indices):
            dst_index = dst_indices[position]
            inside = np.all((src_bounds[0] <= src_index) & (src_index < src_bounds[1]))
            if not inside:
                continue
            pairs.append(((dst_index - dst_bounds[0]).astype(index_type),
                          (src_index - src_bounds[0]).astype(index_type)))
        if len(pairs) > 0:
            src_blocks[grid_entry] = pairs
    return src_blocks
def from_oid(cls, oid, shape, dtype, system):
    """
    Wrap an existing object id in a BlockArray whose block shape is ``shape``.

    :param oid: Object id to attach to the array's block.
    :param shape: Shape of the array; also used as the block shape.
    :param dtype: A type exposing ``__name__`` (used as the grid dtype name).
    :param system: System handle passed to the BlockArray.
    :return: The new BlockArray.
    """
    grid = ArrayGrid(shape, shape, dtype.__name__)
    ba = BlockArray(grid, system)
    entries_seen = 0
    for grid_entry in grid.get_entry_iterator():
        # The grid is expected to yield a single entry.
        assert entries_seen == 0
        ba.blocks[grid_entry].oid = oid
        entries_seen += 1
    return ba
def from_np(cls, arr, block_shape, copy, system):
    """
    Build a SparseBlockArray from ``arr``, one CSR matrix per block.

    :param arr: Source array; sliced per grid entry.
    :param block_shape: Block shape of the resulting grid.
    :param copy: Not used by this implementation.
    :param system: System handle; ``system.put`` stores each block.
    :return: The populated SparseBlockArray.
    """
    dtype_name = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_name)
    sparse_arr = SparseBlockArray(grid, system)
    for grid_entry in grid.get_entry_iterator():
        csr_block = scipy.sparse.csr_matrix(arr[grid.get_slice(grid_entry)])
        sparse_arr.blocks[grid_entry].oid = system.put(csr_block)
        sparse_arr.blocks[grid_entry].dtype = getattr(np, dtype_name)
    return sparse_arr
def empty(cls, shape, block_shape, dtype, system):
    """
    Create a BlockArray whose blocks are produced by ``system.empty``.

    :param shape: Shape of the array.
    :param block_shape: Shape of each block.
    :param dtype: A type exposing ``__name__`` (used as the grid dtype name).
    :param system: System handle used to create the blocks.
    :return: The new BlockArray.
    """
    grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__)
    meta = grid.to_meta()
    result = BlockArray(grid, system)
    for grid_entry in grid.get_entry_iterator():
        placement = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        result.blocks[grid_entry].oid = system.empty(grid_entry, meta,
                                                     syskwargs=placement)
    return result
def from_np(cls, arr, block_shape, copy, system):
    """
    Build a BlockArray from ``arr`` by slicing it into blocks.

    :param arr: Source array; sliced per grid entry.
    :param block_shape: Block shape of the resulting grid.
    :param copy: If truthy, each slice is copied with ``np.copy`` before
        being handed to ``system.put``.
    :param system: System handle; ``system.put`` stores each block.
    :return: The populated BlockArray.
    """
    dtype_name = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_name)
    result = BlockArray(grid, system)
    for grid_entry in grid.get_entry_iterator():
        piece = arr[grid.get_slice(grid_entry)]
        payload = np.copy(piece) if copy else piece
        result.blocks[grid_entry].oid = system.put(payload)
        result.blocks[grid_entry].dtype = getattr(np, dtype_name)
    return result
def get_block_addresses(self, grid: ArrayGrid):
    """
    Assign each entry of ``grid`` to a node resource key in round-robin order.

    :param grid: The ArrayGrid whose entries are to be placed.
    :return: A dict keyed by grid entry, valued by the node's resource key.
    """
    nodes = self.nodes()
    addresses: dict = {}
    cursor = 0
    for grid_entry in grid.get_entry_iterator():
        resources = nodes[cursor]["Resources"]
        # Exactly one resource key is expected to contain "node".
        matches = [key for key in resources.keys() if "node" in key]
        assert len(matches) == 1
        addresses[grid_entry] = matches[0]
        cursor = (cursor + 1) % len(nodes)
    return addresses
def read_csv(self, filename, dtype=float, delimiter=',', has_header=False, num_workers=4):
    """
    Read a CSV file in ``num_workers`` byte-range batches.

    One "read_csv_block" call is issued per batch; each returns a block
    object id and a shape object id. Batches whose resolved shape has zero
    rows are skipped.

    :param filename: Path of the file to read.
    :param dtype: Element type forwarded to the block reader; its
        ``__name__`` is used as the grid dtype name.
    :param delimiter: Field delimiter forwarded to the block reader.
    :param has_header: Forwarded to the block reader.
    :param num_workers: Number of batches the file is split into.
    :return: A list of single-block BlockArrays, one per non-empty batch.
    """
    file_size = storage_utils.get_file_size(filename)
    file_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(
        file_size, num_workers)

    blocks = []
    shape_oids = []
    for i, (file_start, file_end) in enumerate(file_batches.batches):
        block_oid, shape_oid = self.system.call(
            "read_csv_block", filename, file_start, file_end,
            dtype, delimiter, has_header,
            syskwargs={
                "grid_entry": (i, ),
                "grid_shape": (num_workers, ),
                "options": {
                    "num_returns": 2
                }
            })
        blocks.append(block_oid)
        shape_oids.append(shape_oid)

    # Resolve all block shapes before building the arrays.
    shapes = self.system.get(shape_oids)
    arrays = []
    for i, shape in enumerate(shapes):
        if shape[0] == 0:
            continue
        block = blocks[i]
        grid = ArrayGrid(shape=shape, block_shape=shape, dtype=dtype.__name__)
        arr = BlockArray(grid, self.system)
        entry_seen = False
        for grid_entry in grid.get_entry_iterator():
            # The grid is expected to yield a single entry.
            assert not entry_seen
            entry_seen = True
            arr.blocks[grid_entry].oid = block
        arrays.append(arr)
    return arrays
def arange(self, shape, block_shape, step=1, dtype=np.int64) -> BlockArray:
    """
    Create a BlockArray of consecutive integers, one range per block.

    :param shape: Shape of the array.
    :param block_shape: Shape of each block; only the first axis is used to
        compute each block's start value.
    :param step: Must be 1.
    :param dtype: Element type forwarded to ``system.arange``.
    :return: The new BlockArray.
    """
    assert step == 1
    # Generate ranges per block.
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    result = BlockArray(grid, self.system)
    for grid_entry in grid.get_entry_iterator():
        first = block_shape[0] * grid_entry[0]
        last = first + grid.get_block_shape(grid_entry)[0]
        placement = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        result.blocks[grid_entry].oid = self.system.arange(
            first, last, step, dtype, syskwargs=placement)
    return result
def _block_map_bop(self, op_name: str, arr_a: BlockArray, arr_b: BlockArray) -> BlockArray:
    """
    Apply a binary system operation block-by-block to two arrays.

    The result takes its shape and block shape from ``arr_a``; blocks of
    ``arr_a`` and ``arr_b`` are paired by grid entry.

    :param op_name: Name of the attribute on ``self._system`` to call.
    :param arr_a: Left operand.
    :param arr_b: Right operand.
    :return: A BlockArray holding the per-block results.
    """
    out_shape = arr_a.shape
    out_block_shape = arr_a.block_shape
    # NOTE(review): the output dtype is resolved with the literal "log"
    # rather than op_name -- confirm this is intentional.
    out_dtype = array_utils.get_bop_output_type("log", arr_a.dtype, arr_b.dtype)
    assert len(out_shape) == len(out_block_shape)
    grid = ArrayGrid(out_shape, out_block_shape, out_dtype.__name__)
    result = BlockArray(grid, self._system)
    block_op = self._system.__getattribute__(op_name)
    for grid_entry in grid.get_entry_iterator():
        placement = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        result.blocks[grid_entry].oid = block_op(arr_a.blocks[grid_entry].oid,
                                                 arr_b.blocks[grid_entry].oid,
                                                 syskwargs=placement)
    return result
def loadtxt(self, fname, dtype=float, comments='# ', delimiter=' ',
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0, encoding='bytes', max_rows=None, num_workers=4) -> BlockArray:
    """
    Load a fixed-width text file into a BlockArray, split row-wise.

    The row count is derived from the file size, the per-row byte width
    and the size of comment lines / trailing newlines. Rows are then split
    into ``num_workers`` batches and each batch is loaded by
    ``self.loadtxt_block``.

    :param fname: Path of the text file.
    :param dtype: Element type; ``float`` is mapped to ``np.float64`` for
        the grid dtype name.
    :param comments: Comment marker, forwarded to the block loader.
    :param delimiter: Column delimiter, forwarded to the block loader.
    :param converters: Forwarded to the block loader.
    :param skiprows: Number of leading rows to skip.
    :param usecols: Forwarded to the block loader.
    :param unpack: Forwarded to the block loader.
    :param ndmin: Forwarded to the block loader.
    :param encoding: Forwarded to the block loader.
    :param max_rows: Optional upper bound on the number of rows loaded.
    :param num_workers: Number of row batches.
    :return: A BlockArray of shape (rows, columns).
    """
    # pylint: disable=unused-variable
    bytes_per_char, bytes_per_row, bytes_per_col, num_cols = storage_utils.get_np_txt_info(
        fname, comments, delimiter
    )
    chars_per_row = bytes_per_row // bytes_per_char
    assert np.allclose(float(chars_per_row), bytes_per_row / bytes_per_char)

    # Characters that do not belong to data rows.
    comment_lines, trailing_newlines = storage_utils.get_np_comments(fname, comments)
    nonrow_chars = trailing_newlines + sum(len(line) for line in comment_lines)

    file_size = storage_utils.get_file_size(fname)
    file_chars = file_size // bytes_per_char
    assert np.allclose(float(file_chars), file_size / bytes_per_char)
    row_chars = file_chars - nonrow_chars
    num_rows = row_chars // chars_per_row
    assert np.allclose(float(num_rows), float(row_chars / chars_per_row))

    num_rows_final = num_rows - skiprows
    if max_rows is not None:
        # Clamp to max_rows. (Previously this built a tuple, which broke
        # the batch and grid construction below.)
        num_rows_final = min(num_rows_final, max_rows)

    row_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(num_rows_final,
                                                                            num_workers)
    grid = ArrayGrid(shape=(num_rows_final, num_cols),
                     block_shape=(row_batches.batch_size, num_cols),
                     dtype=np.float64.__name__ if dtype is float else dtype.__name__)
    result: BlockArray = BlockArray(grid, system=self.system)
    for i, grid_entry in enumerate(grid.get_entry_iterator()):
        row_start, row_end = row_batches.batches[i]
        # NOTE(review): the extra +1 presumably skips a leading
        # header/comment line -- confirm against the file format.
        batch_skiprows = skiprows + row_start + 1
        batch_max_rows = grid.get_block_shape(grid_entry)[0]
        assert batch_max_rows == row_end - row_start
        result.blocks[grid_entry].oid = self.loadtxt_block(
            fname,
            dtype=dtype,
            comments=comments,
            delimiter=delimiter,
            converters=converters,
            skiprows=batch_skiprows,
            usecols=usecols,
            unpack=unpack,
            ndmin=ndmin,
            encoding=encoding,
            max_rows=batch_max_rows,
            syskwargs={
                "grid_entry": grid_entry,
                "grid_shape": grid.grid_shape
            }
        )
    return result
def eye(self, shape: tuple, block_shape: tuple, dtype: np.dtype = None):
    """
    Create a 2-D BlockArray with "eye" blocks on the block diagonal.

    Blocks whose grid entry has equal coordinates are created with the
    "eye" operation; all others are created with "zeros".

    :param shape: 2-tuple, shape of the array.
    :param block_shape: 2-tuple, shape of each block.
    :param dtype: Element type; defaults to ``np.float64``.
    :return: The new BlockArray.
    """
    assert len(shape) == len(block_shape) == 2
    if dtype is None:
        dtype = np.float64
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    meta = grid.to_meta()
    result = BlockArray(grid, self.system)
    for grid_entry in grid.get_entry_iterator():
        placement = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        # A diagonal block has identical coordinates along every axis.
        on_diagonal = np.all(np.diff(grid_entry) == 0)
        block_kind = "eye" if on_diagonal else "zeros"
        result.blocks[grid_entry].oid = self.system.new_block(
            block_kind, grid_entry, meta, syskwargs=placement)
    return result
def _simple_reshape(self, arr, shape, block_shape):
    """
    Reshape ``arr`` by reshaping its array of blocks and each block.

    This is only used when the difference in shape are factors of 1s,
    and the ordering of other factors are maintained.

    :param arr: The BlockArray to reshape.
    :param shape: Target shape.
    :param block_shape: Target block shape.
    :return: A new BlockArray with the target shape.
    """
    # Check assumptions.
    assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape))

    # Create new grid, and perform reshape on blocks
    # to simplify access to source blocks.
    grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__)
    reshaped_sources = arr.blocks.reshape(grid.grid_shape)
    result = BlockArray(grid, arr.system)
    for grid_entry in grid.get_entry_iterator():
        source: Block = reshaped_sources[grid_entry]
        target: Block = result.blocks[grid_entry]
        target.oid = arr.system.reshape(
            source.oid,
            target.shape,
            syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape})
    return result
def _new_array(self, op_name: str, shape: tuple, block_shape: tuple, dtype: np.dtype = None):
    """
    Create a BlockArray whose blocks are all produced by ``op_name``.

    :param op_name: Operation name forwarded to ``self._system.new_block``.
    :param shape: Shape of the array.
    :param block_shape: Shape of each block; must have the same length as
        ``shape``.
    :param dtype: Element type; defaults to ``np.float64``.
    :return: The new BlockArray.
    """
    assert len(shape) == len(block_shape)
    element_type = np.float64 if dtype is None else dtype
    grid = ArrayGrid(shape, block_shape, element_type.__name__)
    meta = grid.to_meta()
    result = BlockArray(grid, self._system)
    for grid_entry in grid.get_entry_iterator():
        placement = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        result.blocks[grid_entry].oid = self._system.new_block(
            op_name, grid_entry, meta, syskwargs=placement)
    return result
def diag(self, X: BlockArray) -> BlockArray:
    """
    Build a diagonal matrix from a vector, or extract a matrix's diagonal.

    For a 1-axis input, the result is a square 2-axis array whose diagonal
    blocks come from ``system.diag`` and whose other blocks are "zeros".
    For a 2-axis input (square, with square blocks), the result is a 1-axis
    array built from ``system.diag`` applied to the diagonal blocks.

    :param X: A BlockArray with 1 or 2 axes.
    :return: The resulting BlockArray.
    :raises ValueError: If ``X`` has neither 1 nor 2 axes.
    """
    num_axes = len(X.shape)
    if num_axes not in (1, 2):
        raise ValueError("X must have 1 or 2 axes.")

    if num_axes == 1:
        shape = (X.shape[0], X.shape[0])
        block_shape = (X.block_shape[0], X.block_shape[0])
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        meta = grid.to_meta()
        result = BlockArray(grid, self.system)
        for grid_entry in grid.get_entry_iterator():
            placement = {
                "grid_entry": grid_entry,
                "grid_shape": grid.grid_shape
            }
            if np.all(np.diff(grid_entry) == 0):
                # This is a diagonal block.
                result.blocks[grid_entry].oid = self.system.diag(
                    X.blocks[grid_entry[0]].oid, syskwargs=placement)
            else:
                result.blocks[grid_entry].oid = self.system.new_block(
                    "zeros", grid_entry, meta, syskwargs=placement)
        return result

    assert X.shape[0] == X.shape[1]
    assert X.block_shape[0] == X.block_shape[1]
    shape = (X.shape[0],)
    block_shape = (X.block_shape[0],)
    grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
    result = BlockArray(grid, self.system)
    for grid_entry in X.grid.get_entry_iterator():
        # Only the first coordinate addresses the 1-axis output.
        out_grid_entry = grid_entry[:1]
        out_grid_shape = grid.grid_shape[:1]
        placement = {
            "grid_entry": out_grid_entry,
            "grid_shape": out_grid_shape
        }
        if np.all(np.diff(grid_entry) == 0):
            # This is a diagonal block.
            result.blocks[out_grid_entry].oid = self.system.diag(
                X.blocks[grid_entry].oid, syskwargs=placement)
    return result
def map_uop(self, op_name: str, arr: BlockArray, out: BlockArray = None,
            where=True, args=None, kwargs=None) -> BlockArray:
    """
    Apply an element-wise unary operator to every block of an array.

    :param op_name: An element-wise unary operator.
    :param arr: A BlockArray.
    :param out: A BlockArray to which the result is written; it must match
        ``arr`` in shape and block shape. A new array is created if omitted.
    :param where: Only ``True`` is supported.
    :param args: Args provided to op.
    :param kwargs: Keyword args provided to op.
    :return: A BlockArray.
    :raises NotImplementedError: If ``where`` is not ``True``.
    """
    if where is not True:
        raise NotImplementedError("'where' argument is not yet supported.")
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}
    in_shape = arr.shape
    in_block_shape = arr.block_shape
    out_dtype = array_utils.get_uop_output_type(op_name, arr.dtype)
    assert len(in_shape) == len(in_block_shape)
    if out is not None:
        result = out
        grid = result.grid
        assert result.shape == arr.shape and result.block_shape == arr.block_shape
    else:
        grid = ArrayGrid(in_shape, in_block_shape, out_dtype.__name__)
        result = BlockArray(grid, self.system)
    for grid_entry in grid.get_entry_iterator():
        # TODO(hme): Faster to create ndarray first,
        #  and instantiate block array on return
        #  to avoid instantiating blocks on BlockArray initialization.
        source_block = arr.blocks[grid_entry]
        result.blocks[grid_entry] = source_block.uop_map(op_name,
                                                         args=args,
                                                         kwargs=kwargs)
    return result
def from_blocks(cls, arr: np.ndarray, result_shape, system):
    """
    Assemble a BlockArray from an array of Block objects (or a single Block).

    The block shape and dtype are taken from a sample block: ``arr`` itself
    when it is a Block, otherwise the block at the all-zeros index.

    :param arr: An ndarray of Blocks, or a single Block.
    :param result_shape: Shape of the result; when None it is derived via
        ``array_utils.shape_from_block_array``. Ignored (set to ``()``)
        when ``arr`` is a single Block.
    :param system: System handle passed to the BlockArray.
    :return: The assembled BlockArray.
    """
    single_block = isinstance(arr, Block)
    if single_block:
        sample_block = arr
        result_shape = ()
    else:
        sample_block = arr[(0,) * len(arr.shape)]
        if result_shape is None:
            result_shape = array_utils.shape_from_block_array(arr)
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=sample_block.shape,
                            dtype=sample_block.dtype.__name__)
    assert arr.shape == result_grid.grid_shape
    result = BlockArray(result_grid, system)
    for grid_entry in result_grid.get_entry_iterator():
        result.blocks[grid_entry] = arr if single_block else arr[grid_entry]
    return result
def reshape(self, shape=None, block_shape=None):
    """
    Reshape this array; only adding/removing a leading or trailing axis of
    size one, or changing the block shape, is supported.

    A ``-1`` in ``shape`` is only permitted when this array has one axis;
    it is replaced by this array's length, and the matching ``block_shape``
    entry (which must also be ``-1``) by this array's block length.

    :param shape: Target shape; defaults to the current shape.
    :param block_shape: Target block shape; defaults to the current one.
    :return: ``self`` if nothing changes, otherwise a new array.
    :raises Exception: If more than one ``-1`` is given, or the change in
        the number of axes is not a leading/trailing axis of size one.
    """
    # TODO (hme): Add support for arbitrary reshape.
    if shape is None:
        shape = self.shape
    if block_shape is None:
        block_shape = self.block_shape
    if shape == self.shape and block_shape == self.block_shape:
        return self

    temp_shape = shape
    temp_block_shape = block_shape
    shape = []
    block_shape = []
    negative_one = False
    for i, dim in enumerate(temp_shape):
        if dim == -1:
            assert len(self.shape) == 1
            if negative_one:
                raise Exception("Only one -1 permitted in reshape.")
            negative_one = True
            # self is 1-D here, so its only axis is index 0 regardless of
            # where the -1 sits in the target shape.
            shape.append(self.shape[0])
            assert temp_block_shape[i] == -1
            block_shape.append(self.block_shape[0])
        else:
            shape.append(dim)
            block_shape.append(temp_block_shape[i])
    del temp_shape
    shape = tuple(shape)
    block_shape = tuple(block_shape)
    # np.prod replaces the np.product alias, which NumPy 2.0 removed.
    assert np.prod(shape) == np.prod(self.shape)

    # Make sure the difference is either a preceding or succeeding one.
    if len(shape) > len(self.shape):
        if shape[0] == 1:
            grid_entry_op = "shift"
            assert shape[1:] == self.shape
        elif shape[-1] == 1:
            grid_entry_op = "pop"
            assert shape[:-1] == self.shape
        else:
            raise Exception("Only a leading or trailing axis of size 1 may be added.")
    elif len(shape) < len(self.shape):
        if self.shape[0] == 1:
            grid_entry_op = "prep"
            assert self.shape[1:] == shape
        elif self.shape[-1] == 1:
            grid_entry_op = "app"
            assert self.shape[:-1] == shape
        else:
            raise Exception("Only a leading or trailing axis of size 1 may be removed.")
    else:
        grid_entry_op = "none"
        assert self.shape == shape

    grid = ArrayGrid(shape=shape,
                     block_shape=block_shape,
                     dtype=self.grid.dtype.__name__)
    grid_meta = grid.to_meta()
    rarr = BlockArray(grid, self.system)
    for grid_entry in grid.get_entry_iterator():
        rarr.blocks[grid_entry].oid = self.system.empty(grid_entry, grid_meta,
                                                        syskwargs={
                                                            "grid_entry": grid_entry,
                                                            "grid_shape": grid.grid_shape
                                                        })
        grid_entry_slice = grid.get_slice(grid_entry)
        if grid_entry_op == "shift":
            grid_entry_slice = tuple([0] + list(grid_entry_slice)[1:])
            self_grid_entry_slice = self.grid.get_slice(grid_entry[1:])
        elif grid_entry_op == "pop":
            grid_entry_slice = tuple(list(grid_entry_slice)[:-1] + [0])
            self_grid_entry_slice = self.grid.get_slice(grid_entry[:-1])
        elif grid_entry_op == "prep":
            # Source has an extra leading axis of size 1.
            self_grid_entry_slice = self.grid.get_slice(tuple([0] + list(grid_entry)))
        elif grid_entry_op == "app":
            # Source has an extra trailing axis of size 1. (This branch
            # previously re-tested "prep" and was unreachable.)
            self_grid_entry_slice = self.grid.get_slice(tuple(list(grid_entry) + [0]))
        else:
            assert grid_entry_op == "none"
            self_grid_entry_slice = grid_entry_slice
        # TODO (hme): This is costly.
        rarr[grid_entry_slice] = self[self_grid_entry_slice]
    return rarr
def reduce_axis(self, op_name, axis, keepdims=False):
    """
    Reduce this array with ``op_name`` along ``axis`` (or over all axes).

    Every block is first reduced on its own; the per-block results are then
    combined either pairwise (when ``op_name`` has an entry in
    ``settings.np_pairwise_reduction_map``) or by applying the NumPy
    function named ``op_name`` to the array of reduced blocks.

    :param op_name: Name of the reduction.
    :param axis: An integer axis, or None to reduce over all axes.
    :param keepdims: If true, reduced axes are kept with size one.
    :return: A BlockArray holding the reduction result.
    :raises NotImplementedError: If ``axis`` is neither None nor an integer.
    """
    axis_is_supported = axis is None or isinstance(axis, (int, np.int32, np.int64))
    if not axis_is_supported:
        raise NotImplementedError("Only integer axis is currently supported.")

    # Step 1: reduce each block independently.
    partials = np.empty_like(self.blocks, dtype=Block)
    for grid_entry in self.grid.get_entry_iterator():
        partials[grid_entry] = self.blocks[grid_entry].reduce_axis(op_name,
                                                                   axis,
                                                                   keepdims=keepdims)

    # Step 2: work out the shape and block shape of the result.
    out_shape = []
    out_block_shape = []
    for dim, (dim_size, dim_block_size) in enumerate(zip(self.shape, self.block_shape)):
        is_reduced = dim == axis or axis is None
        if is_reduced and not keepdims:
            continue
        if is_reduced:
            dim_size, dim_block_size = 1, 1
        out_shape.append(dim_size)
        out_block_shape.append(dim_block_size)
    out_shape = tuple(out_shape)
    out_block_shape = tuple(out_block_shape)

    out_dtype = array_utils.get_reduce_output_type(op_name, self.dtype)
    out_grid = ArrayGrid(shape=out_shape,
                         block_shape=out_block_shape,
                         dtype=out_dtype.__name__)
    result = BlockArray(out_grid, self.system)

    # Step 3: combine the per-block partial results.
    if op_name not in settings.np_pairwise_reduction_map:
        np_reduce = np.__getattribute__(op_name)
        combined = np_reduce(partials, axis=axis, keepdims=keepdims)
        if result.shape == ():
            result.blocks[()] = combined
        else:
            result.blocks = combined
        return result

    # Do a pairwise reduction with the pairwise reduction op.
    pairwise_op_name = settings.np_pairwise_reduction_map.get(op_name, op_name)
    if axis is None:
        accumulated: Block = None
        for grid_entry in self.grid.get_entry_iterator():
            candidate = partials[grid_entry]
            if accumulated is None:
                accumulated = candidate
            else:
                accumulated = accumulated.bop(pairwise_op_name, candidate, {})
        if result.shape == ():
            result.blocks[()] = accumulated
        else:
            result.blocks[:] = accumulated
        return result

    for result_grid_entry in out_grid.get_entry_iterator():
        accumulated: Block = None
        for sum_dim in range(self.grid.grid_shape[axis]):
            # Rebuild the source grid entry by placing sum_dim on the
            # reduced axis of the result's grid entry.
            source_entry = list(result_grid_entry)
            if keepdims:
                source_entry[axis] = sum_dim
            else:
                source_entry.insert(axis, sum_dim)
            candidate: Block = partials[tuple(source_entry)]
            if accumulated is None:
                accumulated = candidate
            else:
                accumulated = accumulated.bop(pairwise_op_name, candidate, {})
        result.blocks[result_grid_entry] = accumulated
    return result