def arange(self, start_in, shape, block_shape, step=1, dtype=None) -> BlockArray:
    """Build a block-partitioned range of consecutive values.

    Args:
        start_in: Value at which the range begins.
        shape: Shape handed to the grid; ``shape[0]`` is also used to infer
            the dtype when none is given.
        block_shape: Block shape handed to the grid; ``block_shape[0]`` sets
            the offset between the starts of consecutive blocks.
        step: Stride of the range. Only ``1`` passes the assertion below.
        dtype: Element type. When ``None`` it is derived via
            ``np.result_type`` from ``start_in`` and ``shape[0] + start_in``.

    Returns:
        A BlockArray whose block oids are produced by ``self.cm.arange``.
    """
    assert step == 1
    if dtype is None:
        inferred_name = str(np.result_type(start_in, shape[0] + start_in))
        dtype = np.__getattribute__(inferred_name)

    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    rarr = BlockArray(grid, self.cm)

    # Each block receives its own [lower, upper) sub-range.
    for entry in grid.get_entry_iterator():
        placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
        lower = start_in + block_shape[0] * entry[0]
        upper = lower + grid.get_block_shape(entry)[0]
        rarr.blocks[entry].oid = self.cm.arange(
            lower, upper, step, dtype, syskwargs=placement
        )
    return rarr
def from_oid(cls, oid, shape, dtype, cm):
    """Wrap an existing object id in a BlockArray whose block shape equals ``shape``.

    Args:
        cls: Receiver of the call (unused in the body).
        oid: Object id assigned to the block.
        shape: Shape of the array; also used as the block shape.
        dtype: Type object whose ``__name__`` is given to the grid.
        cm: Compute manager passed to the BlockArray.

    Returns:
        The BlockArray holding ``oid``.
    """
    grid = ArrayGrid(shape, shape, dtype.__name__)
    wrapped = BlockArray(grid, cm)
    for position, entry in enumerate(grid.get_entry_iterator()):
        # Block shape equals array shape, so a second grid entry is unexpected.
        assert position == 0
        wrapped.blocks[entry].oid = oid
    return wrapped
def from_np(cls, arr, block_shape, copy, cm):
    """Partition a NumPy array into blocks and store each one through ``cm.put``.

    Args:
        cls: Receiver of the call (unused in the body).
        arr: Source array; its ``shape`` and ``dtype`` define the grid.
        block_shape: Block shape handed to the grid.
        copy: When truthy, each slice is copied with ``np.copy`` before it is
            stored.
        cm: Compute manager used both for ``put`` and for the BlockArray.

    Returns:
        A BlockArray with one stored block per grid entry.
    """
    dtype_name = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_name)
    result = BlockArray(grid, cm)
    for entry in grid.get_entry_iterator():
        piece = arr[grid.get_slice(entry)]
        if copy:
            piece = np.copy(piece)
        result.blocks[entry].oid = cm.put(piece)
        # The block's dtype is set explicitly from the source array's dtype name.
        result.blocks[entry].dtype = getattr(np, dtype_name)
    return result
def diag(self, X: BlockArray) -> BlockArray:
    """Construct a diagonal matrix from a 1-D array, or extract a 2-D diagonal.

    Args:
        X: A BlockArray with one or two axes.

    Returns:
        For one axis: a square BlockArray whose diagonal blocks come from
        ``self.cm.diag`` and whose remaining blocks are ``"zeros"`` blocks.
        For two axes: a BlockArray assembled from the per-block diagonal
        pieces reported by ``array_utils.find_diag_output_blocks``.

    Raises:
        ValueError: If ``X`` has neither one nor two axes.
    """
    ndim = len(X.shape)
    if ndim not in (1, 2):
        raise ValueError("X must have 1 or 2 axes.")

    if ndim == 1:
        shape = X.shape[0], X.shape[0]
        block_shape = X.block_shape[0], X.block_shape[0]
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        grid_meta = grid.to_meta()
        rarr = BlockArray(grid, self.cm)
        for entry in grid.get_entry_iterator():
            placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
            # All coordinates equal means the block lies on the diagonal.
            if np.all(np.diff(entry) == 0):
                rarr.blocks[entry].oid = self.cm.diag(
                    X.blocks[entry[0]].oid, 0, syskwargs=placement
                )
            else:
                rarr.blocks[entry].oid = self.cm.new_block(
                    "zeros", entry, grid_meta, syskwargs=placement
                )
        return rarr

    # Two axes: collect the diagonal piece contributed by each relevant block.
    out_shape = (min(X.shape),)
    out_block_shape = (min(X.block_shape),)
    diag_meta = array_utils.find_diag_output_blocks(X.blocks, out_shape[0])
    out_grid_shape = (len(diag_meta),)
    pieces = []
    for position, (block_indices, offset, total_elements) in enumerate(diag_meta):
        placement = {"grid_entry": (position,), "grid_shape": out_grid_shape}
        piece_shape = (total_elements,)
        piece_grid = ArrayGrid(
            piece_shape,
            piece_shape,
            X.blocks[block_indices].dtype.__name__,
        )
        piece = BlockArray(piece_grid, self.cm)
        piece.blocks[0].oid = self.cm.diag(
            X.blocks[block_indices].oid, offset, syskwargs=placement
        )
        pieces.append(piece)

    if len(pieces) > 1:
        # Several pieces have to be joined into one array.
        return self.concatenate(
            pieces, axis=0, axis_block_size=out_block_shape[0]
        )
    return pieces[0]
def read_csv(self, filename, dtype=float, delimiter=",", has_header=False, num_workers=4):
    """Read a CSV file in byte-range batches and wrap each non-empty result.

    Args:
        filename: File to read.
        dtype: Type forwarded to the remote reader; its ``__name__`` is given
            to each result grid.
        delimiter: Field separator forwarded to the remote reader.
        has_header: Header flag forwarded to the remote reader.
        num_workers: Number of batches requested from
            ``storage_utils.Batch.from_num_batches``; also reported as the
            grid shape in the scheduling hints.

    Returns:
        A list of single-block BlockArrays, one per batch whose reported
        shape has a non-zero first dimension.
    """
    file_size = storage_utils.get_file_size(filename)
    file_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(
        file_size, num_workers
    )

    block_oids = []
    shape_oids = []
    for index, (file_start, file_end) in enumerate(file_batches.batches):
        placement = {
            "grid_entry": (index,),
            "grid_shape": (num_workers,),
            "options": {"num_returns": 2},
        }
        block_oid, shape_oid = self.cm.call(
            "read_csv_block",
            filename,
            file_start,
            file_end,
            dtype,
            delimiter,
            has_header,
            syskwargs=placement,
        )
        block_oids.append(block_oid)
        shape_oids.append(shape_oid)

    # Block shapes are only known once the remote reads have reported them.
    shapes = self.cm.get(shape_oids)

    arrays = []
    for index, shape in enumerate(shapes):
        if shape[0] == 0:
            continue
        block_oid = block_oids[index]
        grid = ArrayGrid(shape=shape, block_shape=shape, dtype=dtype.__name__)
        arr = BlockArray(grid, self.cm)
        first_entry = True
        for entry in grid.get_entry_iterator():
            # shape == block_shape, so more than one entry is unexpected.
            assert first_entry
            first_entry = False
            arr.blocks[entry].oid = block_oid
        arrays.append(arr)
    return arrays
def empty(cls, shape, block_shape, dtype, cm: ComputeManager):
    """Create a BlockArray whose blocks are produced by ``cm.empty``.

    Args:
        cls: Receiver of the call (unused in the body).
        shape: Shape handed to the grid.
        block_shape: Block shape handed to the grid.
        dtype: Type object whose ``__name__`` is given to the grid.
        cm: Compute manager that creates the blocks.

    Returns:
        The populated BlockArray.
    """
    grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__)
    grid_meta = grid.to_meta()
    arr = BlockArray(grid, cm)
    for entry in grid.get_entry_iterator():
        placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
        arr.blocks[entry].oid = cm.empty(entry, grid_meta, syskwargs=placement)
    return arr
def _new_array(
    self, op_name: str, shape: tuple, block_shape: tuple, dtype: np.dtype = None
):
    """Create a BlockArray whose every block comes from ``self.cm.new_block``.

    Args:
        op_name: Operation name forwarded to ``new_block``.
        shape: Shape handed to the grid.
        block_shape: Block shape handed to the grid; must have as many
            entries as ``shape``.
        dtype: Element type; ``np.float64`` is used when ``None``.

    Returns:
        The populated BlockArray.
    """
    assert len(shape) == len(block_shape)
    dtype = np.float64 if dtype is None else dtype
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    grid_meta = grid.to_meta()
    rarr = BlockArray(grid, self.cm)
    for entry in grid.get_entry_iterator():
        placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
        rarr.blocks[entry].oid = self.cm.new_block(
            op_name, entry, grid_meta, syskwargs=placement
        )
    return rarr
def map_uop(
    self,
    op_name: str,
    arr: BlockArray,
    out: BlockArray = None,
    where=True,
    args=None,
    kwargs=None,
) -> BlockArray:
    """Apply a unary operator to every block of an array.

    Args:
        op_name: Name of the element-wise unary operator.
        arr: The input BlockArray.
        out: Optional BlockArray that receives the result; it must match
            ``arr`` in shape and block shape.
        where: Must be ``True``; any other value is rejected.
        args: Positional arguments forwarded to the operator.
        kwargs: Keyword arguments forwarded to the operator.

    Returns:
        ``out`` when it is given, otherwise a new BlockArray.

    Raises:
        NotImplementedError: If ``where`` is not ``True``.
    """
    if where is not True:
        raise NotImplementedError("'where' argument is not yet supported.")
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}

    shape, block_shape = arr.shape, arr.block_shape
    dtype = array_utils.get_uop_output_type(op_name, arr.dtype)
    assert len(shape) == len(block_shape)

    if out is not None:
        rarr = out
        grid = rarr.grid
        assert rarr.shape == arr.shape and rarr.block_shape == arr.block_shape
    else:
        grid = ArrayGrid(shape, block_shape, dtype.__name__)
        rarr = BlockArray(grid, self.cm)

    # TODO(hme): Faster to create ndarray first,
    #  and instantiate block array on return
    #  to avoid instantiating blocks on BlockArray initialization.
    for entry in grid.get_entry_iterator():
        rarr.blocks[entry] = arr.blocks[entry].uop_map(
            op_name, args=args, kwargs=kwargs
        )
    return rarr
def _group_index_lists_by_block(self, dst_slice_tuples, src_grid: ArrayGrid,
                                dst_index_list, src_index_list):
    """Group paired (destination, source) indices by the source block holding them.

    Args:
        dst_slice_tuples: Per-axis slice bounds of the destination selection;
            transposed so that row 0 and row 1 hold the two bounds.
        src_grid: Grid describing how the source array is split into blocks.
        dst_index_list: Destination indices, paired by position with
            ``src_index_list``.
        src_index_list: Source indices.

    Returns:
        A dict mapping each source grid entry that contains at least one
        source index to a list of ``(dst_offset, src_offset)`` pairs, where
        the offsets are taken relative to the first bound of the destination
        slice and of the source block respectively.

    Raises:
        Exception: If no candidate unsigned integer type is wide enough.
    """
    # TODO(hme): Keep this function here until it's needed for greater support of
    #  selection/assignment operations.
    dst_bounds = np.array(dst_slice_tuples).T
    dst_indices = np.array(dst_index_list)
    src_indices = np.array(src_index_list)

    # These offsets may be sent over the network, so use the narrowest
    # unsigned type that can represent them.
    width_table = (
        (2**8, np.uint8),
        (2**16, np.uint16),
        (2**32, np.uint32),
        (2**64, np.uint64),
    )
    index_type = next(
        (
            candidate
            for bound, candidate in width_table
            if np.all(np.array(src_grid.block_shape) < bound)
            and np.all(dst_bounds[1] < bound)
        ),
        None,
    )
    if index_type is None:
        raise Exception(
            "Unable to encode block indices, blocks are too large.")

    grouped = {}
    for grid_entry in src_grid.get_entry_iterator():
        block_bounds = np.array(src_grid.get_slice_tuples(grid_entry)).T
        pairs = []
        for position in range(src_indices.shape[0]):
            src_index = src_indices[position]
            dst_index = dst_indices[position]
            inside = np.all(
                (block_bounds[0] <= src_index) & (src_index < block_bounds[1])
            )
            if not inside:
                continue
            pairs.append((
                (dst_index - dst_bounds[0]).astype(index_type),
                (src_index - block_bounds[0]).astype(index_type),
            ))
        if pairs:
            grouped[grid_entry] = pairs
    return grouped
def eye(self, shape: tuple, block_shape: tuple, dtype: np.dtype = None):
    """Create a two-axis BlockArray with ``"eye"`` blocks on the block diagonal.

    Args:
        shape: Two-element shape handed to the grid.
        block_shape: Two-element block shape handed to the grid.
        dtype: Element type; ``np.float64`` is used when ``None``.

    Returns:
        A BlockArray whose diagonal grid entries are ``"eye"`` blocks and
        whose other entries are ``"zeros"`` blocks.
    """
    assert len(shape) == len(block_shape) == 2
    dtype = np.float64 if dtype is None else dtype
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    grid_meta = grid.to_meta()
    rarr = BlockArray(grid, self.cm)
    for entry in grid.get_entry_iterator():
        placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
        # Equal coordinates mean the block sits on the block diagonal.
        block_op = "eye" if np.all(np.diff(entry) == 0) else "zeros"
        rarr.blocks[entry].oid = self.cm.new_block(
            block_op, entry, grid_meta, syskwargs=placement
        )
    return rarr
def diag(self, X: BlockArray) -> BlockArray:
    """Construct a diagonal matrix from a 1-D array, or extract a square diagonal.

    Args:
        X: A BlockArray with one or two axes. With two axes, both the array
            and its block shape are asserted to be square.

    Returns:
        For one axis: a square BlockArray whose diagonal blocks come from
        ``self.cm.diag`` and whose remaining blocks are ``"zeros"`` blocks.
        For two axes: a one-axis BlockArray filled from the diagonal blocks
        of ``X``.

    Raises:
        ValueError: If ``X`` has neither one nor two axes.
    """
    ndim = len(X.shape)
    if ndim not in (1, 2):
        raise ValueError("X must have 1 or 2 axes.")

    if ndim == 1:
        shape = X.shape[0], X.shape[0]
        block_shape = X.block_shape[0], X.block_shape[0]
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        grid_meta = grid.to_meta()
        rarr = BlockArray(grid, self.cm)
        for entry in grid.get_entry_iterator():
            placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
            if np.all(np.diff(entry) == 0):
                # Diagonal block: built from the matching block of X.
                rarr.blocks[entry].oid = self.cm.diag(
                    X.blocks[entry[0]].oid, syskwargs=placement)
            else:
                rarr.blocks[entry].oid = self.cm.new_block(
                    "zeros", entry, grid_meta, syskwargs=placement)
        return rarr

    assert X.shape[0] == X.shape[1], "X must be a square array."
    assert X.block_shape[0] == X.block_shape[1], "block_shape must be square."
    shape = (X.shape[0],)
    block_shape = (X.block_shape[0],)
    grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
    rarr = BlockArray(grid, self.cm)
    for entry in X.grid.get_entry_iterator():
        out_entry = entry[:1]
        placement = {
            "grid_entry": out_entry,
            "grid_shape": grid.grid_shape[:1],
        }
        # Only blocks on the diagonal of X contribute to the output.
        if np.all(np.diff(entry) == 0):
            rarr.blocks[out_entry].oid = self.cm.diag(
                X.blocks[entry].oid, syskwargs=placement)
    return rarr
def from_blocks(cls, arr: np.ndarray, result_shape, cm):
    """Assemble a BlockArray from an array of blocks, or from a single Block.

    Args:
        cls: Receiver of the call (unused in the body).
        arr: Either an array of Block objects or a single Block.
        result_shape: Shape of the result. Replaced by ``()`` when ``arr`` is
            a Block; computed with ``array_utils.shape_from_block_array``
            when ``None``.
        cm: Compute manager passed to the BlockArray.

    Returns:
        A BlockArray whose grid entries reference the given block(s).
    """
    single_block = isinstance(arr, Block)
    origin = tuple(0 for _ in arr.shape)
    if single_block:
        sample_block = arr
        result_shape = ()
    else:
        # The first block supplies the block shape and dtype of the result.
        sample_block = arr[origin]
        if result_shape is None:
            result_shape = array_utils.shape_from_block_array(arr)

    result_grid = ArrayGrid(
        shape=result_shape,
        block_shape=sample_block.shape,
        dtype=sample_block.dtype.__name__,
    )
    assert arr.shape == result_grid.grid_shape
    result = BlockArray(result_grid, cm)
    for entry in result_grid.get_entry_iterator():
        picked: Block = arr if single_block else arr[entry]
        result.blocks[entry] = picked
    return result
def _simple_reshape(self, arr, shape, block_shape):
    """Reshape an array by reshaping its grid of blocks and each block.

    Intended only for reshapes that add or drop axes of size one while
    keeping the order of the remaining axes.

    Args:
        arr: Source BlockArray.
        shape: Target shape.
        block_shape: Target block shape.

    Returns:
        A BlockArray on the new grid whose blocks are produced by
        ``arr.cm.reshape`` from the corresponding source blocks.
    """
    # Sanity check: the same number of axes must remain once ones are stripped.
    assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape))

    grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__)
    # View the source blocks through the new grid shape so that source and
    # destination blocks share the same index.
    reshaped_sources = arr.blocks.reshape(grid.grid_shape)
    rarr = BlockArray(grid, arr.cm)
    for entry in grid.get_entry_iterator():
        source: Block = reshaped_sources[entry]
        target: Block = rarr.blocks[entry]
        placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
        target.oid = arr.cm.reshape(source.oid, target.shape, syskwargs=placement)
    return rarr
def loadtxt(self, fname, dtype=float, comments='# ', delimiter=' ',
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0, encoding='bytes', max_rows=None, num_workers=4) -> BlockArray:
    """Load a fixed-width text file into a row-partitioned BlockArray.

    The number of data rows is derived from the file size and the per-row
    byte counts reported by ``storage_utils``. The rows are then split into
    batches, and each batch is loaded by ``self.loadtxt_block``.

    Args:
        fname: File to read.
        dtype: Element type; ``float`` is recorded on the grid as ``float64``.
        comments: Comment marker, used to locate non-data lines.
        delimiter: Field separator.
        converters: Forwarded unchanged to ``loadtxt_block``.
        skiprows: Number of leading data rows to skip.
        usecols: Forwarded unchanged to ``loadtxt_block``.
        unpack: Forwarded unchanged to ``loadtxt_block``.
        ndmin: Forwarded unchanged to ``loadtxt_block``.
        encoding: Forwarded unchanged to ``loadtxt_block``.
        max_rows: Upper bound on the number of rows read after ``skiprows``;
            ``None`` means no bound.
        num_workers: Number of row batches requested.

    Returns:
        A BlockArray with one block per row batch.
    """
    # pylint: disable=unused-variable
    bytes_per_char, bytes_per_row, bytes_per_col, num_cols = storage_utils.get_np_txt_info(
        fname, comments, delimiter)
    chars_per_row = bytes_per_row // bytes_per_char
    assert np.allclose(float(chars_per_row), bytes_per_row / bytes_per_char)

    # Characters that belong to comments or trailing newlines are not row data.
    comment_lines, trailing_newlines = storage_utils.get_np_comments(
        fname, comments)
    nonrow_chars = trailing_newlines + sum(len(line) for line in comment_lines)

    file_size = storage_utils.get_file_size(fname)
    file_chars = file_size // bytes_per_char
    assert np.allclose(float(file_chars), file_size / bytes_per_char)

    row_chars = file_chars - nonrow_chars
    num_rows = row_chars // chars_per_row
    assert np.allclose(float(num_rows), float(row_chars / chars_per_row))

    num_rows_final = num_rows - skiprows
    if max_rows is not None:
        # Cap the row count. This previously built the tuple
        # (num_rows_final, max_rows), which is not a valid row count.
        num_rows_final = min(num_rows_final, max_rows)

    row_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(
        num_rows_final, num_workers)
    grid = ArrayGrid(
        shape=(num_rows_final, num_cols),
        block_shape=(row_batches.batch_size, num_cols),
        dtype=np.float64.__name__ if dtype is float else dtype.__name__)
    result: BlockArray = BlockArray(grid, cm=self.cm)
    for i, grid_entry in enumerate(grid.get_entry_iterator()):
        row_start, row_end = row_batches.batches[i]
        # Each block skips the global offset plus the rows owned by earlier batches.
        batch_skiprows = skiprows + row_start
        batch_max_rows = grid.get_block_shape(grid_entry)[0]
        assert batch_max_rows == row_end - row_start
        result.blocks[grid_entry].oid = self.loadtxt_block(
            fname,
            dtype=dtype,
            comments=comments,
            delimiter=delimiter,
            converters=converters,
            skiprows=batch_skiprows,
            usecols=usecols,
            unpack=unpack,
            ndmin=ndmin,
            encoding=encoding,
            max_rows=batch_max_rows,
            syskwargs={
                "grid_entry": grid_entry,
                "grid_shape": grid.grid_shape,
            })
    return result
def diag(self, X: BlockArray) -> BlockArray:
    """Construct a diagonal matrix from a 1-D array, or extract a 2-D diagonal.

    Args:
        X: A BlockArray with one or two axes.

    Returns:
        For one axis: a square BlockArray whose diagonal blocks come from
        ``self.cm.diag`` and whose remaining blocks are ``"zeros"`` blocks.
        For two axes: a BlockArray assembled from the diagonal pieces of the
        blocks the main diagonal passes through.

    Raises:
        ValueError: If ``X`` has neither one nor two axes.
    """

    def find_diag_output_blocks(X: BlockArray, total_elements: int):
        """Walk the main diagonal block by block.

        Returns a list of ``(block indices, diag offset, element count)``
        tuples, one per block visited, until ``total_elements`` diagonal
        elements have been accounted for.
        """
        # Grid coordinates of the block currently being visited.
        row_block = col_block = 0
        # Position inside that block at which the diagonal enters.
        row_in_block = col_in_block = 0
        # Diagonal elements accounted for so far.
        found = 0
        # The walk starts at block (0, 0).
        current = X.blocks[(0, 0)]
        pieces = []
        while found < total_elements:
            # Move to the next block along whichever axis ran off the end
            # of the previous block.
            if row_in_block >= current.shape[0]:
                row_block += 1
                row_in_block = 0
            if col_in_block >= current.shape[1]:
                col_block += 1
                col_in_block = 0
            current = X.blocks[(row_block, col_block)]
            n_rows, n_cols = current.shape[0], current.shape[1]
            if row_in_block > col_in_block:
                offset = -row_in_block
            else:
                offset = col_in_block
            # Elements available before either block edge is reached.
            run = min(n_rows - row_in_block, n_cols - col_in_block)
            pieces.append(((row_block, col_block), offset, run))
            found += run
            row_in_block += run
            col_in_block += run
        return pieces

    ndim = len(X.shape)
    if ndim not in (1, 2):
        raise ValueError("X must have 1 or 2 axes.")

    if ndim == 1:
        shape = X.shape[0], X.shape[0]
        block_shape = X.block_shape[0], X.block_shape[0]
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        grid_meta = grid.to_meta()
        rarr = BlockArray(grid, self.cm)
        for entry in grid.get_entry_iterator():
            placement = {"grid_entry": entry, "grid_shape": grid.grid_shape}
            if np.all(np.diff(entry) == 0):
                # Diagonal block: built from the matching block of X.
                rarr.blocks[entry].oid = self.cm.diag(
                    X.blocks[entry[0]].oid, 0, syskwargs=placement)
            else:
                rarr.blocks[entry].oid = self.cm.new_block(
                    "zeros", entry, grid_meta, syskwargs=placement)
        return rarr

    out_shape = (min(X.shape),)
    out_block_shape = (min(X.block_shape),)
    # Find the blocks that contain the diagonal of the matrix.
    diag_meta = find_diag_output_blocks(X, out_shape[0])
    out_grid_shape = (len(diag_meta),)
    output_block_arrays = []
    for position, (block_indices, offset, total_elements) in enumerate(diag_meta):
        placement = {"grid_entry": (position,), "grid_shape": out_grid_shape}
        piece_shape = (total_elements,)
        piece_grid = ArrayGrid(
            piece_shape,
            piece_shape,
            X.blocks[block_indices].dtype.__name__,
        )
        piece = BlockArray(piece_grid, self.cm)
        piece.blocks[0].oid = self.cm.diag(
            X.blocks[block_indices].oid, offset, syskwargs=placement)
        output_block_arrays.append(piece)

    if len(output_block_arrays) > 1:
        # Several pieces have to be joined into one array.
        return self.concatenate(output_block_arrays,
                                axis=0,
                                axis_block_size=out_block_shape[0])
    return output_block_arrays[0]
def reduce_axis(self, op_name, axis, keepdims=False):
    """Reduce this array along one axis, or over all axes.

    Every block is first reduced on its own through ``self.cm.reduce_axis``.
    The per-block results that feed the same output block are then combined
    with ``self._tree_reduce``.

    Args:
        op_name: Name of the reduction operator.
        axis: Axis to reduce, or ``None`` to reduce over every axis. A
            negative value counts from the last axis, as in NumPy.
        keepdims: When true, reduced axes are kept with size one.

    Returns:
        A BlockArray holding the reduced result.

    Raises:
        NotImplementedError: If ``axis`` is neither ``None`` nor an integer.
    """
    if not (axis is None or isinstance(axis, (int, np.integer))):
        raise NotImplementedError(
            "Only integer axis is currently supported.")

    ndim = len(self.shape)
    if axis is not None and axis < 0:
        # A negative axis never matched ``curr_axis == axis`` below, which
        # left the result shape unreduced and corrupted the block indices
        # built from ``grid_entry[:axis]``. Normalize it up front.
        axis = int(axis) + ndim

    # Step 1: reduce each block independently.
    block_reduced_oids = np.empty_like(self.blocks, dtype=tuple)
    for grid_entry in self.grid.get_entry_iterator():
        block = self.blocks[grid_entry]
        block_oid = self.cm.reduce_axis(
            op_name=op_name,
            arr=block.oid,
            axis=axis,
            keepdims=keepdims,
            transposed=block.transposed,
            syskwargs={
                "grid_entry": block.grid_entry,
                "grid_shape": block.grid_shape,
            },
        )
        block_reduced_oids[grid_entry] = (
            block_oid,
            block.grid_entry,
            block.grid_shape,
            False,
        )

    # Step 2: work out the shape and block shape of the result.
    result_shape = []
    result_block_shape = []
    for curr_axis in range(ndim):
        is_reduced = axis is None or curr_axis == axis
        if is_reduced and not keepdims:
            # The reduced axis is dropped from the result.
            continue
        if is_reduced:
            axis_size, axis_block_size = 1, 1
        else:
            axis_size = self.shape[curr_axis]
            axis_block_size = self.block_shape[curr_axis]
        result_shape.append(axis_size)
        result_block_shape.append(axis_block_size)
    result_shape = tuple(result_shape)
    result_block_shape = tuple(result_block_shape)

    result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype)
    result_grid = ArrayGrid(
        shape=result_shape,
        block_shape=result_block_shape,
        dtype=result_dtype.__name__,
    )
    result = BlockArray(result_grid, self.cm)

    # Step 3: combine the per-block results.
    if axis is None:
        # Every block feeds the single output block.
        if result.shape == ():
            result_block: Block = result.blocks[()]
        else:
            result_block: Block = result.blocks[:].item()
        result_block.oid = self._tree_reduce(
            op_name,
            block_reduced_oids.flatten().tolist(),
            result_block.grid_entry,
            result_block.grid_shape,
        )
        return result

    for result_grid_entry in result_grid.get_entry_iterator():
        # Gather the source blocks lying along ``axis`` for this output block.
        block_reduced_oids_axis = []
        for sum_dim in range(self.grid.grid_shape[axis]):
            grid_entry = list(result_grid_entry)
            if keepdims:
                grid_entry[axis] = sum_dim
            else:
                grid_entry = grid_entry[:axis] + [sum_dim] + grid_entry[axis:]
            block_reduced_oids_axis.append(
                block_reduced_oids[tuple(grid_entry)])
        result_block: Block = result.blocks[result_grid_entry]
        result_block.oid = self._tree_reduce(
            op_name,
            block_reduced_oids_axis,
            result_block.grid_entry,
            result_block.grid_shape,
        )
    return result