def assign_references(self, dst_sel: BasicSelection, value):
    """Assign copies of value's blocks to the blocks of the source array selected by dst_sel.

    Args:
        dst_sel: Selection over the source array identifying the destination blocks.
        value: An ArrayView or a BlockArrayBase. Any other type is ignored
            (no branch handles it).

    The grid offsets are computed with the builtin ``int`` dtype; the former
    ``np.int`` alias was removed in NumPy 1.24.
    """
    # TODO (hme): This seems overly complicated, but correct. Double check it.
    #  Also, revisit some of the variable names. They will likely
    #  be confusing in the future.
    # The destination has same block shape as value,
    # but the destination selection may not have the same shape as value.
    # May need to broadcast value to destination selection output shape.
    dst_offset = dst_sel.position().value // np.array(
        self._source.block_shape, dtype=int
    )
    # Do we need to broadcast?
    if isinstance(value, ArrayView) and (
        dst_sel.get_output_shape() != value.sel.get_output_shape()
    ):
        value = value.create()

    if isinstance(value, ArrayView):
        # This is the best case.
        # We don't need to create value to perform the reference copy.
        # No broadcasting required, so this should be okay.
        src_offset = value.sel.position().value // np.array(
            value._source.block_shape, dtype=int
        )
        src_inflated_shape = dst_sel.get_broadcastable_shape()
        src_inflated_block_shape = dst_sel.get_broadcastable_block_shape(
            value.block_shape
        )
        src_inflated_grid: ArrayGrid = ArrayGrid(
            src_inflated_shape, src_inflated_block_shape, self.grid.dtype.__name__
        )
        for src_grid_entry_inflated in src_inflated_grid.get_entry_iterator():
            # Num axes in value grid may be too small.
            inflated_entry = np.array(src_grid_entry_inflated, dtype=int)
            dst_grid_entry = tuple((inflated_entry + dst_offset).tolist())
            src_grid_entry = tuple((inflated_entry + src_offset).tolist())
            self._source.blocks[dst_grid_entry] = value._source.blocks[
                src_grid_entry
            ].copy()
    elif isinstance(value, BlockArrayBase):
        # The value has already been created, so just leverage value's
        # existing grid iterator.
        if value.shape != dst_sel.get_output_shape():
            # Need to broadcast.
            src_ba: BlockArrayBase = value.broadcast_to(dst_sel.get_output_shape())
        else:
            src_ba: BlockArrayBase = value
        src_inflated_shape = dst_sel.get_broadcastable_shape()
        src_inflated_block_shape = dst_sel.get_broadcastable_block_shape(
            src_ba.block_shape
        )
        src_inflated_grid: ArrayGrid = ArrayGrid(
            src_inflated_shape, src_inflated_block_shape, self.grid.dtype.__name__
        )
        # The inflated grid and src_ba's own grid are walked in lockstep by
        # position; indexing (rather than zip) keeps a length mismatch loud.
        src_grid_entry_iterator = list(src_ba.grid.get_entry_iterator())
        for src_index, src_grid_entry_inflated in enumerate(
            src_inflated_grid.get_entry_iterator()
        ):
            src_grid_entry = src_grid_entry_iterator[src_index]
            dst_grid_entry = tuple(
                (np.array(src_grid_entry_inflated, dtype=int) + dst_offset).tolist()
            )
            self._source.blocks[dst_grid_entry] = src_ba.blocks[src_grid_entry].copy()
def diag(self, X: BlockArray) -> BlockArray:
    """Build a diagonal matrix from a vector, or extract the diagonal of a matrix.

    Args:
        X: A BlockArray with one or two axes.

    Returns:
        For 1-axis input, a square BlockArray whose diagonal blocks come from
        ``cm.diag`` and whose remaining blocks are "zeros" blocks. For 2-axis
        input, a 1-axis BlockArray assembled from the per-block diagonals
        reported by ``array_utils.find_diag_output_blocks``.

    Raises:
        ValueError: If X has neither 1 nor 2 axes.
    """
    num_axes = len(X.shape)
    if num_axes not in (1, 2):
        raise ValueError("X must have 1 or 2 axes.")

    if num_axes == 2:
        out_length = min(X.shape)
        out_block_length = min(X.block_shape)
        # Obtain the block indices which contain the diagonal of the matrix.
        diag_meta = array_utils.find_diag_output_blocks(X.blocks, out_length)
        out_grid_shape = (len(diag_meta),)
        pieces = []
        # Obtain the diagonals.
        for position, (block_indices, offset, total_elements) in enumerate(diag_meta):
            syskwargs = {"grid_entry": (position,), "grid_shape": out_grid_shape}
            piece_shape = (total_elements,)
            piece_grid = ArrayGrid(
                piece_shape,
                piece_shape,
                X.blocks[block_indices].dtype.__name__,
            )
            piece = BlockArray(piece_grid, self.cm)
            piece.blocks[0].oid = self.cm.diag(
                X.blocks[block_indices].oid, offset, syskwargs=syskwargs
            )
            pieces.append(piece)
        if len(pieces) > 1:
            # If there are multiple blocks, concatenate them.
            return self.concatenate(
                pieces, axis=0, axis_block_size=out_block_length
            )
        return pieces[0]

    # One axis: lay the vector out on the diagonal of a square array.
    side = X.shape[0]
    block_side = X.block_shape[0]
    grid = ArrayGrid((side, side), (block_side, block_side), X.dtype.__name__)
    grid_meta = grid.to_meta()
    rarr = BlockArray(grid, self.cm)
    for grid_entry in grid.get_entry_iterator():
        syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape}
        on_diagonal = np.all(np.diff(grid_entry) == 0)
        if on_diagonal:
            # This is a diagonal block.
            rarr.blocks[grid_entry].oid = self.cm.diag(
                X.blocks[grid_entry[0]].oid, 0, syskwargs=syskwargs
            )
        else:
            rarr.blocks[grid_entry].oid = self.cm.new_block(
                "zeros", grid_entry, grid_meta, syskwargs=syskwargs
            )
    return rarr
def test_compute_block_shape(app_inst: ArrayApplication):
    """Check compute_block_shape against the ideal tall-skinny and square layouts."""
    dtype = np.float32
    cores_per_node = 64

    # Tall-skinny.
    tall_skinny_cluster = (16, 1)
    for size in (64, 128, 256, 512, 1024):
        size_str = "%sGB" % size
        num_nodes = size // 64
        shape, expected_block_shape, expected_grid_shape = ideal_tall_skinny_shapes(
            size_str, dtype
        )
        block_shape = app_inst.cm.compute_block_shape(
            shape, dtype, tall_skinny_cluster, num_nodes * cores_per_node
        )
        grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype.__name__)
        print(
            "tall-skinny",
            "cluster_shape=%s" % str(tall_skinny_cluster),
            "grid_shape=%s" % str(expected_grid_shape),
            "size=%s" % size_str,
            "bytes computed=%s" % (grid.nbytes() / 10**9),
        )
        assert expected_grid_shape == grid.grid_shape
        assert expected_block_shape == block_shape

    # Square.
    for size in (4, 16, 64, 256, 1024):
        size_str = "%sGB" % size
        # Anything below 64GB is placed on a single node.
        num_nodes = max(1, size // 64)
        cluster_side = int(np.sqrt(num_nodes))
        cluster_shape = (cluster_side, cluster_side)
        shape, expected_block_shape, expected_grid_shape = ideal_square_shapes(
            size_str, dtype
        )
        block_shape = app_inst.cm.compute_block_shape(
            shape, dtype, cluster_shape, num_nodes * cores_per_node
        )
        grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype.__name__)
        print(
            "square",
            "cluster_shape=%s" % str(cluster_shape),
            "grid_shape=%s" % str(expected_grid_shape),
            "size=%s" % size_str,
            "bytes computed=%s" % (grid.nbytes() / 10**9),
        )
        assert expected_grid_shape == grid.grid_shape, "%s != %s" % (
            expected_grid_shape,
            grid.grid_shape,
        )
        assert expected_block_shape == block_shape, "%s != %s" % (
            expected_block_shape,
            block_shape,
        )
def _stack_copy(self, X):
    """Expand a 1-axis BlockArray into a square BlockArray via ``cm.triu_copy``.

    Output block (i, j) is produced from input block j, so every block row
    of the output is derived from the same sequence of input blocks.
    """
    assert len(X.shape) == 1
    side = max(X.shape)
    block_side = X.block_shape[0]
    output_arr_grid = ArrayGrid(
        (side, side), (block_side, block_side), X.dtype.__name__
    )
    output_block_array = BlockArray(output_arr_grid, self.cm)
    num_block_rows = output_block_array.blocks.shape[0]
    num_block_cols = output_block_array.blocks.shape[1]
    for i in range(num_block_rows):
        for j in range(num_block_cols):
            out_block = output_block_array.blocks[(i, j)]
            rows, cols = out_block.shape[0], out_block.shape[1]
            # The column index doubles as the index of the source block.
            output_block_array.blocks[(i, j)].oid = self.cm.triu_copy(
                X.blocks[j].oid,
                rows,
                cols,
                syskwargs={
                    "grid_entry": (i, j),
                    "grid_shape": output_arr_grid.grid_shape,
                },
            )
    return output_block_array
def _inspect_block_shape(nps_app_inst):
    """Print the block shapes chosen for a sweep of dtypes, shapes and cluster shapes.

    Uses ``np.prod`` throughout; the ``np.product`` alias was deprecated in
    NumPy 1.25 and removed in NumPy 2.0.
    """
    app = nps_app_inst
    dtypes = [np.float32, np.float64]
    shapes = [(10**9, 250), (10**4, 10**4), (10**7, 10), (10, 10**7)]
    cluster_shapes = [(1, 1), (2, 1), (4, 1), (16, 1)]
    cores_per_node = 64
    for dtype, shape, cluster_shape in itertools.product(
        dtypes, shapes, cluster_shapes
    ):
        num_cores = np.prod(cluster_shape) * cores_per_node
        block_shape = app.compute_block_shape(
            shape=shape,
            dtype=dtype,
            cluster_shape=cluster_shape,
            num_cores=num_cores,
        )
        grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype.__name__)
        itemsize = dtype().nbytes
        print()
        print(
            "dtype=%s" % dtype.__name__,
            "cluster_shape=%s" % str(cluster_shape),
            "shape=%s" % str(shape),
        )
        print("grid_shape", grid.grid_shape, "block_shape", block_shape)
        print(
            "array size (GB)",
            np.prod(shape) * itemsize / 10**9,
            "block size (GB)",
            np.prod(block_shape) * itemsize / 10**9,
        )
def predict(self, X: BlockArray):
    """Run the registered ``xgb_predict`` remote function on every block of X.

    Args:
        X: A BlockArray; one result block is produced per grid entry, stored
            at the index given by the entry's first coordinate.

    Returns:
        A 1-axis BlockArray with shape ``(X.shape[0],)`` and block shape
        ``(X.block_shape[0],)``.
    """
    app: ArrayApplication = _instance()
    cm: ComputeManager = app.cm
    cm.register("xgb_predict", xgb_predict_remote, {})
    model_block: Block = self.model.blocks[0]
    result_grid = ArrayGrid(
        shape=(X.shape[0],),
        block_shape=(X.block_shape[0],),
        dtype=nps.int.__name__,
    )
    result: BlockArray = BlockArray(result_grid, cm)
    for grid_entry in X.grid.get_entry_iterator():
        row_index = grid_entry[0]
        result.blocks[row_index].oid = cm.call(
            "xgb_predict",
            model_block.oid,
            X.blocks[grid_entry].oid,
            syskwargs={
                "grid_entry": grid_entry,
                "grid_shape": X.grid.grid_shape,
            },
        )
    return result
def argop(self, op_name: str, arr: BlockArray, axis=None):
    """Chain ``cm.arg_op`` over the blocks of a one-dimensional array.

    Each call receives the pair returned by the previous call (initially
    ``None, None``) together with the current block and its slice.

    Args:
        op_name: Name of the arg-operation passed through to ``cm.arg_op``.
        arr: A one-dimensional BlockArray.
        axis: Must be None or 0.

    Returns:
        A zero-dimensional int64 BlockArray holding the first element of the
        final pair.

    Raises:
        NotImplementedError: If arr has more than one axis.
    """
    if len(arr.shape) > 1:
        raise NotImplementedError(
            "%s currently supports one-dimensional arrays." % op_name
        )
    axis = 0 if axis is None else axis
    assert axis == 0
    result = BlockArray(
        ArrayGrid(shape=(), block_shape=(), dtype=np.int64.__name__), self.cm
    )
    running = (None, None)
    for grid_entry in arr.grid.get_entry_iterator():
        block_slice: slice = arr.grid.get_slice(grid_entry)[0]
        block: Block = arr.blocks[grid_entry]
        running = self.cm.arg_op(
            op_name,
            block.oid,
            block_slice,
            *running,
            syskwargs={
                "grid_entry": grid_entry,
                "grid_shape": arr.grid.grid_shape,
                "options": {"num_returns": 2},
            },
        )
    argoptima, _ = running
    result.blocks[()].oid = argoptima
    return result
def _broadcast_bop(self, op_name, arr_1, arr_2) -> BlockArray:
    """Apply a binary op block-by-block, broadcasting the operands first if needed.

    We want to avoid invoking this op whenever possible; NumPy's imp is faster.

    Args:
        op_name: Name of binary operation.
        arr_1: A BlockArray.
        arr_2: A BlockArray.

    Returns:
        A BlockArray.
    """
    if arr_1.shape != arr_2.shape:
        # NOTE(review): the broadcast target is derived from the operands'
        # grid shapes, not their array shapes -- confirm this is intended.
        target_shape = array_utils.broadcast_shape(
            arr_1.grid.grid_shape, arr_2.grid.grid_shape
        )
        arr_1 = arr_1.broadcast_to(target_shape)
        arr_2 = arr_2.broadcast_to(target_shape)
    out_dtype = array_utils.get_bop_output_type(op_name, arr_1.dtype, arr_2.dtype)
    rarr = BlockArray(
        ArrayGrid(arr_1.shape, arr_1.block_shape, out_dtype.__name__), self.cm
    )
    for grid_entry in rarr.grid.get_entry_iterator():
        left: Block = arr_1.blocks[grid_entry]
        right: Block = arr_2.blocks[grid_entry]
        rarr.blocks[grid_entry] = left.bop(op_name, right, {})
    return rarr
def test_computations():
    """Check cyclic and packed cluster-entry computations over many cluster shapes.

    Uses ``np.prod``; the ``np.product`` alias was removed in NumPy 2.0.
    """
    grid: ArrayGrid = ArrayGrid(
        shape=(2, 6, 10), block_shape=(1, 2, 5), dtype="float32"
    )
    cluster_shapes = list(itertools.product(range(1, 5), range(1, 7), range(1, 11)))

    def true_packed_entry(grid_entry, grid_shape, cluster_shape):
        # Reference formula: scale the fractional grid position by the
        # cluster shape and truncate to an integer.
        grid_entry = np.array(grid_entry)
        grid_shape = np.array(grid_shape)
        cluster_shape = np.array(cluster_shape)
        r = grid_entry / grid_shape * cluster_shape
        return tuple(r.astype(int).tolist())

    # Cyclic layout: the cluster entry is the grid entry modulo the cluster shape.
    for cluster_shape in cluster_shapes:
        device_ids = mock_device_ids(int(np.prod(cluster_shape)))
        cyclic_grid: CyclicDeviceGrid = CyclicDeviceGrid(
            cluster_shape, "cpu", device_ids
        )
        for grid_entry in grid.get_entry_iterator():
            cluster_entry = cyclic_grid.get_cluster_entry(grid_entry, grid.grid_shape)
            assert cluster_entry == tuple(
                np.array(grid_entry) % np.array(cluster_shape)
            )

    # Packed layout: compared against the reference formula above.
    for cluster_shape in cluster_shapes:
        device_ids = mock_device_ids(int(np.prod(cluster_shape)))
        packed_grid: PackedDeviceGrid = PackedDeviceGrid(
            cluster_shape, "cpu", device_ids
        )
        for grid_entry in grid.get_entry_iterator():
            cluster_entry = packed_grid.get_cluster_entry(grid_entry, grid.grid_shape)
            assert cluster_entry == true_packed_entry(
                grid_entry, grid.grid_shape, cluster_shape
            )
def test_device_id():
    """Smoke test: both device grids touch every device for the sample array grid.

    Uses ``np.prod`` (``np.product`` was removed in NumPy 2.0) and builds the
    mock device ids once instead of twice.
    """
    cluster_shape = (1, 2, 3)
    grid: ArrayGrid = ArrayGrid(
        shape=(8, 20, 12), block_shape=(2, 5, 3), dtype="float32"
    )
    # A basic smoke test.
    device_ids: List[DeviceID] = mock_device_ids(int(np.prod(cluster_shape)))

    cyclic_grid: CyclicDeviceGrid = CyclicDeviceGrid(cluster_shape, "cpu", device_ids)
    touched_devices = {
        cyclic_grid.get_device_id(grid_entry, grid.grid_shape)
        for grid_entry in grid.get_entry_iterator()
    }
    assert len(touched_devices) == len(device_ids)

    packed_grid: PackedDeviceGrid = PackedDeviceGrid(cluster_shape, "cpu", device_ids)
    touched_devices = {
        packed_grid.get_device_id(grid_entry, grid.grid_shape)
        for grid_entry in grid.get_entry_iterator()
    }
    assert len(touched_devices) == len(device_ids)
def create_references(self, concrete_cls) -> BlockArrayBase:
    """Create a block array whose blocks reference the selected source blocks.

    Args:
        concrete_cls: Class used to construct the result; ``BlockArrayBase``
            is used when this is None.

    Returns:
        A new array over ``self.grid`` whose blocks carry the ``oid`` and
        ``transposed`` values of the corresponding source blocks. When the
        shape contains a zero, the freshly constructed array is returned
        without copying any references.

    The grid offsets are computed with the builtin ``int`` dtype; the former
    ``np.int`` alias was removed in NumPy 1.24.
    """
    # TODO (hme): Double check this.
    array_cls = BlockArrayBase if concrete_cls is None else concrete_cls
    dst_ba: BlockArrayBase = array_cls(self.grid, self._cm)
    if 0 in self.shape:
        return dst_ba
    grid_offset = self.sel.position().value // np.array(
        self._source.block_shape, dtype=int
    )
    dst_inflated_shape = self.sel.get_broadcastable_shape()
    dst_inflated_block_shape = self.sel.get_broadcastable_block_shape(
        self.block_shape
    )
    dst_inflated_grid: ArrayGrid = ArrayGrid(
        dst_inflated_shape, dst_inflated_block_shape, self.grid.dtype.__name__
    )
    # The inflated grid and the destination grid are walked in lockstep by
    # position; indexing (rather than zip) keeps a length mismatch loud.
    dst_grid_entry_iterator = list(dst_ba.grid.get_entry_iterator())
    for dst_index, dst_inflated_grid_entry in enumerate(
        dst_inflated_grid.get_entry_iterator()
    ):
        dst_grid_entry = dst_grid_entry_iterator[dst_index]
        src_grid_entry = tuple(
            (np.array(dst_inflated_grid_entry, dtype=int) + grid_offset).tolist()
        )
        src_block = self._source.blocks[src_grid_entry]
        dst_ba.blocks[dst_grid_entry].oid = src_block.oid
        dst_ba.blocks[dst_grid_entry].transposed = src_block.transposed
    return dst_ba
def arange(self, start_in, shape, block_shape, step=1, dtype=None) -> BlockArray:
    """Create a blocked range starting at ``start_in``, one ``cm.arange`` call per block.

    Args:
        start_in: First value of the range.
        shape: Shape of the output; only the first axis is used for ranges.
        block_shape: Block shape of the output.
        step: Must be 1.
        dtype: Output dtype; when None it is the NumPy result type of
            ``start_in`` and ``shape[0] + start_in``.

    Returns:
        A BlockArray.
    """
    assert step == 1
    if dtype is None:
        inferred_name = str(np.result_type(start_in, shape[0] + start_in))
        dtype = getattr(np, inferred_name)

    # Generate ranges per block.
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    rarr = BlockArray(grid, self.cm)
    for grid_entry in grid.get_entry_iterator():
        block_start = start_in + block_shape[0] * grid_entry[0]
        block_stop = block_start + grid.get_block_shape(grid_entry)[0]
        rarr.blocks[grid_entry].oid = self.cm.arange(
            block_start,
            block_stop,
            step,
            dtype,
            syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape},
        )
    return rarr
def from_oid(cls, oid, shape, dtype, cm):
    """Wrap a single object id in a BlockArray whose block shape equals its shape.

    Asserts that the resulting grid yields at most one entry.
    """
    single_block_grid = ArrayGrid(shape, shape, dtype.__name__)
    ba = BlockArray(single_block_grid, cm)
    entries_seen = 0
    for grid_entry in single_block_grid.get_entry_iterator():
        # Block shape equals shape, so a second entry is unexpected.
        assert entries_seen == 0
        entries_seen += 1
        ba.blocks[grid_entry].oid = oid
    return ba
def triu(self, X: BlockArray):
    """Compute an upper-triangular BlockArray using per-block ``cm.triu`` calls.

    A 1-axis input is first expanded with ``_stack_copy``. For 2-axis input:
    blocks reported by ``array_utils.find_diag_output_blocks`` are processed
    with the flag ``False``; every block in the same block column and in a
    later block row is processed with the flag ``True``; all other blocks
    reuse the object id of the matching block in X.

    Raises:
        NotImplementedError: If X has more than two axes.
    """
    num_axes = len(X.shape)
    if num_axes == 1:
        return self.triu(self._stack_copy(X))
    if num_axes != 2:
        raise NotImplementedError()
    if X.shape[0] == 1:
        return X

    diag_meta = array_utils.find_diag_output_blocks(X.blocks, min(X.shape))
    output_arr_grid = ArrayGrid(X.shape, X.block_shape, X.dtype.__name__)
    output_block_array = BlockArray(output_arr_grid, self.cm)
    out_blocks = output_block_array.blocks
    total_row_blocks, total_col_blocks = X.blocks.shape[0], X.blocks.shape[1]
    visited = set()

    # Pass 1: blocks containing the diagonal.
    for block_indices, offset, total_elements in diag_meta:
        out_blocks[block_indices].oid = self.cm.triu(
            X.blocks[block_indices].oid,
            offset,
            False,
            total_elements,
            syskwargs={
                "grid_entry": block_indices,
                "grid_shape": output_arr_grid.grid_shape,
            },
        )
        visited.add(block_indices)

    # Pass 2: blocks below each diagonal block, in the same block column.
    for block_indices, offset, total_elements in diag_meta:
        row_c, col_c = block_indices[0] + 1, block_indices[1]
        while row_c < total_row_blocks:
            entry = (row_c, col_c)
            # A block already written in an earlier step is re-processed from
            # the output; otherwise the input block is the source.
            source = out_blocks[entry] if entry in visited else X.blocks[entry]
            out_blocks[entry].oid = self.cm.triu(
                source.oid,
                offset,
                True,
                total_elements,
                syskwargs={
                    "grid_entry": entry,
                    "grid_shape": output_arr_grid.grid_shape,
                },
            )
            visited.add(entry)
            row_c += 1

    # Pass 3: untouched blocks share the input's object ids.
    for i in range(total_row_blocks):
        for j in range(total_col_blocks):
            if (i, j) in visited:
                continue
            out_blocks[(i, j)].oid = X.blocks[(i, j)].oid
    return output_block_array
def indirect_tsr(app: ArrayApplication, X: BlockArray, reshape_output=True):
    """Compute the R factor of X via a per-row-block QR followed by a QR of the stacked Rs.

    Args:
        app: The array application providing the compute manager.
        X: A 2-axis BlockArray whose block height is at least its column count.
        reshape_output: When True and the single-block result differs from
            the target block shape, reshape R to blocks of
            ``(X.block_shape[1], X.block_shape[1])``.

    Returns:
        A BlockArray of shape ``(X.shape[1], X.shape[1])``.
    """
    assert len(X.shape) == 2
    # TODO (hme): This assertion is temporary and ensures returned
    #  shape of qr of block is correct.
    assert X.block_shape[0] >= X.shape[1]

    # Compute R for each block.
    grid_shape = X.grid.grid_shape
    num_block_rows, num_block_cols = grid_shape[0], grid_shape[1]
    R_oids = []
    # Assume no blocking along second dim.
    for i in range(num_block_rows):
        # Select a row according to block_shape.
        row_oids = [X.blocks[i, j].oid for j in range(num_block_cols)]
        R_oids.append(
            app.cm.qr(
                *row_oids,
                mode="r",
                axis=1,
                syskwargs={
                    "grid_entry": (i, 0),
                    "grid_shape": (num_block_rows, 1),
                    "options": {"num_returns": 1},
                },
            )
        )

    # Construct R by summing over R blocks.
    # TODO (hme): Communication may be inefficient due to redundancy of data.
    num_cols = X.shape[1]
    block_cols = X.block_shape[1]
    R_shape = (num_cols, num_cols)
    R_block_shape = (block_cols, block_cols)
    tsR = BlockArray(
        ArrayGrid(shape=R_shape, block_shape=R_shape, dtype=X.dtype.__name__),
        app.cm,
    )
    tsR.blocks[0, 0].oid = app.cm.qr(
        *R_oids,
        mode="r",
        axis=0,
        syskwargs={
            "grid_entry": (0, 0),
            "grid_shape": (1, 1),
            "options": {"num_returns": 1},
        },
    )

    # If blocking is "tall-skinny," then we're done.
    if R_shape != R_block_shape and reshape_output:
        return tsR.reshape(R_shape, block_shape=R_block_shape)
    return tsR
def diag(self, X: BlockArray) -> BlockArray:
    """Build a diagonal matrix from a vector, or extract the diagonal of a square matrix.

    Args:
        X: A BlockArray with one axis, or a square BlockArray with two axes
            and a square block shape.

    Returns:
        For 1-axis input, a square BlockArray whose diagonal blocks come from
        ``cm.diag`` and whose other blocks are "zeros" blocks. For 2-axis
        input, a 1-axis BlockArray filled from the diagonal blocks of X.

    Raises:
        ValueError: If X has neither 1 nor 2 axes.
    """
    num_axes = len(X.shape)
    if num_axes == 1:
        side, block_side = X.shape[0], X.block_shape[0]
        grid = ArrayGrid((side, side), (block_side, block_side), X.dtype.__name__)
        grid_meta = grid.to_meta()
        rarr = BlockArray(grid, self.cm)
        for grid_entry in grid.get_entry_iterator():
            syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape}
            on_diagonal = np.all(np.diff(grid_entry) == 0)
            if on_diagonal:
                # This is a diagonal block.
                rarr.blocks[grid_entry].oid = self.cm.diag(
                    X.blocks[grid_entry[0]].oid, syskwargs=syskwargs
                )
            else:
                rarr.blocks[grid_entry].oid = self.cm.new_block(
                    "zeros", grid_entry, grid_meta, syskwargs=syskwargs
                )
    elif num_axes == 2:
        assert X.shape[0] == X.shape[1], "X must be a square array."
        assert X.block_shape[0] == X.block_shape[1], "block_shape must be square."
        grid = ArrayGrid((X.shape[0],), (X.block_shape[0],), X.dtype.__name__)
        rarr = BlockArray(grid, self.cm)
        out_grid_shape = grid.grid_shape[:1]
        for grid_entry in X.grid.get_entry_iterator():
            if not np.all(np.diff(grid_entry) == 0):
                # Off-diagonal blocks contribute nothing to the output.
                continue
            # This is a diagonal block.
            out_grid_entry = grid_entry[:1]
            rarr.blocks[out_grid_entry].oid = self.cm.diag(
                X.blocks[grid_entry].oid,
                syskwargs={
                    "grid_entry": out_grid_entry,
                    "grid_shape": out_grid_shape,
                },
            )
    else:
        raise ValueError("X must have 1 or 2 axes.")
    return rarr
def vec_from_oids(self, oids, shape, block_shape, dtype):
    """Wrap object ids in a BlockArray and reshape it to ``block_shape`` if needed.

    The array is first built with its block shape equal to ``shape``; grid
    entries receive ``oids`` in iteration order. It is reshaped only when
    ``block_shape`` differs from ``shape``.
    """
    single_grid = ArrayGrid(shape=shape, block_shape=shape, dtype=dtype.__name__)
    arr = BlockArray(single_grid, self.cm)
    # Make sure resulting grid shape is a vector (1 dimensional).
    grid_shape = arr.grid.grid_shape
    assert np.sum(grid_shape) == (max(grid_shape) + len(grid_shape) - 1)
    for position, grid_entry in enumerate(arr.grid.get_entry_iterator()):
        arr.blocks[grid_entry].oid = oids[position]
    if block_shape == shape:
        return arr
    return arr.reshape(block_shape=block_shape)
def from_np(cls, arr, block_shape, copy, cm):
    """Build a BlockArray from a NumPy array by putting each block slice via ``cm.put``.

    Args:
        arr: Source NumPy array.
        block_shape: Block shape of the resulting BlockArray.
        copy: When truthy, each slice is copied with ``np.copy`` before it is put.
        cm: Compute manager used to store the blocks.

    Returns:
        A BlockArray whose blocks carry the stored object ids and the NumPy
        type named by ``str(arr.dtype)``.
    """
    dtype_str = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_str)
    rarr = BlockArray(grid, cm)
    for grid_entry in grid.get_entry_iterator():
        chunk = arr[grid.get_slice(grid_entry)]
        payload = np.copy(chunk) if copy else chunk
        rarr.blocks[grid_entry].oid = cm.put(payload)
        rarr.blocks[grid_entry].dtype = getattr(np, dtype_str)
    return rarr
def train(params: Dict, data: NumsDMatrix, *args, evals=(), **kwargs):
    """Launch one ``xgb_train`` remote call per row block of the training data.

    Args:
        params: Parameters forwarded to the remote training function.
        data: Container exposing the feature array ``X`` and label array ``y``.
        *args: Extra positional arguments forwarded to the remote function.
        evals: Iterable of ``(eval_X, eval_y, eval_method)`` triples; each
            array is reshaped into a single block before its object id is
            forwarded.
        **kwargs: Extra keyword arguments forwarded to the remote function.

    Returns:
        A 1-axis BlockArray with one block per row block of X, holding the
        object ids returned by the remote calls.
    """
    X: BlockArray = data.X
    y: BlockArray = data.y
    assert len(X.shape) == 2
    # X and y must agree on the number of rows and on the row blocking.
    # (The original compared X.shape[0] with itself, which always passed.)
    assert X.shape[0] == y.shape[0] and X.block_shape[0] == y.block_shape[0]
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

    app: ArrayApplication = _instance()
    cm: ComputeManager = app.cm
    cm.register("xgb_train", xgb_train_remote, {})

    # Start tracker
    num_workers = X.grid.grid_shape[0]
    env = _start_rabit_tracker(num_workers)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    evals_flat = []
    for eval_X, eval_y, eval_method in evals:
        # Each evaluation array is collapsed into a single block so that
        # blocks.item() below yields exactly one block.
        if eval_X.shape != eval_X.block_shape:
            eval_X = eval_X.reshape(shape=eval_X.shape, block_shape=eval_X.shape)
        if eval_y.shape != eval_y.block_shape:
            eval_y = eval_y.reshape(shape=eval_y.shape, block_shape=eval_y.shape)
        evals_flat += [
            eval_X.blocks.item().oid,
            eval_y.blocks.item().oid,
            eval_method,
        ]

    # Re-block X so every block spans all columns.
    X: BlockArray = X.reshape(block_shape=(X.block_shape[0], X.shape[1]))
    result: BlockArray = BlockArray(
        ArrayGrid(shape=(X.grid.grid_shape[0],), block_shape=(1,), dtype="dict"),
        cm,
    )
    y_is_vector = len(y.shape) == 1
    for grid_entry in X.grid.get_entry_iterator():
        X_block: Block = X.blocks[grid_entry]
        i = grid_entry[0]
        y_block: Block = y.blocks[i] if y_is_vector else y.blocks[i, 0]
        syskwargs = {"grid_entry": grid_entry, "grid_shape": X.grid.grid_shape}
        result.blocks[i].oid = cm.call(
            "xgb_train",
            X_block.oid,
            y_block.oid,
            rabit_args,
            params,
            args,
            kwargs,
            *evals_flat,
            syskwargs=syskwargs,
        )
    return result
def read_csv(self, filename, dtype=float, delimiter=",", has_header=False, num_workers=4):
    """Read a CSV file in byte-range batches and return one BlockArray per non-empty batch.

    Args:
        filename: Path of the CSV file.
        dtype: Element type forwarded to the remote reader.
        delimiter: Field delimiter forwarded to the remote reader.
        has_header: Forwarded to the remote reader.
        num_workers: Number of batches the file is split into.

    Returns:
        A list of single-block BlockArrays; batches whose reported shape has
        zero rows are skipped.
    """
    file_size = storage_utils.get_file_size(filename)
    file_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(
        file_size, num_workers
    )
    block_oids = []
    shape_oids = []
    for batch_index, (file_start, file_end) in enumerate(file_batches.batches):
        block_oid, shape_oid = self.cm.call(
            "read_csv_block",
            filename,
            file_start,
            file_end,
            dtype,
            delimiter,
            has_header,
            syskwargs={
                "grid_entry": (batch_index,),
                "grid_shape": (num_workers,),
                "options": {"num_returns": 2},
            },
        )
        block_oids.append(block_oid)
        shape_oids.append(shape_oid)

    # The shapes are fetched because they are needed to build the grids.
    shapes = self.cm.get(shape_oids)
    arrays = []
    for batch_index, shape in enumerate(shapes):
        if shape[0] == 0:
            continue
        grid = ArrayGrid(shape=shape, block_shape=shape, dtype=dtype.__name__)
        arr = BlockArray(grid, self.cm)
        entries_seen = 0
        for grid_entry in grid.get_entry_iterator():
            # Block shape equals shape, so only one entry is expected.
            assert entries_seen == 0
            entries_seen += 1
            arr.blocks[grid_entry].oid = block_oids[batch_index]
        arrays.append(arr)
    return arrays
def test_array_rwd():
    """Round-trip a small random array through StoredArrayS3: put, get, delete."""
    conn = boto3.resource("s3", region_name="us-east-1")
    # The bucket must not exist before the test creates it.
    assert conn.Bucket("darrays") not in conn.buckets.all()
    conn.create_bucket(Bucket="darrays")

    X: np.ndarray = np.random.random(3)
    single_block_grid = ArrayGrid(
        shape=X.shape, block_shape=X.shape, dtype=np.float64.__name__
    )
    stored_X = StoredArrayS3("darrays/%s_X" % "__test__")
    stored_X.put_grid(single_block_grid)
    stored_X.init_grid()
    stored_X.put_array(X)
    assert np.allclose(X, stored_X.get_array())

    stored_X.del_array()
    stored_X.delete_grid()
def empty(cls, shape, block_shape, dtype, cm: ComputeManager):
    """Create a BlockArray whose blocks are each produced by ``cm.empty``.

    Args:
        shape: Shape of the array.
        block_shape: Shape of each block.
        dtype: Element type; its ``__name__`` is stored in the grid.
        cm: Compute manager that creates the blocks.

    Returns:
        A BlockArray.
    """
    grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__)
    grid_meta = grid.to_meta()
    arr = BlockArray(grid, cm)
    for grid_entry in grid.get_entry_iterator():
        placement = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape}
        arr.blocks[grid_entry].oid = cm.empty(
            grid_entry, grid_meta, syskwargs=placement
        )
    return arr
def test_bounds():
    """With a single-device cluster of any rank, every grid entry maps to the all-zero entry."""
    grid: ArrayGrid = ArrayGrid(
        shape=(2, 6, 10), block_shape=(1, 2, 5), dtype="float32"
    )
    single_device_clusters = [(1,), (1, 1), (1, 1, 1), (1, 1, 1, 1)]
    for cluster_shape in single_device_clusters:
        cyclic_grid: CyclicDeviceGrid = CyclicDeviceGrid(
            cluster_shape, "cpu", mock_device_ids(1)
        )
        packed_grid: PackedDeviceGrid = PackedDeviceGrid(
            cluster_shape, "cpu", mock_device_ids(1)
        )
        cyclic_origin = (0,) * len(cyclic_grid.grid_shape)
        packed_origin = (0,) * len(packed_grid.grid_shape)
        for grid_entry in grid.get_entry_iterator():
            assert (
                cyclic_grid.get_cluster_entry(grid_entry, grid.grid_shape)
                == cyclic_origin
            )
            assert (
                packed_grid.get_cluster_entry(grid_entry, grid.grid_shape)
                == packed_origin
            )
def _delete(self, filename, store_cls, remote_func):
    """Invoke ``remote_func`` for every block of a stored array and collect the results.

    Args:
        filename: Name of the stored array.
        store_cls: Store class passed to ``_get_array_grid`` to load the grid.
        remote_func: Callable invoked once per grid entry with
            ``(filename, grid_entry, grid_meta, syskwargs=...)``.

    Returns:
        A BlockArray with one unit-sized block per block of the stored array,
        holding the object ids returned by ``remote_func``.

    The unit block shape is built with the builtin ``int`` dtype; the former
    ``np.int`` alias was removed in NumPy 1.24.
    """
    grid = self._get_array_grid(filename, store_cls)
    # One result element per stored block: the result's shape is the stored
    # grid's shape, and every result block has size one along each axis.
    result_grid = ArrayGrid(
        grid.grid_shape,
        tuple(np.ones_like(grid.shape, dtype=int)),
        dtype=dict.__name__,
    )
    rarr = BlockArray(result_grid, self.cm)
    grid_meta = grid.to_meta()
    for grid_entry in grid.get_entry_iterator():
        rarr.blocks[grid_entry].oid = remote_func(
            filename,
            grid_entry,
            grid_meta,
            syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape},
        )
    return rarr
def _write(self, ba: BlockArray, filename, remote_func):
    """Invoke ``remote_func`` for every block of ``ba`` and collect the results.

    Args:
        ba: The BlockArray whose blocks are passed to ``remote_func``.
        filename: Name forwarded to ``remote_func``.
        remote_func: Callable invoked once per grid entry with
            ``(block_oid, filename, grid_entry, grid_meta, syskwargs=...)``.

    Returns:
        A BlockArray with one unit-sized block per block of ``ba``, holding
        the object ids returned by ``remote_func``.

    The unit block shape is built with the builtin ``int`` dtype; the former
    ``np.int`` alias was removed in NumPy 1.24.
    """
    grid = ba.grid
    # One result element per block of ba, each in its own unit-sized block.
    result_grid = ArrayGrid(
        grid.grid_shape,
        tuple(np.ones_like(grid.shape, dtype=int)),
        dtype=dict.__name__,
    )
    rarr = BlockArray(result_grid, self.cm)
    grid_meta = grid.to_meta()
    for grid_entry in grid.get_entry_iterator():
        rarr.blocks[grid_entry].oid = remote_func(
            ba.blocks[grid_entry].oid,
            filename,
            grid_entry,
            grid_meta,
            syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape},
        )
    return rarr
def _new_array(
    self, op_name: str, shape: tuple, block_shape: tuple, dtype: np.dtype = None
):
    """Create a BlockArray whose blocks are produced by ``cm.new_block(op_name, ...)``.

    Args:
        op_name: Name of the block-creation operation forwarded to the
            compute manager.
        shape: Shape of the array.
        block_shape: Shape of each block; must have as many axes as ``shape``.
        dtype: Element type; ``np.float64`` when None.

    Returns:
        A BlockArray.
    """
    assert len(shape) == len(block_shape)
    dtype = np.float64 if dtype is None else dtype
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    grid_meta = grid.to_meta()
    rarr = BlockArray(grid, self.cm)
    for grid_entry in grid.get_entry_iterator():
        placement = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape}
        rarr.blocks[grid_entry].oid = self.cm.new_block(
            op_name, grid_entry, grid_meta, syskwargs=placement
        )
    return rarr
def test_split(app_inst: ArrayApplication):
    """Split one 4-element block in two with ``cm.split`` and check the reassembled values."""
    # TODO (hme): Implement a split leveraging block_shape param in reshape op.
    x = app_inst.array(np.array([1.0, 2.0, 3.0, 4.0]), block_shape=(4,))
    source_block = x.blocks[0]
    res1, res2 = x.cm.split(
        source_block.oid,
        2,
        axis=0,
        transposed=False,
        syskwargs={
            "grid_entry": source_block.grid_entry,
            "grid_shape": source_block.grid_shape,
            "options": {"num_returns": 2},
        },
    )
    # Reassemble the two halves as a two-block array.
    ba = BlockArray(ArrayGrid((4,), (2,), x.dtype.__name__), x.cm)
    for half_index, half_oid in enumerate((res1, res2)):
        ba.blocks[half_index].oid = half_oid
    assert np.allclose([1.0, 2.0, 3.0, 4.0], ba.get())
def permutation(self, size, block_size):
    """Create a permutation of length ``size`` and re-block it to ``block_size``.

    The array is first built with its block shape equal to its shape, its
    blocks are filled by ``cm.permutation`` with fresh RNG parameters, and
    the result is reshaped to blocks of ``(block_size,)``.
    """
    full_shape = (size,)
    grid: ArrayGrid = ArrayGrid(
        shape=full_shape, block_shape=full_shape, dtype=np.int64.__name__
    )
    ba = BlockArray(grid, self._cm)
    for grid_entry in ba.grid.get_entry_iterator():
        rng_params = list(self._rng.new_block_rng_params())
        target: Block = ba.blocks[grid_entry]
        target.oid = self._cm.permutation(
            rng_params,
            size,
            syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape},
        )
    return ba.reshape(block_shape=(block_size,))
def map_uop(
    self,
    op_name: str,
    arr: BlockArray,
    out: BlockArray = None,
    where=True,
    args=None,
    kwargs=None,
) -> BlockArray:
    """Apply an element-wise unary operator to every block of an array.

    Args:
        op_name: An element-wise unary operator.
        arr: A BlockArray.
        out: A BlockArray to which the result is written; it must match
            arr's shape and block shape. A new array is created when None.
        where: An indicator specifying the indices to which op is applied.
            Only ``True`` is supported.
        args: Args provided to op.
        kwargs: Keyword args provided to op.

    Returns:
        A BlockArray.

    Raises:
        NotImplementedError: If ``where`` is anything other than ``True``.
    """
    if where is not True:
        raise NotImplementedError("'where' argument is not yet supported.")
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}
    shape, block_shape = arr.shape, arr.block_shape
    dtype = array_utils.get_uop_output_type(op_name, arr.dtype)
    assert len(shape) == len(block_shape)
    if out is not None:
        rarr = out
        grid = rarr.grid
        assert rarr.shape == arr.shape and rarr.block_shape == arr.block_shape
    else:
        grid = ArrayGrid(shape, block_shape, dtype.__name__)
        rarr = BlockArray(grid, self.cm)
    for grid_entry in grid.get_entry_iterator():
        # TODO(hme): Faster to create ndarray first,
        #  and instantiate block array on return
        #  to avoid instantiating blocks on BlockArray initialization.
        source_block = arr.blocks[grid_entry]
        rarr.blocks[grid_entry] = source_block.uop_map(
            op_name, args=args, kwargs=kwargs
        )
    return rarr
def delete_fs(self, filename: str):
    """Delete every stored block of ``filename`` and then its metadata.

    The block addresses and grid metadata are read with ``read_meta_fs``;
    ``delete_block_fs`` is called for each recorded address on the device
    parsed from that address, after which ``delete_meta_fs`` is called.

    Args:
        filename: Name of the stored array.

    Returns:
        A BlockArray with one unit-sized block per stored block, holding the
        object ids returned by ``delete_block_fs``.

    The unit block shape is built with the builtin ``int`` dtype; the former
    ``np.int`` alias was removed in NumPy 1.24.
    """
    meta = self._fs.read_meta_fs(filename)
    addresses = meta["addresses"]
    grid_meta = meta["grid_meta"]
    grid = ArrayGrid.from_meta(grid_meta)
    # One result element per stored block, each in its own unit-sized block.
    result_grid = ArrayGrid(
        grid.grid_shape,
        tuple(np.ones_like(grid.shape, dtype=int)),
        dtype=dict.__name__,
    )
    rarr = BlockArray(result_grid, self.cm)
    for grid_entry in addresses:
        device_id: DeviceID = DeviceID.from_str(addresses[grid_entry])
        rarr.blocks[grid_entry].oid = self._fs.delete_block_fs(
            filename, grid_entry, grid_meta, syskwargs={"device_id": device_id}
        )
    self._fs.delete_meta_fs(filename)
    return rarr