def assign_references(self, dst_sel: BasicSelection, value):
    # TODO (hme): This seems overly complicated, but correct. Double check it.
    #  Also, revisit some of the variable names. They will likely
    #  be confusing in the future.
    # The destination has same block shape as value,
    # but the destination selection may not have the same shape as value.
    # May need to broadcast value to destination selection output shape.
    dst_offset = dst_sel.position().value // np.array(self._source.block_shape,
                                                      dtype=np.int)
    # Do we need to broadcast?
    if (isinstance(value, ArrayView)
            and (dst_sel.get_output_shape() != value.sel.get_output_shape())):
        value = value.create()
    if isinstance(value, ArrayView):
        # This is the best case.
        # We don't need to create value to perform the reference copy.
        # No broadcasting required, so this should be okay.
        src_offset = value.sel.position().value // np.array(value._source.block_shape,
                                                            dtype=np.int)
        src_inflated_shape = dst_sel.get_broadcastable_shape()
        src_inflated_block_shape = dst_sel.get_broadcastable_block_shape(value.block_shape)
        src_inflated_grid: ArrayGrid = ArrayGrid(src_inflated_shape,
                                                 src_inflated_block_shape,
                                                 self.grid.dtype.__name__)
        for src_grid_entry_inflated in src_inflated_grid.get_entry_iterator():
            # Num axes in value grid may be too small.
            dst_grid_entry = tuple((np.array(src_grid_entry_inflated, dtype=np.int)
                                    + dst_offset).tolist())
            src_grid_entry = tuple((np.array(src_grid_entry_inflated, dtype=np.int)
                                    + src_offset).tolist())
            self._source.blocks[dst_grid_entry] = value._source.blocks[src_grid_entry].copy()
    elif isinstance(value, BlockArrayBase):
        # The value has already been created, so just leverage value's existing grid iterator.
        if value.shape != dst_sel.get_output_shape():
            # Need to broadcast.
            src_ba: BlockArrayBase = broadcast_to(value, dst_sel.get_output_shape())
        else:
            src_ba: BlockArrayBase = value
        src_inflated_shape = dst_sel.get_broadcastable_shape()
        src_inflated_block_shape = dst_sel.get_broadcastable_block_shape(src_ba.block_shape)
        src_inflated_grid: ArrayGrid = ArrayGrid(src_inflated_shape,
                                                 src_inflated_block_shape,
                                                 self.grid.dtype.__name__)
        src_grid_entry_iterator = list(src_ba.grid.get_entry_iterator())
        for src_index, src_grid_entry_inflated in \
                enumerate(src_inflated_grid.get_entry_iterator()):
            src_grid_entry = src_grid_entry_iterator[src_index]
            dst_grid_entry = tuple((np.array(src_grid_entry_inflated, dtype=np.int)
                                    + dst_offset).tolist())
            self._source.blocks[dst_grid_entry] = src_ba.blocks[src_grid_entry].copy()
def argop(self, op_name: str, arr: BlockArray, axis=None):
    if len(arr.shape) > 1:
        raise NotImplementedError("%s currently supports one-dimensional arrays." % op_name)
    if axis is None:
        axis = 0
    assert axis == 0
    grid = ArrayGrid(shape=(), block_shape=(), dtype=np.int64.__name__)
    result = BlockArray(grid, self.system)
    reduction_result = None, None
    for grid_entry in arr.grid.get_entry_iterator():
        block_slice: slice = arr.grid.get_slice(grid_entry)[0]
        block: Block = arr.blocks[grid_entry]
        syskwargs = {
            "grid_entry": grid_entry,
            "grid_shape": arr.grid.grid_shape,
            "options": {"num_returns": 2},
        }
        reduction_result = self.system.arg_op(op_name,
                                              block.oid,
                                              block_slice,
                                              *reduction_result,
                                              syskwargs=syskwargs)
    argoptima, _ = reduction_result
    result.blocks[()].oid = argoptima
    return result
def reduce_axis(self, op_name, axis, keepdims=False):
    result_blocks = np.empty_like(self.blocks, dtype=Block)
    for grid_entry in self.grid.get_entry_iterator():
        result_blocks[grid_entry] = self.blocks[grid_entry].reduce_axis(op_name,
                                                                        axis,
                                                                        keepdims=keepdims)
    result_shape = []
    result_block_shape = []
    for curr_axis in range(len(self.shape)):
        axis_size, axis_block_size = self.shape[curr_axis], self.block_shape[curr_axis]
        if curr_axis == axis:
            if keepdims:
                axis_size, axis_block_size = 1, 1
            else:
                continue
        result_shape.append(axis_size)
        result_block_shape.append(axis_block_size)
    result_shape = tuple(result_shape)
    result_block_shape = tuple(result_block_shape)
    result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype)
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=result_dtype.__name__)
    result = BlockArray(result_grid, self.system)
    op_func = np.__getattribute__(op_name)
    reduced_blocks = op_func(result_blocks, axis=axis, keepdims=keepdims)
    if result.shape == ():
        result.blocks[()] = reduced_blocks
    else:
        result.blocks = reduced_blocks
    return result
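# Illustrative sketch (plain NumPy, not part of the NumS source): reduce_axis above
# reduces each block locally and then applies the same op across blocks along the
# reduced axis. The 4x6 array and 2x2 block grid below are assumptions chosen only
# to demonstrate that the two-level reduction matches a direct reduction.
def _sketch_blocked_reduce_axis():
    import numpy as np
    x = np.arange(24, dtype=float).reshape(4, 6)
    # A 2x2 grid of blocks.
    blocks = [[x[:2, :3], x[:2, 3:]],
              [x[2:, :3], x[2:, 3:]]]
    # Reduce each block along axis 0, then reduce across the block grid's axis 0.
    per_block = [[b.sum(axis=0) for b in row] for row in blocks]
    combined = np.concatenate([per_block[0][j] + per_block[1][j] for j in range(2)])
    assert np.allclose(combined, x.sum(axis=0))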
def _tensordot(self, other, axes):
    this_axes = self.grid.grid_shape[:-axes]
    this_sum_axes = self.grid.grid_shape[-axes:]
    other_axes = other.grid.grid_shape[axes:]
    other_sum_axes = other.grid.grid_shape[:axes]
    assert this_sum_axes == other_sum_axes
    result_shape = tuple(self.shape[:-axes] + other.shape[axes:])
    result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:])
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=array_utils.get_bop_output_type("tensordot",
                                                                  self.dtype,
                                                                  other.dtype).__name__)
    assert result_grid.grid_shape == tuple(this_axes + other_axes)
    result = BlockArray(result_grid, self.system)
    this_dims = list(itertools.product(*map(range, this_axes)))
    other_dims = list(itertools.product(*map(range, other_axes)))
    sum_dims = list(itertools.product(*map(range, this_sum_axes)))
    for i in this_dims:
        for j in other_dims:
            grid_entry = tuple(i + j)
            result_block = None
            for k in sum_dims:
                self_block: Block = self.blocks[tuple(i + k)]
                other_block: Block = other.blocks[tuple(k + j)]
                dotted_block = self_block.tensordot(other_block, axes=axes)
                if result_block is None:
                    result_block = dotted_block
                else:
                    result_block += dotted_block
            result.blocks[grid_entry] = result_block
    return result
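# Illustrative sketch (plain NumPy, not part of the NumS source): the blocked
# tensordot above accumulates, for each output block (i, j), the sum over k of
# tensordot(A_block[i, k], B_block[k, j]). The 4x4 arrays and 2x2 blocking below
# are assumptions used only to check that this matches np.tensordot on the full arrays.
def _sketch_blocked_tensordot():
    import numpy as np
    A = np.arange(16, dtype=float).reshape(4, 4)
    B = np.arange(16, dtype=float).reshape(4, 4) + 1.0
    A_blocks = [[A[:2, :2], A[:2, 2:]], [A[2:, :2], A[2:, 2:]]]
    B_blocks = [[B[:2, :2], B[:2, 2:]], [B[2:, :2], B[2:, 2:]]]
    out = np.zeros((4, 4))
    for i in range(2):
        for j in range(2):
            # Sum of block products over the shared (contracted) block axis.
            block = sum(np.tensordot(A_blocks[i][k], B_blocks[k][j], axes=1)
                        for k in range(2))
            out[i * 2:(i + 1) * 2, j * 2:(j + 1) * 2] = block
    assert np.allclose(out, np.tensordot(A, B, axes=1))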
def _sample_basic_sparse(self, density, format, shape, block_shape, dtype) -> BlockArray:
    if shape is None:
        assert block_shape is None
        shape = ()
        block_shape = ()
    else:
        assert block_shape is not None
    if dtype is None:
        dtype = np.float64
    assert isinstance(dtype, type)
    grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype=dtype.__name__)
    ba: SparseBlockArray = SparseBlockArray(grid, self._system)
    for grid_entry in ba.grid.get_entry_iterator():
        # Size and dtype to begin with.
        m, n = grid.get_block_shape(grid_entry)
        block = ba.blocks[grid_entry]
        block.oid = self._system.random_block_sparse(m,
                                                     n,
                                                     density,
                                                     format,
                                                     dtype,
                                                     syskwargs={
                                                         "grid_entry": grid_entry,
                                                         "grid_shape": grid.grid_shape
                                                     })
    return ba
def create_references(self, concrete_cls) -> BlockArrayBase:
    # TODO (hme): Double check this.
    array_cls = BlockArrayBase if concrete_cls is None else concrete_cls
    dst_ba: BlockArrayBase = array_cls(self.grid, self._system)
    if 0 in self.shape:
        return dst_ba
    grid_offset = self.sel.position().value // np.array(self._source.block_shape,
                                                        dtype=np.int)
    dst_inflated_shape = self.sel.get_broadcastable_shape()
    dst_inflated_block_shape = self.sel.get_broadcastable_block_shape(self.block_shape)
    dst_inflated_grid: ArrayGrid = ArrayGrid(dst_inflated_shape,
                                             dst_inflated_block_shape,
                                             self.grid.dtype.__name__)
    dst_grid_entry_iterator = list(dst_ba.grid.get_entry_iterator())
    for dst_index, dst_inflated_grid_entry in enumerate(
            dst_inflated_grid.get_entry_iterator()):
        dst_grid_entry = dst_grid_entry_iterator[dst_index]
        src_grid_entry = tuple((np.array(dst_inflated_grid_entry, dtype=np.int)
                                + grid_offset).tolist())
        dst_ba.blocks[dst_grid_entry].oid = self._source.blocks[src_grid_entry].oid
        dst_ba.blocks[dst_grid_entry].transposed \
            = self._source.blocks[src_grid_entry].transposed
    return dst_ba
def from_oid(cls, oid, shape, dtype, system):
    block_shape = shape
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    ba = BlockArray(grid, system)
    for i, grid_entry in enumerate(grid.get_entry_iterator()):
        assert i == 0
        ba.blocks[grid_entry].oid = oid
    return ba
def _vecdot(self, other):
    assert self.shape[-1] == other.shape[0], str((self.shape[-1], other.shape[0]))
    result_shape = tuple(self.shape[:-1] + other.shape[1:])
    result_block_shape = tuple(self.block_shape[:-1] + other.block_shape[1:])
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=self.dtype.__name__)
    result = BlockArray(result_grid, self.system)
    self_num_axes = len(self.grid.grid_shape)
    other_num_axes = len(other.grid.grid_shape)
    oids = []
    for i in range(self.grid.grid_shape[-1]):
        self_grid_entry = tuple(i if axis == self_num_axes - 1 else 0
                                for axis in range(self_num_axes))
        other_grid_entry = tuple(i if axis == 0 else 0 for axis in range(other_num_axes))
        self_block: Block = self.blocks[self_grid_entry]
        other_block: Block = other.blocks[other_grid_entry]
        if self_block.transposed != other_block.transposed:
            # The vectors are aligned if their transpositions satisfy the xor relation.
            if self_block.transposed:
                # Use other grid entry for dot, because physically,
                # other block is located on same node as self block.
                sch_grid_entry = other_grid_entry
                sch_grid_shape = other.grid.grid_shape
            elif other_block.transposed:
                # Use self grid entry for dot.
                sch_grid_entry = self_grid_entry
                sch_grid_shape = self.grid.grid_shape
            else:
                raise Exception("Impossible.")
        else:
            # They're either both transposed or not.
            # Either way, one will need to be transmitted, so transmit other.
            sch_grid_entry = self_grid_entry
            sch_grid_shape = self.grid.grid_shape
        dot_oid = self.system.bop("tensordot",
                                  a1=self_block.oid,
                                  a2=other_block.oid,
                                  a1_shape=self_block.shape,
                                  a2_shape=other_block.shape,
                                  a1_T=self_block.transposed,
                                  a2_T=other_block.transposed,
                                  axes=1,
                                  syskwargs={
                                      "grid_entry": sch_grid_entry,
                                      "grid_shape": sch_grid_shape
                                  })
        oids.append(dot_oid)
    result_grid_entry = tuple(0 for _ in range(len(result.grid.grid_shape)))
    result_oid = self.system.sum_reduce(*oids,
                                        syskwargs={
                                            "grid_entry": result_grid_entry,
                                            "grid_shape": result.grid.grid_shape
                                        })
    result.blocks[result_grid_entry].oid = result_oid
    return result
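# Illustrative sketch (plain NumPy, not part of the NumS source): _vecdot above
# computes one partial dot product per pair of aligned blocks and then sums the
# partials. The length-6 vectors and block size of 2 below are assumptions.
def _sketch_blocked_vecdot():
    import numpy as np
    u = np.arange(6, dtype=float)
    v = np.arange(6, dtype=float) + 1.0
    # One tensordot per block pair, then a sum-reduce over the partials.
    partials = [np.tensordot(u[i:i + 2], v[i:i + 2], axes=1) for i in range(0, 6, 2)]
    assert np.allclose(sum(partials), u @ v)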
def indirect_tsr(self, X: BlockArray, reshape_output=True):
    assert len(X.shape) == 2
    # TODO (hme): This assertion is temporary and ensures returned
    #  shape of qr of block is correct.
    assert X.block_shape[0] >= X.shape[1]
    # Compute R for each block.
    grid = X.grid
    grid_shape = grid.grid_shape
    shape = X.shape
    block_shape = X.block_shape
    R_oids = []
    # Assume no blocking along second dim.
    for i in range(grid_shape[0]):
        # Select a row according to block_shape.
        row = []
        for j in range(grid_shape[1]):
            row.append(X.blocks[i, j].oid)
        R_oids.append(self._system.qr(*row,
                                      mode="r",
                                      axis=1,
                                      syskwargs={
                                          "grid_entry": (i, 0),
                                          "grid_shape": (grid_shape[0], 1),
                                          "options": {"num_return_vals": 1}
                                      }))
    # Construct R by summing over R blocks.
    # TODO (hme): Communication may be inefficient due to redundancy of data.
    R_shape = (shape[1], shape[1])
    R_block_shape = (block_shape[1], block_shape[1])
    tsR = BlockArray(ArrayGrid(shape=R_shape,
                               block_shape=R_shape,
                               dtype=X.dtype.__name__),
                     self._system)
    tsR.blocks[0, 0].oid = self._system.qr(*R_oids,
                                           mode="r",
                                           axis=0,
                                           syskwargs={
                                               "grid_entry": (0, 0),
                                               "grid_shape": (1, 1),
                                               "options": {"num_return_vals": 1}
                                           })
    # If blocking is "tall-skinny," then we're done.
    if R_shape != R_block_shape:
        if reshape_output:
            R = tsR.reshape(shape=R_shape, block_shape=R_block_shape)
        else:
            R = tsR
    else:
        R = tsR
    return R
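# Illustrative sketch (plain NumPy, not part of the NumS source): indirect_tsr
# follows the indirect tall-skinny QR idea, computing an R factor per block row
# and then an R of the stacked R factors. Up to sign conventions the result is an
# R factor of the full matrix, so R^T R == X^T X, which the check below verifies.
# The 8x3 shape and two-way row split are assumptions.
def _sketch_indirect_tsqr():
    import numpy as np
    rng = np.random.default_rng(0)
    X = rng.standard_normal((8, 3))
    # Per-block-row R factors, then R of the vertically stacked R factors.
    R_blocks = [np.linalg.qr(X[:4], mode="r"), np.linalg.qr(X[4:], mode="r")]
    R = np.linalg.qr(np.concatenate(R_blocks, axis=0), mode="r")
    assert np.allclose(R.T @ R, X.T @ X)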
def test_array_rwd():
    X: np.ndarray = np.random.random(3)
    stored_X = StoredArrayS3("darrays/%s_X" % "__test__")
    stored_X.put_grid(ArrayGrid(shape=X.shape,
                                block_shape=X.shape,
                                dtype=np.float64.__name__))
    stored_X.init_grid()
    stored_X.put_array(X)
    assert np.allclose(X, stored_X.get_array())
    stored_X.del_array()
    stored_X.delete_grid()
def create_basic_single_step(self, concrete_cls) -> BlockArrayBase:
    array_cls = BlockArrayBase if concrete_cls is None else concrete_cls
    dst_ba: BlockArrayBase = array_cls(self.grid, self._system)
    if 0 in self.shape:
        return dst_ba
    src_sel_arr: np.ndarray = selection.BasicSelection.block_selection(
        self._source.shape, self._source.block_shape)
    # TODO(hme): The following op is very slow for integer subscripts of large arrays.
    src_sel_clipped: np.ndarray = src_sel_arr & self.sel
    assert src_sel_clipped.shape == self._source.grid.grid_shape
    broadcast_shape = self.sel.get_broadcastable_shape()
    broadcast_block_shape = self.sel.get_broadcastable_block_shape(dst_ba.block_shape)
    dst_grid_bc: ArrayGrid = ArrayGrid(broadcast_shape,
                                       broadcast_block_shape,
                                       self.grid.dtype.__name__)
    dst_sel_arr: np.ndarray = selection.BasicSelection.block_selection(
        broadcast_shape, broadcast_block_shape)
    dst_sel_offset: np.ndarray = dst_sel_arr + self.sel.position()
    dst_entry_iterator = list(dst_ba.grid.get_entry_iterator())
    for dst_index, dst_grid_entry_bc in enumerate(dst_grid_bc.get_entry_iterator()):
        dst_sel_offset_block: BasicSelection = dst_sel_offset[dst_grid_entry_bc]
        if dst_sel_offset_block.is_empty():
            continue
        src_dst_intersection_arr = src_sel_clipped & dst_sel_offset_block
        sys: System = self._system
        src_oids = []
        src_params = []
        dst_params = []
        for _, src_grid_entry in enumerate(self._source.grid.get_entry_iterator()):
            src_dst_intersection_block: BasicSelection = src_dst_intersection_arr[
                src_grid_entry]
            if src_dst_intersection_block.is_empty():
                continue
            src_block: Block = self._source.blocks[src_grid_entry]
            src_oids.append(src_block.oid)
            src_sel_block: BasicSelection = src_sel_arr[src_grid_entry]
            src_dep_sel_loc = src_dst_intersection_block - src_sel_block.position()
            src_params.append((src_dep_sel_loc.selector(), src_block.transposed))
            dst_block_sel_loc = src_dst_intersection_block - dst_sel_offset_block.position()
            dst_params.append((dst_block_sel_loc.selector(), False))
        dst_block: Block = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[dst_grid_entry_bc]
        dst_block.oid = sys.create_block(*src_oids,
                                         src_params=src_params,
                                         dst_params=dst_params,
                                         dst_shape=dst_block.shape,
                                         dst_shape_bc=dst_sel_offset_block.get_output_shape(),
                                         syskwargs={
                                             "grid_entry": dst_entry_iterator[dst_index],
                                             "grid_shape": self.grid.grid_shape
                                         })
    return dst_ba
def _vec_from_oids(self, oids, shape, block_shape, dtype):
    arr = BlockArray(ArrayGrid(shape=shape,
                               block_shape=shape,
                               dtype=dtype.__name__),
                     self._system)
    # Make sure resulting grid shape is a vector (1 dimensional).
    assert np.sum(arr.grid.grid_shape) == (max(arr.grid.grid_shape)
                                           + len(arr.grid.grid_shape) - 1)
    for i, grid_entry in enumerate(arr.grid.get_entry_iterator()):
        arr.blocks[grid_entry].oid = oids[i]
    if block_shape != shape:
        return arr.reshape(block_shape=block_shape)
    return arr
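# Illustrative sketch (not part of the NumS source): the assertion in
# _vec_from_oids holds exactly when at most one grid axis is larger than 1,
# i.e. the grid is effectively one-dimensional. The example shapes below are
# assumptions used only to demonstrate the identity.
def _sketch_vector_grid_check():
    def is_vector_grid(grid_shape):
        # sum == max + (ndim - 1) iff every axis except at most one has size 1.
        return sum(grid_shape) == max(grid_shape) + len(grid_shape) - 1
    assert is_vector_grid((5, 1, 1))
    assert is_vector_grid((1, 1, 7))
    assert not is_vector_grid((2, 3))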
def from_np(cls, arr, block_shape, copy, system):
    dtype_str = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_str)
    rarr = SparseBlockArray(grid, system)
    grid_entry_iterator = grid.get_entry_iterator()
    for grid_entry in grid_entry_iterator:
        grid_slice = grid.get_slice(grid_entry)
        block = scipy.sparse.csr_matrix(arr[grid_slice])
        rarr.blocks[grid_entry].oid = system.put(block)
        rarr.blocks[grid_entry].dtype = getattr(np, dtype_str)
    return rarr
def diag(self, X: BlockArray) -> BlockArray:
    if len(X.shape) == 1:
        shape = X.shape[0], X.shape[0]
        block_shape = X.block_shape[0], X.block_shape[0]
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        grid_meta = grid.to_meta()
        rarr = BlockArray(grid, self.system)
        for grid_entry in grid.get_entry_iterator():
            syskwargs = {
                "grid_entry": grid_entry,
                "grid_shape": grid.grid_shape
            }
            if np.all(np.diff(grid_entry) == 0):
                # This is a diagonal block.
                rarr.blocks[grid_entry].oid = self.system.diag(X.blocks[grid_entry[0]].oid,
                                                               syskwargs=syskwargs)
            else:
                rarr.blocks[grid_entry].oid = self.system.new_block("zeros",
                                                                    grid_entry,
                                                                    grid_meta,
                                                                    syskwargs=syskwargs)
    elif len(X.shape) == 2:
        assert X.shape[0] == X.shape[1]
        assert X.block_shape[0] == X.block_shape[1]
        shape = X.shape[0],
        block_shape = X.block_shape[0],
        grid = ArrayGrid(shape, block_shape, X.dtype.__name__)
        rarr = BlockArray(grid, self.system)
        for grid_entry in X.grid.get_entry_iterator():
            out_grid_entry = grid_entry[:1]
            out_grid_shape = grid.grid_shape[:1]
            syskwargs = {
                "grid_entry": out_grid_entry,
                "grid_shape": out_grid_shape
            }
            if np.all(np.diff(grid_entry) == 0):
                # This is a diagonal block.
                rarr.blocks[out_grid_entry].oid = self.system.diag(X.blocks[grid_entry].oid,
                                                                   syskwargs=syskwargs)
    else:
        raise ValueError("X must have 1 or 2 axes.")
    return rarr
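# Illustrative sketch (plain NumPy, not part of the NumS source): diag above mirrors
# np.diag's dual behavior at block granularity, building a block-diagonal matrix from
# a vector (off-diagonal blocks are zero blocks) and extracting the diagonal from a
# square matrix (only diagonal blocks contribute). The shapes below are assumptions.
def _sketch_diag_duality():
    import numpy as np
    v = np.arange(4, dtype=float)
    D = np.diag(v)                      # vector -> diagonal matrix
    assert np.allclose(np.diag(D), v)   # matrix -> its diagonal
    # Off-diagonal blocks of D under a 2x2 block grid are all-zero blocks.
    assert np.all(D[:2, 2:] == 0) and np.all(D[2:, :2] == 0)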
def _matvec(self, other):
    # Schedule block matmult on existing block nodes of the matrix.
    # This is cheaper than moving matrix and vec blocks to result node.
    assert self.shape[1] == other.shape[0], str((self.shape[1], other.shape[0]))
    result_shape = tuple(self.shape[:1] + other.shape[1:])
    result_block_shape = tuple(self.block_shape[:1] + other.block_shape[1:])
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=self.dtype.__name__)
    result = BlockArray(result_grid, self.system)
    for i in range(self.grid.grid_shape[0]):
        row = []
        for j in range(self.grid.grid_shape[1]):
            grid_entry = (i, j)
            self_block: Block = self.blocks[grid_entry]
            if len(other.shape) == 2:
                other_block: Block = other.blocks[(grid_entry[1], 0)]
                result_grid_entry = (i, 0)
            else:
                other_block: Block = other.blocks[grid_entry[1]]
                result_grid_entry = (i,)
            if self_block.transposed:
                # Reverse grid shape and entry to obtain virtual layout of matrix blocks.
                sch_grid_shape = tuple(reversed(self.grid.grid_shape))
                sch_grid_entry = tuple(reversed(grid_entry))
            else:
                sch_grid_shape = self.grid.grid_shape
                sch_grid_entry = grid_entry
            dot_oid = self.system.bop("tensordot",
                                      a1=self_block.oid,
                                      a2=other_block.oid,
                                      a1_shape=self_block.shape,
                                      a2_shape=other_block.shape,
                                      a1_T=self_block.transposed,
                                      a2_T=other_block.transposed,
                                      axes=1,
                                      syskwargs={
                                          "grid_entry": sch_grid_entry,
                                          "grid_shape": sch_grid_shape
                                      })
            row.append(dot_oid)
        result_oid = self.system.sum_reduce(*row,
                                            syskwargs={
                                                "grid_entry": result_grid_entry,
                                                "grid_shape": result.grid.grid_shape
                                            })
        result.blocks[result_grid_entry].oid = result_oid
    return result
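# Illustrative sketch (plain NumPy, not part of the NumS source): _matvec above
# produces, for each block row i, the sum over j of A_block[i, j] @ v_block[j].
# The 4x4 matrix with a 2x2 block grid below is an assumption for the check.
def _sketch_blocked_matvec():
    import numpy as np
    A = np.arange(16, dtype=float).reshape(4, 4)
    v = np.arange(4, dtype=float)
    A_blocks = [[A[:2, :2], A[:2, 2:]], [A[2:, :2], A[2:, 2:]]]
    v_blocks = [v[:2], v[2:]]
    # Per-row sum-reduce over the block products.
    out = np.concatenate([sum(A_blocks[i][j] @ v_blocks[j] for j in range(2))
                          for i in range(2)])
    assert np.allclose(out, A @ v)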
def empty(cls, shape, block_shape, dtype, system):
    grid = ArrayGrid(shape=shape,
                     block_shape=block_shape,
                     dtype=dtype.__name__)
    grid_meta = grid.to_meta()
    arr = BlockArray(grid, system)
    for grid_entry in grid.get_entry_iterator():
        arr.blocks[grid_entry].oid = system.empty(grid_entry,
                                                  grid_meta,
                                                  syskwargs={
                                                      "grid_entry": grid_entry,
                                                      "grid_shape": grid.grid_shape
                                                  })
    return arr
def from_np(cls, arr, block_shape, copy, system):
    dtype_str = str(arr.dtype)
    grid = ArrayGrid(arr.shape, block_shape, dtype_str)
    rarr = BlockArray(grid, system)
    grid_entry_iterator = grid.get_entry_iterator()
    for grid_entry in grid_entry_iterator:
        grid_slice = grid.get_slice(grid_entry)
        block = arr[grid_slice]
        if copy:
            block = np.copy(block)
        rarr.blocks[grid_entry].oid = system.put(block)
        rarr.blocks[grid_entry].dtype = getattr(np, dtype_str)
    return rarr
def train(params: Dict, data: NumsDMatrix, *args, evals=(), **kwargs):
    X: BlockArray = data.X
    y: BlockArray = data.y
    assert len(X.shape) == 2
    assert X.shape[0] == y.shape[0] and X.block_shape[0] == y.block_shape[0]
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)
    app: ArrayApplication = _instance()
    sys: System = app.system
    sys.register("xgb_train", xgb_train_remote, {})

    # Start tracker.
    num_workers = X.grid.grid_shape[0]
    env = _start_rabit_tracker(num_workers)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    evals_flat = []
    for eval_X, eval_y, eval_method in evals:
        if eval_X.shape != eval_X.block_shape:
            eval_X = eval_X.reshape(shape=eval_X.shape, block_shape=eval_X.shape)
        if eval_y.shape != eval_y.block_shape:
            eval_y = eval_y.reshape(shape=eval_y.shape, block_shape=eval_y.shape)
        eval_X_oid = eval_X.blocks.item().oid
        eval_y_oid = eval_y.blocks.item().oid
        evals_flat += [eval_X_oid, eval_y_oid, eval_method]

    X: BlockArray = X.reshape(block_shape=(X.block_shape[0], X.shape[1]))
    result: BlockArray = BlockArray(ArrayGrid(shape=(X.grid.grid_shape[0],),
                                              block_shape=(1,),
                                              dtype="dict"),
                                    sys)
    for grid_entry in X.grid.get_entry_iterator():
        X_block: Block = X.blocks[grid_entry]
        i = grid_entry[0]
        if len(y.shape) == 1:
            y_block: Block = y.blocks[i]
        else:
            y_block: Block = y.blocks[i, 0]
        syskwargs = {"grid_entry": grid_entry, "grid_shape": X.grid.grid_shape}
        result.blocks[i].oid = sys.call("xgb_train",
                                        X_block.oid,
                                        y_block.oid,
                                        rabit_args,
                                        params,
                                        args,
                                        kwargs,
                                        *evals_flat,
                                        syskwargs=syskwargs)
    return result
def tensordot(self, other, axes=2):
    other = self.other_to_ba(other)
    # TODO: Reuse BlockArrayBase tensordot operator.
    this_axes = self.grid.grid_shape[:-axes]
    this_sum_axes = self.grid.grid_shape[-axes:]
    other_axes = other.grid.grid_shape[axes:]
    other_sum_axes = other.grid.grid_shape[:axes]
    assert this_sum_axes == other_sum_axes
    result_shape = tuple(self.shape[:-axes] + other.shape[axes:])
    result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:])
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=self.dtype.__name__)
    assert result_grid.grid_shape == tuple(this_axes + other_axes)
    result_graphs = np.empty(shape=result_grid.grid_shape, dtype=np.object)
    this_dims = list(itertools.product(*map(range, this_axes)))
    other_dims = list(itertools.product(*map(range, other_axes)))
    sum_dims = list(itertools.product(*map(range, this_sum_axes)))
    for i in this_dims:
        for j in other_dims:
            grid_entry = tuple(i + j)
            if len(sum_dims) == 1:
                k = sum_dims[0]
                self_node: TreeNode = self.graphs[tuple(i + k)]
                other_node: TreeNode = other.graphs[tuple(k + j)]
                dot_node: TreeNode = self_node.tensordot(other_node, axes=axes)
                result_graphs[grid_entry] = dot_node
            else:
                add_reduce_op = ReductionOp()
                add_reduce_op.cluster_state = self.cluster_state
                add_reduce_op.op_name = "add"
                add_reduce_op.copy_on_op = self.copy_on_op
                for k in sum_dims:
                    self_node: TreeNode = self.graphs[tuple(i + k)]
                    other_node: TreeNode = other.graphs[tuple(k + j)]
                    dot_node: TreeNode = self_node.tensordot(other_node, axes=axes)
                    # Explicitly add parent here, since sum depends on prod.
                    # Not needed for other ops; make_bop takes care of it.
                    # We don't need to copy the node here since the local
                    # tree structure here is never exposed.
                    dot_node.parent = add_reduce_op
                    add_reduce_op.add_child(dot_node)
                result_graphs[grid_entry] = add_reduce_op
    return GraphArray(result_grid,
                      self.cluster_state,
                      result_graphs,
                      copy_on_op=self.copy_on_op)
def test_split(app_inst: ArrayApplication):
    # TODO (hme): Implement a split leveraging block_shape param in reshape op.
    x = app_inst.array(np.array([1.0, 2.0, 3.0, 4.0]), block_shape=(4,))
    syskwargs = x.blocks[0].syskwargs()
    syskwargs["options"] = {"num_returns": 2}
    res1, res2 = x.system.split(x.blocks[0].oid,
                                2,
                                axis=0,
                                transposed=False,
                                syskwargs=syskwargs)
    ba = BlockArray(ArrayGrid((4,), (2,), x.dtype.__name__), x.system)
    ba.blocks[0].oid = res1
    ba.blocks[1].oid = res2
    assert np.allclose([1.0, 2.0, 3.0, 4.0], ba.get())
def permutation(self, size, block_size):
    shape = (size,)
    block_shape = (block_size,)
    grid: ArrayGrid = ArrayGrid(shape=shape,
                                block_shape=shape,
                                dtype=np.int64.__name__)
    ba = BlockArray(grid, self._system)
    for grid_entry in ba.grid.get_entry_iterator():
        rng_params = list(self._rng.new_block_rng_params())
        block: Block = ba.blocks[grid_entry]
        block.oid = self._system.permutation(rng_params,
                                             size,
                                             syskwargs={
                                                 "grid_entry": grid_entry,
                                                 "grid_shape": grid.grid_shape
                                             })
    return ba.reshape(block_shape=block_shape)
def read_csv(self, filename, dtype=float, delimiter=',', has_header=False, num_workers=4):
    file_size = storage_utils.get_file_size(filename)
    file_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(file_size,
                                                                             num_workers)
    blocks = []
    shape_oids = []
    for i, batch in enumerate(file_batches.batches):
        file_start, file_end = batch
        block_oid, shape_oid = self.system.call("read_csv_block",
                                                filename,
                                                file_start,
                                                file_end,
                                                dtype,
                                                delimiter,
                                                has_header,
                                                syskwargs={
                                                    "grid_entry": (i,),
                                                    "grid_shape": (num_workers,),
                                                    "options": {"num_returns": 2}
                                                })
        blocks.append(block_oid)
        shape_oids.append(shape_oid)
    shapes = self.system.get(shape_oids)
    arrays = []
    for i in range(len(shapes)):
        shape = shapes[i]
        if shape[0] == 0:
            continue
        block = blocks[i]
        grid = ArrayGrid(shape=shape, block_shape=shape, dtype=dtype.__name__)
        arr = BlockArray(grid, self.system)
        iter_one = True
        for grid_entry in grid.get_entry_iterator():
            assert iter_one
            iter_one = False
            arr.blocks[grid_entry].oid = block
        arrays.append(arr)
    return arrays
def arange(self, shape, block_shape, step=1, dtype=np.int64) -> BlockArray:
    assert step == 1
    # Generate ranges per block.
    grid = ArrayGrid(shape, block_shape, dtype.__name__)
    rarr = BlockArray(grid, self.system)
    for _, grid_entry in enumerate(grid.get_entry_iterator()):
        syskwargs = {
            "grid_entry": grid_entry,
            "grid_shape": grid.grid_shape
        }
        start = block_shape[0] * grid_entry[0]
        entry_shape = grid.get_block_shape(grid_entry)
        stop = start + entry_shape[0]
        rarr.blocks[grid_entry].oid = self.system.arange(start,
                                                         stop,
                                                         step,
                                                         dtype,
                                                         syskwargs=syskwargs)
    return rarr
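# Illustrative sketch (plain NumPy, not part of the NumS source): arange above fills
# each block of a 1-d array with np.arange(start, start + block_length), where start
# is the block index times the block size. The length-10 range with block size 4
# (so the last block is ragged) is an assumption used to check the per-block arithmetic.
def _sketch_blocked_arange():
    import numpy as np
    size, block_size = 10, 4
    blocks = []
    for start in range(0, size, block_size):
        stop = min(start + block_size, size)
        blocks.append(np.arange(start, stop, 1, dtype=np.int64))
    assert np.allclose(np.concatenate(blocks), np.arange(size))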
def test_array_rwd():
    conn = boto3.resource('s3', region_name='us-east-1')
    assert conn.Bucket('darrays') not in conn.buckets.all()
    conn.create_bucket(Bucket='darrays')
    X: np.ndarray = np.random.random(3)
    stored_X = StoredArrayS3("darrays/%s_X" % "__test__")
    stored_X.put_grid(ArrayGrid(shape=X.shape,
                                block_shape=X.shape,
                                dtype=np.float64.__name__))
    stored_X.init_grid()
    stored_X.put_array(X)
    assert np.allclose(X, stored_X.get_array())
    stored_X.del_array()
    stored_X.delete_grid()
def delete_fs(self, filename: str):
    meta = self._filesystem.read_meta_fs(filename)
    addresses = meta["addresses"]
    grid_meta = meta["grid_meta"]
    grid = ArrayGrid.from_meta(grid_meta)
    result_grid = ArrayGrid(grid.grid_shape,
                            tuple(np.ones_like(grid.shape, dtype=np.int)),
                            dtype=dict.__name__)
    rarr = BlockArray(result_grid, self._system)
    for grid_entry in addresses:
        node_address = addresses[grid_entry]
        options = {"resources": {node_address: 1.0 / 10 ** 4}}
        rarr.blocks[grid_entry].oid = self._filesystem.delete_block_fs(filename,
                                                                       grid_entry,
                                                                       grid_meta,
                                                                       options=options)
    self._filesystem.delete_meta_fs(filename)
    return rarr
def broadcast_to(self, shape):
    b = array_utils.broadcast(self.shape, shape)
    result_block_shape = array_utils.broadcast_block_shape(self.shape, shape, self.block_shape)
    result: BlockArrayBase = BlockArrayBase(ArrayGrid(b.shape,
                                                      result_block_shape,
                                                      self.grid.dtype.__name__),
                                            self.system)
    extras = []
    # Below taken directly from _broadcast_to in numpy's stride_tricks.py.
    it = np.nditer(
        (self.blocks,), flags=['multi_index', 'refs_ok', 'zerosize_ok'] + extras,
        op_flags=['readonly'], itershape=result.grid.grid_shape, order='C')
    with it:
        # never really has writebackifcopy semantics
        broadcast = it.itviews[0]
    result.blocks = broadcast
    return result
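# Illustrative sketch (plain NumPy, not part of the NumS source): broadcast_to above
# reuses NumPy's nditer-based broadcasting trick, but applies it to the grid of Block
# objects rather than to element data, so broadcast entries are views that share the
# same underlying blocks. The object array below only demonstrates the nditer mechanics.
def _sketch_nditer_broadcast():
    import numpy as np
    blocks = np.empty((1, 3), dtype=object)
    blocks[:] = [["a", "b", "c"]]
    it = np.nditer((blocks,), flags=['multi_index', 'refs_ok', 'zerosize_ok'],
                   op_flags=['readonly'], itershape=(2, 3), order='C')
    with it:
        broadcast = it.itviews[0]
    assert broadcast.shape == (2, 3)
    # Both rows reference the same underlying objects; nothing is copied.
    assert broadcast[0, 0] is broadcast[1, 0]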
def ga_from_arr(self, arr, result_shape):
    sample_idx = tuple(0 for dim in arr.shape)
    if isinstance(arr, TreeNode):
        sample_node: TreeNode = arr
        assert result_shape == ()
    else:
        sample_node: TreeNode = arr[sample_idx]
    result_block_shape = sample_node.shape()
    result_dtype_str = self.grid.dtype.__name__
    result_grid = ArrayGrid(shape=result_shape,
                            block_shape=result_block_shape,
                            dtype=result_dtype_str)
    assert arr.shape == result_grid.grid_shape
    return GraphArray(result_grid, self.cluster_state, arr, copy_on_op=self.copy_on_op)
def _delete(self, filename, store_cls, remote_func):
    grid = self._get_array_grid(filename, store_cls)
    result_grid = ArrayGrid(grid.grid_shape,
                            tuple(np.ones_like(grid.shape, dtype=np.int)),
                            dtype=dict.__name__)
    rarr = BlockArray(result_grid, self._system)
    for grid_entry in grid.get_entry_iterator():
        rarr.blocks[grid_entry].oid = remote_func(filename,
                                                  grid_entry,
                                                  grid.to_meta(),
                                                  syskwargs={
                                                      "grid_entry": grid_entry,
                                                      "grid_shape": grid.grid_shape
                                                  })
    return rarr
def _write(self, ba: BlockArray, filename, remote_func):
    grid = ba.grid
    result_grid = ArrayGrid(grid.grid_shape,
                            tuple(np.ones_like(grid.shape, dtype=np.int)),
                            dtype=dict.__name__)
    rarr = BlockArray(result_grid, self.system)
    for grid_entry in grid.get_entry_iterator():
        rarr.blocks[grid_entry].oid = remote_func(ba.blocks[grid_entry].oid,
                                                  filename,
                                                  grid_entry,
                                                  grid.to_meta(),
                                                  syskwargs={
                                                      "grid_entry": grid_entry,
                                                      "grid_shape": grid.grid_shape
                                                  })
    return rarr
def __init__(self, source, sel: BasicSelection = None, block_shape: tuple = None):
    self._source: BlockArrayBase = source
    self._system: System = self._source.system
    if sel is None:
        sel = BasicSelection.from_shape(self._source.shape)
    # Currently, this is all we support.
    assert len(sel.axes) == len(self._source.shape)
    self.sel = sel
    self.shape: tuple = self.sel.get_output_shape()
    if block_shape is None:
        block_shape: tuple = array_utils.block_shape_from_subscript(
            self.sel.selector(), self._source.block_shape)
    self.block_shape = block_shape
    assert len(self.block_shape) == len(self.shape)
    self.grid: ArrayGrid = ArrayGrid(self.shape,
                                     self.block_shape,
                                     dtype=self._source.dtype.__name__)